xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/util/LineBreaker.java (revision 5d9f3aa0ca3da3a714233f987fa732f62c0965f6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * See LICENSE.txt included in this distribution for the specific
9  * language governing permissions and limitations under the License.
10  *
11  * When distributing Covered Code, include this CDDL HEADER in each
12  * file and include the License file at LICENSE.txt.
13  * If applicable, add the following below this CDDL HEADER, with the
14  * fields enclosed by brackets "[]" replaced with your own identifying
15  * information: Portions Copyright [yyyy] [name of copyright owner]
16  *
17  * CDDL HEADER END
18  */
19 
20 /*
21  * Copyright (c) 2018, 2020, Chris Fraire <cfraire@me.com>.
22  */
23 package org.opengrok.indexer.util;
24 
25 import java.io.IOException;
26 import java.io.Reader;
27 import java.util.ArrayList;
28 import java.util.Arrays;
29 import java.util.List;
30 import org.opengrok.indexer.analysis.StreamSource;
31 
32 /**
33  * Represents a reader of source text to find end-of-line tokens -- in
34  * accordance with {@link StringUtils#STANDARD_EOL} -- in order to determine
35  * line offsets but discarding line content.
36  */
37 public class LineBreaker {
38 
39     private int length;
40     private int count;
41     private int[] lineOffsets;
42 
43     /**
44      * Calls
45      * {@link #reset(org.opengrok.indexer.analysis.StreamSource, org.opengrok.indexer.util.ReaderWrapper)}
46      * with {@code src} and {@code null}.
47      * @param src a defined instance
48      * @throws java.io.IOException if an I/O error occurs
49      */
reset(StreamSource src)50     public void reset(StreamSource src) throws IOException {
51         reset(src, null);
52     }
53 
54     /**
55      * Resets the breaker using the specified inputs.
56      * @param src a defined instance
57      * @param wrapper an optional instance
58      * @throws java.io.IOException if an I/O error occurs
59      */
reset(StreamSource src, ReaderWrapper wrapper)60     public void reset(StreamSource src, ReaderWrapper wrapper)
61             throws IOException {
62         if (src == null) {
63             throw new IllegalArgumentException("`src' is null");
64         }
65 
66         SplitterUtil.reset(this::reset, src, wrapper);
67     }
68 
reset(Reader reader)69     private void reset(Reader reader) throws IOException {
70         length = 0;
71         lineOffsets = null;
72 
73         List<Long> newOffsets = new ArrayList<>();
74         LineBreakerScanner scanner = new LineBreakerScanner(reader);
75         scanner.setTarget(newOffsets);
76         scanner.consume();
77         long fullLength = scanner.getLength();
78         /*
79          * Lucene cannot go past Integer.MAX_VALUE so revise the length to fit
80          * within the Integer constraint.
81          */
82         length = fullLength > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) fullLength;
83         count = newOffsets.size() - 1;
84 
85         lineOffsets = new int[newOffsets.size()];
86         for (int i = 0; i < lineOffsets.length; ++i) {
87             long fullOffset = newOffsets.get(i);
88             if (fullOffset <= Integer.MAX_VALUE) {
89                 lineOffsets[i] = (int) fullOffset;
90             } else {
91                 /*
92                  * Lucene cannot go past Integer.MAX_VALUE so revise the line
93                  * breaks to fit within the Integer constraint, and stop.
94                  */
95                 lineOffsets[i] = Integer.MAX_VALUE;
96                 lineOffsets = Arrays.copyOf(lineOffsets, i + 1);
97                 count -= newOffsets.size() - lineOffsets.length;
98                 break;
99             }
100         }
101     }
102 
103     /**
104      * Gets the number of characters in the original source document.
105      * @return value
106      */
originalLength()107     public int originalLength() {
108         return length;
109     }
110 
111     /**
112      * Gets the number of split lines.
113      */
count()114     public int count() {
115         if (lineOffsets == null) {
116             throw new IllegalStateException("reset() did not succeed");
117         }
118         return count;
119     }
120 
121     /**
122      * Gets the starting document character offset of the line at the
123      * specified index in the lines list.
124      * @param index greater than or equal to zero and less than or equal to
125      * {@link #count()}
126      * @return line starting offset
127      * @throws IllegalArgumentException if {@code index} is out of bounds
128      */
getOffset(int index)129     public int getOffset(int index) {
130         if (lineOffsets == null) {
131             throw new IllegalStateException("reset() did not succeed");
132         }
133         if (index < 0 || index >= lineOffsets.length) {
134             throw new IllegalArgumentException("index is out of bounds");
135         }
136         return lineOffsets[index];
137     }
138 
139     /**
140      * Find the line index for the specified document offset.
141      * @param offset greater than or equal to zero and less than
142      * {@link #originalLength()}.
143      * @return -1 if {@code offset} is beyond the document bounds; otherwise,
144      * a valid index
145      */
findLineIndex(int offset)146     public int findLineIndex(int offset) {
147         if (lineOffsets == null) {
148             throw new IllegalStateException("reset() did not succeed");
149         }
150         return SplitterUtil.findLineIndex(length, lineOffsets, offset);
151     }
152 }
153