xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/util/SourceSplitter.java (revision 5d9f3aa0ca3da3a714233f987fa732f62c0965f6)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2018, 2020, Chris Fraire <cfraire@me.com>.
 */
239805b761SAdam Hornáček package org.opengrok.indexer.util;
24b5840353SAdam Hornáček 
25b5840353SAdam Hornáček import java.io.IOException;
26b5840353SAdam Hornáček import java.io.Reader;
27ee1827acSChris Fraire import java.io.StringReader;
28b5840353SAdam Hornáček import java.util.ArrayList;
29b5840353SAdam Hornáček import java.util.List;
309805b761SAdam Hornáček import org.opengrok.indexer.analysis.StreamSource;
31b5840353SAdam Hornáček 
32b5840353SAdam Hornáček /**
33ee1827acSChris Fraire  * Represents a splitter of source text into lines, where end-of-line tokens --
34ee1827acSChris Fraire  * in accordance with {@link StringUtils#STANDARD_EOL} -- are maintained instead
35ee1827acSChris Fraire  * of being stripped.
36b5840353SAdam Hornáček  */
37b5840353SAdam Hornáček public class SourceSplitter {
38b5840353SAdam Hornáček 
39b5840353SAdam Hornáček     private int length;
40b5840353SAdam Hornáček     private String[] lines;
41b5840353SAdam Hornáček     private int[] lineOffsets;
42b5840353SAdam Hornáček 
43b5840353SAdam Hornáček     /**
44b5840353SAdam Hornáček      * Gets the number of characters in the original source document.
45b5840353SAdam Hornáček      */
originalLength()46b5840353SAdam Hornáček     public int originalLength() {
47b5840353SAdam Hornáček         return length;
48b5840353SAdam Hornáček     }
49b5840353SAdam Hornáček 
50b5840353SAdam Hornáček     /**
51b5840353SAdam Hornáček      * Gets the number of split lines.
52b5840353SAdam Hornáček      */
count()53b5840353SAdam Hornáček     public int count() {
54b5840353SAdam Hornáček         if (lines == null) {
55ee1827acSChris Fraire             throw new IllegalStateException("reset() did not succeed");
56b5840353SAdam Hornáček         }
57b5840353SAdam Hornáček         return lines.length;
58b5840353SAdam Hornáček     }
59b5840353SAdam Hornáček 
60b5840353SAdam Hornáček     /**
61ee1827acSChris Fraire      * Gets the line at the specified index in the lines list.
62ee1827acSChris Fraire      * @param index greater than or equal to zero and less than
63b5840353SAdam Hornáček      * {@link #count()}
64b5840353SAdam Hornáček      * @return defined instance
65ee1827acSChris Fraire      * @throws IllegalArgumentException if {@code index} is out of bounds
66b5840353SAdam Hornáček      */
getLine(int index)67ee1827acSChris Fraire     public String getLine(int index) {
68b5840353SAdam Hornáček         if (lines == null) {
69ee1827acSChris Fraire             throw new IllegalStateException("reset() did not succeed");
70b5840353SAdam Hornáček         }
71ee1827acSChris Fraire         if (index < 0 || index >= lines.length) {
72ee1827acSChris Fraire             throw new IllegalArgumentException("index is out of bounds");
73b5840353SAdam Hornáček         }
74ee1827acSChris Fraire         return lines[index];
75b5840353SAdam Hornáček     }
76b5840353SAdam Hornáček 
77b5840353SAdam Hornáček     /**
78ee1827acSChris Fraire      * Gets the starting document character offset of the line at the
79ee1827acSChris Fraire      * specified index in the lines list.
80ee1827acSChris Fraire      * @param index greater than or equal to zero and less than or equal to
81b5840353SAdam Hornáček      * {@link #count()}
82ee1827acSChris Fraire      * @return line starting offset
83ee1827acSChris Fraire      * @throws IllegalArgumentException if {@code index} is out of bounds
84b5840353SAdam Hornáček      */
getOffset(int index)85ee1827acSChris Fraire     public int getOffset(int index) {
86b5840353SAdam Hornáček         if (lineOffsets == null) {
87ee1827acSChris Fraire             throw new IllegalStateException("reset() did not succeed");
88b5840353SAdam Hornáček         }
89ee1827acSChris Fraire         if (index < 0 || index >= lineOffsets.length) {
90ee1827acSChris Fraire             throw new IllegalArgumentException("index is out of bounds");
91b5840353SAdam Hornáček         }
92ee1827acSChris Fraire         return lineOffsets[index];
93b5840353SAdam Hornáček     }
94b5840353SAdam Hornáček 
95b5840353SAdam Hornáček     /**
96ee1827acSChris Fraire      * Find the line index for the specified document offset.
97ee1827acSChris Fraire      * @param offset greater than or equal to zero and less than
98b5840353SAdam Hornáček      * {@link #originalLength()}.
99ee1827acSChris Fraire      * @return -1 if {@code offset} is beyond the document bounds; otherwise,
100ee1827acSChris Fraire      * a valid index
101b5840353SAdam Hornáček      */
findLineIndex(int offset)102ee1827acSChris Fraire     public int findLineIndex(int offset) {
103b5840353SAdam Hornáček         if (lineOffsets == null) {
104ee1827acSChris Fraire             throw new IllegalStateException("reset() did not succeed");
105b5840353SAdam Hornáček         }
106ee1827acSChris Fraire         return SplitterUtil.findLineIndex(length, lineOffsets, offset);
107b5840353SAdam Hornáček     }
108b5840353SAdam Hornáček 
109b5840353SAdam Hornáček     /**
110b5840353SAdam Hornáček      * Reset the splitter to use the specified content.
111b5840353SAdam Hornáček      * @param original a defined instance
112b5840353SAdam Hornáček      */
reset(String original)113b5840353SAdam Hornáček     public void reset(String original) {
114b5840353SAdam Hornáček         if (original == null) {
115b5840353SAdam Hornáček             throw new IllegalArgumentException("`original' is null");
116b5840353SAdam Hornáček         }
117b5840353SAdam Hornáček 
118ee1827acSChris Fraire         try {
119ee1827acSChris Fraire             reset(new StringReader(original));
120ee1827acSChris Fraire         } catch (IOException ex) {
121b5840353SAdam Hornáček             /*
122ee1827acSChris Fraire              * Should not get here, as String and StringReader operations cannot
123ee1827acSChris Fraire              * throw IOException.
124b5840353SAdam Hornáček              */
125ee1827acSChris Fraire             throw new RuntimeException(ex);
126b5840353SAdam Hornáček         }
127b5840353SAdam Hornáček     }
128b5840353SAdam Hornáček 
129b5840353SAdam Hornáček     /**
130b5840353SAdam Hornáček      * Calls
1319805b761SAdam Hornáček      * {@link #reset(org.opengrok.indexer.analysis.StreamSource, org.opengrok.indexer.util.ReaderWrapper)}
132b5840353SAdam Hornáček      * with {@code src} and {@code null}.
133b5840353SAdam Hornáček      * @param src a defined instance
134b5840353SAdam Hornáček      * @throws java.io.IOException if an I/O error occurs
135b5840353SAdam Hornáček      */
reset(StreamSource src)136b5840353SAdam Hornáček     public void reset(StreamSource src) throws IOException {
137b5840353SAdam Hornáček         reset(src, null);
138b5840353SAdam Hornáček     }
139b5840353SAdam Hornáček 
140b5840353SAdam Hornáček     /**
141b5840353SAdam Hornáček      * Reset the splitter to use the specified inputs.
142b5840353SAdam Hornáček      * @param src a defined instance
143b5840353SAdam Hornáček      * @param wrapper an optional instance
144b5840353SAdam Hornáček      * @throws java.io.IOException if an I/O error occurs
145b5840353SAdam Hornáček      */
reset(StreamSource src, ReaderWrapper wrapper)146b5840353SAdam Hornáček     public void reset(StreamSource src, ReaderWrapper wrapper)
147b5840353SAdam Hornáček             throws IOException {
148b5840353SAdam Hornáček         if (src == null) {
149b5840353SAdam Hornáček             throw new IllegalArgumentException("`src' is null");
150b5840353SAdam Hornáček         }
151b5840353SAdam Hornáček 
152ee1827acSChris Fraire         SplitterUtil.reset(this::reset, src, wrapper);
153ee1827acSChris Fraire     }
154ee1827acSChris Fraire 
reset(Reader reader)155ee1827acSChris Fraire     private void reset(Reader reader) throws IOException {
156b5840353SAdam Hornáček         length = 0;
157b5840353SAdam Hornáček         lines = null;
158b5840353SAdam Hornáček         lineOffsets = null;
159b5840353SAdam Hornáček 
160b5840353SAdam Hornáček         List<String> slist = new ArrayList<>();
16154544730SChris Fraire         SourceSplitterScanner scanner = new SourceSplitterScanner(reader);
16254544730SChris Fraire         scanner.setTarget(slist);
16354544730SChris Fraire         scanner.consume();
164*d051e170SChris Fraire         long fullLength = scanner.getLength();
165*d051e170SChris Fraire         /*
166*d051e170SChris Fraire          * Lucene cannot go past Integer.MAX_VALUE so revise the length to fit
167*d051e170SChris Fraire          * within the Integer constraint.
168*d051e170SChris Fraire          */
169*d051e170SChris Fraire         length = fullLength > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) fullLength;
170b5840353SAdam Hornáček 
171ee1827acSChris Fraire         lines = slist.toArray(new String[0]);
172b5840353SAdam Hornáček         setLineOffsets();
173b5840353SAdam Hornáček     }
174b5840353SAdam Hornáček 
setLineOffsets()175b5840353SAdam Hornáček     private void setLineOffsets() {
176ee1827acSChris Fraire         /*
177ee1827acSChris Fraire          * Add one more entry for lineOffsets so that findLineIndex() can
178b5840353SAdam Hornáček          * easily work on the last line.
179b5840353SAdam Hornáček          */
180b5840353SAdam Hornáček         lineOffsets = new int[lines.length + 1];
181ee1827acSChris Fraire         int offset = 0;
182b5840353SAdam Hornáček         for (int i = 0; i < lineOffsets.length; ++i) {
183ee1827acSChris Fraire             lineOffsets[i] = offset;
184b5840353SAdam Hornáček             if (i < lines.length) {
185ee1827acSChris Fraire                 offset += lines[i].length();
186b5840353SAdam Hornáček             }
187b5840353SAdam Hornáček         }
188b5840353SAdam Hornáček     }
189b5840353SAdam Hornáček }
190