xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/util/SourceSplitter.java (revision 5d9f3aa0ca3da3a714233f987fa732f62c0965f6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * See LICENSE.txt included in this distribution for the specific
9  * language governing permissions and limitations under the License.
10  *
11  * When distributing Covered Code, include this CDDL HEADER in each
12  * file and include the License file at LICENSE.txt.
13  * If applicable, add the following below this CDDL HEADER, with the
14  * fields enclosed by brackets "[]" replaced with your own identifying
15  * information: Portions Copyright [yyyy] [name of copyright owner]
16  *
17  * CDDL HEADER END
18  */
19 
20 /*
21  * Copyright (c) 2018, 2020, Chris Fraire <cfraire@me.com>.
22  */
23 package org.opengrok.indexer.util;
24 
25 import java.io.IOException;
26 import java.io.Reader;
27 import java.io.StringReader;
28 import java.util.ArrayList;
29 import java.util.List;
30 import org.opengrok.indexer.analysis.StreamSource;
31 
32 /**
33  * Represents a splitter of source text into lines, where end-of-line tokens --
34  * in accordance with {@link StringUtils#STANDARD_EOL} -- are maintained instead
35  * of being stripped.
36  */
37 public class SourceSplitter {
38 
39     private int length;
40     private String[] lines;
41     private int[] lineOffsets;
42 
43     /**
44      * Gets the number of characters in the original source document.
45      */
originalLength()46     public int originalLength() {
47         return length;
48     }
49 
50     /**
51      * Gets the number of split lines.
52      */
count()53     public int count() {
54         if (lines == null) {
55             throw new IllegalStateException("reset() did not succeed");
56         }
57         return lines.length;
58     }
59 
60     /**
61      * Gets the line at the specified index in the lines list.
62      * @param index greater than or equal to zero and less than
63      * {@link #count()}
64      * @return defined instance
65      * @throws IllegalArgumentException if {@code index} is out of bounds
66      */
getLine(int index)67     public String getLine(int index) {
68         if (lines == null) {
69             throw new IllegalStateException("reset() did not succeed");
70         }
71         if (index < 0 || index >= lines.length) {
72             throw new IllegalArgumentException("index is out of bounds");
73         }
74         return lines[index];
75     }
76 
77     /**
78      * Gets the starting document character offset of the line at the
79      * specified index in the lines list.
80      * @param index greater than or equal to zero and less than or equal to
81      * {@link #count()}
82      * @return line starting offset
83      * @throws IllegalArgumentException if {@code index} is out of bounds
84      */
getOffset(int index)85     public int getOffset(int index) {
86         if (lineOffsets == null) {
87             throw new IllegalStateException("reset() did not succeed");
88         }
89         if (index < 0 || index >= lineOffsets.length) {
90             throw new IllegalArgumentException("index is out of bounds");
91         }
92         return lineOffsets[index];
93     }
94 
95     /**
96      * Find the line index for the specified document offset.
97      * @param offset greater than or equal to zero and less than
98      * {@link #originalLength()}.
99      * @return -1 if {@code offset} is beyond the document bounds; otherwise,
100      * a valid index
101      */
findLineIndex(int offset)102     public int findLineIndex(int offset) {
103         if (lineOffsets == null) {
104             throw new IllegalStateException("reset() did not succeed");
105         }
106         return SplitterUtil.findLineIndex(length, lineOffsets, offset);
107     }
108 
109     /**
110      * Reset the splitter to use the specified content.
111      * @param original a defined instance
112      */
reset(String original)113     public void reset(String original) {
114         if (original == null) {
115             throw new IllegalArgumentException("`original' is null");
116         }
117 
118         try {
119             reset(new StringReader(original));
120         } catch (IOException ex) {
121             /*
122              * Should not get here, as String and StringReader operations cannot
123              * throw IOException.
124              */
125             throw new RuntimeException(ex);
126         }
127     }
128 
129     /**
130      * Calls
131      * {@link #reset(org.opengrok.indexer.analysis.StreamSource, org.opengrok.indexer.util.ReaderWrapper)}
132      * with {@code src} and {@code null}.
133      * @param src a defined instance
134      * @throws java.io.IOException if an I/O error occurs
135      */
reset(StreamSource src)136     public void reset(StreamSource src) throws IOException {
137         reset(src, null);
138     }
139 
140     /**
141      * Reset the splitter to use the specified inputs.
142      * @param src a defined instance
143      * @param wrapper an optional instance
144      * @throws java.io.IOException if an I/O error occurs
145      */
reset(StreamSource src, ReaderWrapper wrapper)146     public void reset(StreamSource src, ReaderWrapper wrapper)
147             throws IOException {
148         if (src == null) {
149             throw new IllegalArgumentException("`src' is null");
150         }
151 
152         SplitterUtil.reset(this::reset, src, wrapper);
153     }
154 
reset(Reader reader)155     private void reset(Reader reader) throws IOException {
156         length = 0;
157         lines = null;
158         lineOffsets = null;
159 
160         List<String> slist = new ArrayList<>();
161         SourceSplitterScanner scanner = new SourceSplitterScanner(reader);
162         scanner.setTarget(slist);
163         scanner.consume();
164         long fullLength = scanner.getLength();
165         /*
166          * Lucene cannot go past Integer.MAX_VALUE so revise the length to fit
167          * within the Integer constraint.
168          */
169         length = fullLength > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) fullLength;
170 
171         lines = slist.toArray(new String[0]);
172         setLineOffsets();
173     }
174 
setLineOffsets()175     private void setLineOffsets() {
176         /*
177          * Add one more entry for lineOffsets so that findLineIndex() can
178          * easily work on the last line.
179          */
180         lineOffsets = new int[lines.length + 1];
181         int offset = 0;
182         for (int i = 0; i < lineOffsets.length; ++i) {
183             lineOffsets[i] = offset;
184             if (i < lines.length) {
185                 offset += lines[i].length();
186             }
187         }
188     }
189 }
190