1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * See LICENSE.txt included in this distribution for the specific 9 * language governing permissions and limitations under the License. 10 * 11 * When distributing Covered Code, include this CDDL HEADER in each 12 * file and include the License file at LICENSE.txt. 13 * If applicable, add the following below this CDDL HEADER, with the 14 * fields enclosed by brackets "[]" replaced with your own identifying 15 * information: Portions Copyright [yyyy] [name of copyright owner] 16 * 17 * CDDL HEADER END 18 */ 19 20 /* 21 * Copyright (c) 2018, 2020, Chris Fraire <cfraire@me.com>. 22 */ 23 package org.opengrok.indexer.util; 24 25 import java.io.IOException; 26 import java.io.Reader; 27 import java.io.StringReader; 28 import java.util.ArrayList; 29 import java.util.List; 30 import org.opengrok.indexer.analysis.StreamSource; 31 32 /** 33 * Represents a splitter of source text into lines, where end-of-line tokens -- 34 * in accordance with {@link StringUtils#STANDARD_EOL} -- are maintained instead 35 * of being stripped. 36 */ 37 public class SourceSplitter { 38 39 private int length; 40 private String[] lines; 41 private int[] lineOffsets; 42 43 /** 44 * Gets the number of characters in the original source document. 45 */ originalLength()46 public int originalLength() { 47 return length; 48 } 49 50 /** 51 * Gets the number of split lines. 52 */ count()53 public int count() { 54 if (lines == null) { 55 throw new IllegalStateException("reset() did not succeed"); 56 } 57 return lines.length; 58 } 59 60 /** 61 * Gets the line at the specified index in the lines list. 62 * @param index greater than or equal to zero and less than 63 * {@link #count()} 64 * @return defined instance 65 * @throws IllegalArgumentException if {@code index} is out of bounds 66 */ getLine(int index)67 public String getLine(int index) { 68 if (lines == null) { 69 throw new IllegalStateException("reset() did not succeed"); 70 } 71 if (index < 0 || index >= lines.length) { 72 throw new IllegalArgumentException("index is out of bounds"); 73 } 74 return lines[index]; 75 } 76 77 /** 78 * Gets the starting document character offset of the line at the 79 * specified index in the lines list. 80 * @param index greater than or equal to zero and less than or equal to 81 * {@link #count()} 82 * @return line starting offset 83 * @throws IllegalArgumentException if {@code index} is out of bounds 84 */ getOffset(int index)85 public int getOffset(int index) { 86 if (lineOffsets == null) { 87 throw new IllegalStateException("reset() did not succeed"); 88 } 89 if (index < 0 || index >= lineOffsets.length) { 90 throw new IllegalArgumentException("index is out of bounds"); 91 } 92 return lineOffsets[index]; 93 } 94 95 /** 96 * Find the line index for the specified document offset. 97 * @param offset greater than or equal to zero and less than 98 * {@link #originalLength()}. 99 * @return -1 if {@code offset} is beyond the document bounds; otherwise, 100 * a valid index 101 */ findLineIndex(int offset)102 public int findLineIndex(int offset) { 103 if (lineOffsets == null) { 104 throw new IllegalStateException("reset() did not succeed"); 105 } 106 return SplitterUtil.findLineIndex(length, lineOffsets, offset); 107 } 108 109 /** 110 * Reset the splitter to use the specified content. 111 * @param original a defined instance 112 */ reset(String original)113 public void reset(String original) { 114 if (original == null) { 115 throw new IllegalArgumentException("`original' is null"); 116 } 117 118 try { 119 reset(new StringReader(original)); 120 } catch (IOException ex) { 121 /* 122 * Should not get here, as String and StringReader operations cannot 123 * throw IOException. 124 */ 125 throw new RuntimeException(ex); 126 } 127 } 128 129 /** 130 * Calls 131 * {@link #reset(org.opengrok.indexer.analysis.StreamSource, org.opengrok.indexer.util.ReaderWrapper)} 132 * with {@code src} and {@code null}. 133 * @param src a defined instance 134 * @throws java.io.IOException if an I/O error occurs 135 */ reset(StreamSource src)136 public void reset(StreamSource src) throws IOException { 137 reset(src, null); 138 } 139 140 /** 141 * Reset the splitter to use the specified inputs. 142 * @param src a defined instance 143 * @param wrapper an optional instance 144 * @throws java.io.IOException if an I/O error occurs 145 */ reset(StreamSource src, ReaderWrapper wrapper)146 public void reset(StreamSource src, ReaderWrapper wrapper) 147 throws IOException { 148 if (src == null) { 149 throw new IllegalArgumentException("`src' is null"); 150 } 151 152 SplitterUtil.reset(this::reset, src, wrapper); 153 } 154 reset(Reader reader)155 private void reset(Reader reader) throws IOException { 156 length = 0; 157 lines = null; 158 lineOffsets = null; 159 160 List<String> slist = new ArrayList<>(); 161 SourceSplitterScanner scanner = new SourceSplitterScanner(reader); 162 scanner.setTarget(slist); 163 scanner.consume(); 164 long fullLength = scanner.getLength(); 165 /* 166 * Lucene cannot go past Integer.MAX_VALUE so revise the length to fit 167 * within the Integer constraint. 168 */ 169 length = fullLength > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) fullLength; 170 171 lines = slist.toArray(new String[0]); 172 setLineOffsets(); 173 } 174 setLineOffsets()175 private void setLineOffsets() { 176 /* 177 * Add one more entry for lineOffsets so that findLineIndex() can 178 * easily work on the last line. 179 */ 180 lineOffsets = new int[lines.length + 1]; 181 int offset = 0; 182 for (int i = 0; i < lineOffsets.length; ++i) { 183 lineOffsets[i] = offset; 184 if (i < lines.length) { 185 offset += lines[i].length(); 186 } 187 } 188 } 189 } 190