/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2018, 2020, Chris Fraire <cfraire@me.com>.
 */
package org.opengrok.indexer.util;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.opengrok.indexer.analysis.StreamSource;

/**
 * Represents a splitter of source text into lines, where end-of-line tokens --
 * in accordance with {@link StringUtils#STANDARD_EOL} -- are maintained instead
 * of being stripped.
 * <p>
 * An instance holds no content until one of the {@code reset()} overloads has
 * completed; until then the line-oriented accessors throw
 * {@link IllegalStateException}.
 */
public class SourceSplitter {

    private int length;
    private String[] lines;
    private int[] lineOffsets;

    /**
     * Gets the number of characters in the original source document.
     * @return the document length, capped at {@link Integer#MAX_VALUE}; zero
     * if no {@code reset()} has succeeded
     */
    public int originalLength() {
        return length;
    }

    /**
     * Gets the number of split lines.
     * @return the number of lines
     * @throws IllegalStateException if {@code reset()} did not succeed
     */
    public int count() {
        if (lines == null) {
            throw new IllegalStateException("reset() did not succeed");
        }
        return lines.length;
    }

    /**
     * Gets the line at the specified index in the lines list.
     * @param index greater than or equal to zero and less than
     * {@link #count()}
     * @return defined instance
     * @throws IllegalArgumentException if {@code index} is out of bounds
     * @throws IllegalStateException if {@code reset()} did not succeed
     */
    public String getLine(int index) {
        if (lines == null) {
            throw new IllegalStateException("reset() did not succeed");
        }
        if (index < 0 || index >= lines.length) {
            throw new IllegalArgumentException("index is out of bounds");
        }
        return lines[index];
    }

    /**
     * Gets the starting document character offset of the line at the
     * specified index in the lines list.
     * @param index greater than or equal to zero and less than or equal to
     * {@link #count()}
     * @return line starting offset
     * @throws IllegalArgumentException if {@code index} is out of bounds
     * @throws IllegalStateException if {@code reset()} did not succeed
     */
    public int getOffset(int index) {
        if (lineOffsets == null) {
            throw new IllegalStateException("reset() did not succeed");
        }
        if (index < 0 || index >= lineOffsets.length) {
            throw new IllegalArgumentException("index is out of bounds");
        }
        return lineOffsets[index];
    }

    /**
     * Find the line index for the specified document offset.
     * @param offset greater than or equal to zero and less than
     * {@link #originalLength()}.
     * @return -1 if {@code offset} is beyond the document bounds; otherwise,
     * a valid index
     * @throws IllegalStateException if {@code reset()} did not succeed
     */
    public int findLineIndex(int offset) {
        if (lineOffsets == null) {
            throw new IllegalStateException("reset() did not succeed");
        }
        return SplitterUtil.findLineIndex(length, lineOffsets, offset);
    }

    /**
     * Reset the splitter to use the specified content.
     * @param original a defined instance
     * @throws IllegalArgumentException if {@code original} is {@code null}
     */
    public void reset(String original) {
        if (original == null) {
            throw new IllegalArgumentException("`original' is null");
        }

        try {
            reset(new StringReader(original));
        } catch (IOException ex) {
            /*
             * Should not get here, as String and StringReader operations cannot
             * throw IOException.
             */
            throw new RuntimeException(ex);
        }
    }

    /**
     * Calls
     * {@link #reset(org.opengrok.indexer.analysis.StreamSource, org.opengrok.indexer.util.ReaderWrapper)}
     * with {@code src} and {@code null}.
     * @param src a defined instance
     * @throws java.io.IOException if an I/O error occurs
     */
    public void reset(StreamSource src) throws IOException {
        reset(src, null);
    }

    /**
     * Reset the splitter to use the specified inputs.
     * @param src a defined instance
     * @param wrapper an optional instance
     * @throws java.io.IOException if an I/O error occurs
     * @throws IllegalArgumentException if {@code src} is {@code null}
     */
    public void reset(StreamSource src, ReaderWrapper wrapper)
            throws IOException {
        if (src == null) {
            throw new IllegalArgumentException("`src' is null");
        }

        SplitterUtil.reset(this::reset, src, wrapper);
    }

    /**
     * Clears the current state and then re-populates it by scanning
     * {@code reader}. The state is cleared first so that, if scanning throws,
     * the accessors report "reset() did not succeed" instead of serving stale
     * content.
     * @param reader a defined instance
     * @throws IOException if an I/O error occurs while scanning
     */
    private void reset(Reader reader) throws IOException {
        length = 0;
        lines = null;
        lineOffsets = null;

        List<String> slist = new ArrayList<>();
        SourceSplitterScanner scanner = new SourceSplitterScanner(reader);
        scanner.setTarget(slist);
        scanner.consume();
        /*
         * Lucene cannot go past Integer.MAX_VALUE so revise the length to fit
         * within the Integer constraint.
         */
        length = clampToInt(scanner.getLength());

        lines = slist.toArray(new String[0]);
        setLineOffsets();
    }

    /**
     * Computes the starting offset of every line from the lengths of the
     * entries in {@code lines}.
     */
    private void setLineOffsets() {
        /*
         * Add one more entry for lineOffsets so that findLineIndex() can
         * easily work on the last line.
         */
        lineOffsets = new int[lines.length + 1];
        /*
         * Accumulate in a long and clamp on store: the summed line lengths can
         * exceed Integer.MAX_VALUE (reset() already allows for that when
         * setting `length'), and an int accumulator would silently wrap to
         * negative offsets.
         */
        long offset = 0;
        for (int i = 0; i < lineOffsets.length; ++i) {
            lineOffsets[i] = clampToInt(offset);
            if (i < lines.length) {
                offset += lines[i].length();
            }
        }
    }

    /**
     * Narrows {@code value} to an {@code int}, substituting
     * {@link Integer#MAX_VALUE} when it is too large to fit.
     */
    private static int clampToInt(long value) {
        return value > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) value;
    }
}