1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * See LICENSE.txt included in this distribution for the specific 9 * language governing permissions and limitations under the License. 10 * 11 * When distributing Covered Code, include this CDDL HEADER in each 12 * file and include the License file at LICENSE.txt. 13 * If applicable, add the following below this CDDL HEADER, with the 14 * fields enclosed by brackets "[]" replaced with your own identifying 15 * information: Portions Copyright [yyyy] [name of copyright owner] 16 * 17 * CDDL HEADER END 18 */ 19 20 /* 21 * Copyright (c) 2018, 2020, Chris Fraire <cfraire@me.com>. 22 */ 23 package org.opengrok.indexer.util; 24 25 import java.io.IOException; 26 import java.io.Reader; 27 import java.util.ArrayList; 28 import java.util.Arrays; 29 import java.util.List; 30 import org.opengrok.indexer.analysis.StreamSource; 31 32 /** 33 * Represents a reader of source text to find end-of-line tokens -- in 34 * accordance with {@link StringUtils#STANDARD_EOL} -- in order to determine 35 * line offsets but discarding line content. 36 */ 37 public class LineBreaker { 38 39 private int length; 40 private int count; 41 private int[] lineOffsets; 42 43 /** 44 * Calls 45 * {@link #reset(org.opengrok.indexer.analysis.StreamSource, org.opengrok.indexer.util.ReaderWrapper)} 46 * with {@code src} and {@code null}. 47 * @param src a defined instance 48 * @throws java.io.IOException if an I/O error occurs 49 */ reset(StreamSource src)50 public void reset(StreamSource src) throws IOException { 51 reset(src, null); 52 } 53 54 /** 55 * Resets the breaker using the specified inputs. 56 * @param src a defined instance 57 * @param wrapper an optional instance 58 * @throws java.io.IOException if an I/O error occurs 59 */ reset(StreamSource src, ReaderWrapper wrapper)60 public void reset(StreamSource src, ReaderWrapper wrapper) 61 throws IOException { 62 if (src == null) { 63 throw new IllegalArgumentException("`src' is null"); 64 } 65 66 SplitterUtil.reset(this::reset, src, wrapper); 67 } 68 reset(Reader reader)69 private void reset(Reader reader) throws IOException { 70 length = 0; 71 lineOffsets = null; 72 73 List<Long> newOffsets = new ArrayList<>(); 74 LineBreakerScanner scanner = new LineBreakerScanner(reader); 75 scanner.setTarget(newOffsets); 76 scanner.consume(); 77 long fullLength = scanner.getLength(); 78 /* 79 * Lucene cannot go past Integer.MAX_VALUE so revise the length to fit 80 * within the Integer constraint. 81 */ 82 length = fullLength > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) fullLength; 83 count = newOffsets.size() - 1; 84 85 lineOffsets = new int[newOffsets.size()]; 86 for (int i = 0; i < lineOffsets.length; ++i) { 87 long fullOffset = newOffsets.get(i); 88 if (fullOffset <= Integer.MAX_VALUE) { 89 lineOffsets[i] = (int) fullOffset; 90 } else { 91 /* 92 * Lucene cannot go past Integer.MAX_VALUE so revise the line 93 * breaks to fit within the Integer constraint, and stop. 94 */ 95 lineOffsets[i] = Integer.MAX_VALUE; 96 lineOffsets = Arrays.copyOf(lineOffsets, i + 1); 97 count -= newOffsets.size() - lineOffsets.length; 98 break; 99 } 100 } 101 } 102 103 /** 104 * Gets the number of characters in the original source document. 105 * @return value 106 */ originalLength()107 public int originalLength() { 108 return length; 109 } 110 111 /** 112 * Gets the number of split lines. 113 */ count()114 public int count() { 115 if (lineOffsets == null) { 116 throw new IllegalStateException("reset() did not succeed"); 117 } 118 return count; 119 } 120 121 /** 122 * Gets the starting document character offset of the line at the 123 * specified index in the lines list. 124 * @param index greater than or equal to zero and less than or equal to 125 * {@link #count()} 126 * @return line starting offset 127 * @throws IllegalArgumentException if {@code index} is out of bounds 128 */ getOffset(int index)129 public int getOffset(int index) { 130 if (lineOffsets == null) { 131 throw new IllegalStateException("reset() did not succeed"); 132 } 133 if (index < 0 || index >= lineOffsets.length) { 134 throw new IllegalArgumentException("index is out of bounds"); 135 } 136 return lineOffsets[index]; 137 } 138 139 /** 140 * Find the line index for the specified document offset. 141 * @param offset greater than or equal to zero and less than 142 * {@link #originalLength()}. 143 * @return -1 if {@code offset} is beyond the document bounds; otherwise, 144 * a valid index 145 */ findLineIndex(int offset)146 public int findLineIndex(int offset) { 147 if (lineOffsets == null) { 148 throw new IllegalStateException("reset() did not succeed"); 149 } 150 return SplitterUtil.findLineIndex(length, lineOffsets, offset); 151 } 152 } 153