1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * See LICENSE.txt included in this distribution for the specific 9 * language governing permissions and limitations under the License. 10 * 11 * When distributing Covered Code, include this CDDL HEADER in each 12 * file and include the License file at LICENSE.txt. 13 * If applicable, add the following below this CDDL HEADER, with the 14 * fields enclosed by brackets "[]" replaced with your own identifying 15 * information: Portions Copyright [yyyy] [name of copyright owner] 16 * 17 * CDDL HEADER END 18 */ 19 20 /* 21 * Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved. 22 * Portions Copyright (c) 2017, 2020, Chris Fraire <cfraire@me.com>. 23 */ 24 package org.opengrok.indexer.analysis.plain; 25 26 import java.io.IOException; 27 import java.io.InputStream; 28 import java.io.Reader; 29 import java.io.Writer; 30 import java.util.concurrent.CompletableFuture; 31 import java.util.concurrent.ExecutionException; 32 import java.util.concurrent.TimeUnit; 33 import java.util.function.Supplier; 34 35 import org.apache.lucene.document.Document; 36 import org.apache.lucene.document.StoredField; 37 import org.opengrok.indexer.analysis.AnalyzerFactory; 38 import org.opengrok.indexer.analysis.Definitions; 39 import org.opengrok.indexer.analysis.ExpandTabsReader; 40 import org.opengrok.indexer.analysis.JFlexTokenizer; 41 import org.opengrok.indexer.analysis.JFlexXref; 42 import org.opengrok.indexer.analysis.NumLinesLOC; 43 import org.opengrok.indexer.analysis.OGKTextField; 44 import org.opengrok.indexer.analysis.OGKTextVecField; 45 import org.opengrok.indexer.analysis.Scopes; 46 import org.opengrok.indexer.analysis.StreamSource; 47 import org.opengrok.indexer.analysis.TextAnalyzer; 48 import org.opengrok.indexer.analysis.WriteXrefArgs; 49 import org.opengrok.indexer.analysis.Xrefer; 50 import org.opengrok.indexer.configuration.RuntimeEnvironment; 51 import org.opengrok.indexer.search.QueryBuilder; 52 import org.opengrok.indexer.util.NullWriter; 53 54 /** 55 * Analyzer for plain text files. 56 * 57 * Created on September 21, 2005 58 * @author Chandan 59 */ 60 public class PlainAnalyzer extends TextAnalyzer { 61 62 /** 63 * Creates a new instance of PlainAnalyzer. 64 * @param factory defined instance for the analyzer 65 */ PlainAnalyzer(AnalyzerFactory factory)66 protected PlainAnalyzer(AnalyzerFactory factory) { 67 super(factory); 68 } 69 70 /** 71 * Creates a new instance of {@link PlainAnalyzer}. 72 * @param factory defined instance for the analyzer 73 * @param symbolTokenizerFactory defined instance for the analyzer 74 */ PlainAnalyzer(AnalyzerFactory factory, Supplier<JFlexTokenizer> symbolTokenizerFactory)75 protected PlainAnalyzer(AnalyzerFactory factory, 76 Supplier<JFlexTokenizer> symbolTokenizerFactory) { 77 super(factory, symbolTokenizerFactory); 78 } 79 80 /** 81 * @return {@code null} as there is no aligned language 82 */ 83 @Override getCtagsLang()84 public String getCtagsLang() { 85 return null; 86 } 87 88 /** 89 * Gets a version number to be used to tag processed documents so that 90 * re-analysis can be re-done later if a stored version number is different 91 * from the current implementation. 92 * @return 20180208_00 93 */ 94 @Override getSpecializedVersionNo()95 protected int getSpecializedVersionNo() { 96 return 20180208_00; // Edit comment above too! 97 } 98 99 /** 100 * Creates a wrapped {@link PlainXref} instance. 101 * @param reader the data to produce xref for 102 * @return an xref instance 103 */ 104 @Override newXref(Reader reader)105 protected Xrefer newXref(Reader reader) { 106 return new JFlexXref(new PlainXref(reader)); 107 } 108 109 @Override getReader(InputStream stream)110 protected Reader getReader(InputStream stream) throws IOException { 111 return ExpandTabsReader.wrap(super.getReader(stream), project); 112 } 113 114 private static class XrefWork { 115 Xrefer xrefer; 116 Exception exception; 117 XrefWork(Xrefer xrefer)118 XrefWork(Xrefer xrefer) { 119 this.xrefer = xrefer; 120 } 121 XrefWork(Exception e)122 XrefWork(Exception e) { 123 this.exception = e; 124 } 125 } 126 127 @Override analyze(Document doc, StreamSource src, Writer xrefOut)128 public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException, InterruptedException { 129 Definitions defs = null; 130 NullWriter nullWriter = null; 131 132 doc.add(new OGKTextField(QueryBuilder.FULL, getReader(src.getStream()))); 133 134 String fullPath = doc.get(QueryBuilder.FULLPATH); 135 if (fullPath != null && ctags != null) { 136 defs = ctags.doCtags(fullPath); 137 if (defs != null && defs.numberOfSymbols() > 0) { 138 tryAddingDefs(doc, defs, src); 139 byte[] tags = defs.serialize(); 140 doc.add(new StoredField(QueryBuilder.TAGS, tags)); 141 } 142 } 143 /* 144 * This is to explicitly use appropriate analyzer's token stream to 145 * work around #1376: symbols search works like full text search. 146 */ 147 JFlexTokenizer symbolTokenizer = symbolTokenizerFactory.get(); 148 OGKTextField ref = new OGKTextField(QueryBuilder.REFS, symbolTokenizer); 149 symbolTokenizer.setReader(getReader(src.getStream())); 150 doc.add(ref); 151 152 if (scopesEnabled && xrefOut == null) { 153 /* 154 * Scopes are generated during xref generation. If xrefs are 155 * turned off we still need to run writeXref() to produce scopes, 156 * we use a dummy writer that will throw away any xref output. 157 */ 158 nullWriter = new NullWriter(); 159 xrefOut = nullWriter; 160 } 161 162 if (xrefOut != null) { 163 try (Reader in = getReader(src.getStream())) { 164 RuntimeEnvironment env = RuntimeEnvironment.getInstance(); 165 WriteXrefArgs args = new WriteXrefArgs(in, xrefOut); 166 args.setDefs(defs); 167 args.setProject(project); 168 CompletableFuture<XrefWork> future = CompletableFuture.supplyAsync(() -> { 169 try { 170 return new XrefWork(writeXref(args)); 171 } catch (IOException e) { 172 return new XrefWork(e); 173 } 174 }, env.getIndexerParallelizer().getXrefWatcherExecutor()). 175 orTimeout(env.getXrefTimeout(), TimeUnit.SECONDS); 176 XrefWork xrefWork = future.get(); // Will throw ExecutionException wrapping TimeoutException on timeout. 177 Xrefer xref = xrefWork.xrefer; 178 179 if (xref != null) { 180 Scopes scopes = xref.getScopes(); 181 if (scopes.size() > 0) { 182 byte[] scopesSerialized = scopes.serialize(); 183 doc.add(new StoredField(QueryBuilder.SCOPES, 184 scopesSerialized)); 185 } 186 187 String path = doc.get(QueryBuilder.PATH); 188 addNumLinesLOC(doc, new NumLinesLOC(path, xref.getLineNumber(), xref.getLOC())); 189 } else { 190 // Re-throw the exception from writeXref(). 191 throw new IOException(xrefWork.exception); 192 } 193 } catch (ExecutionException e) { 194 throw new InterruptedException("failed to generate xref :" + e); 195 } finally { 196 if (nullWriter != null) { 197 nullWriter.close(); 198 } 199 } 200 } 201 } 202 203 // DefinitionsTokenStream should not be used in try-with-resources 204 @SuppressWarnings("java:S2095") tryAddingDefs(Document doc, Definitions defs, StreamSource src)205 private void tryAddingDefs(Document doc, Definitions defs, StreamSource src) throws IOException { 206 207 DefinitionsTokenStream defstream = new DefinitionsTokenStream(); 208 defstream.initialize(defs, src, this::wrapReader); 209 210 /* 211 * Testing showed that UnifiedHighlighter will fall back to 212 * ANALYSIS in the presence of multi-term queries (MTQs) such as 213 * prefixes and wildcards even for fields that are analyzed with 214 * POSTINGS -- i.e. with DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS. 215 * This is despite UnifiedHighlighter seeming to indicate that 216 * postings should be sufficient in the comment for 217 * shouldHandleMultiTermQuery(String): "MTQ highlighting can be 218 * expensive, particularly when using offsets in postings." 219 * DEFS re-analysis will not be correct, however, as the 220 * PlainAnalyzer which UnifiedHighlighter will use on-the-fly will 221 * not correctly integrate ctags Definitions. 222 * Storing term vectors, however, allows UnifiedHighlighter to 223 * avoid re-analysis at the cost of a larger index. As DEFS are a 224 * small subset of source text, it seems worth the cost to get 225 * accurate highlighting for DEFS MTQs. 226 */ 227 doc.add(new OGKTextVecField(QueryBuilder.DEFS, defstream)); 228 } 229 230 /** 231 * Identical to {@link #getReader(java.io.InputStream)} but overlaying an 232 * existing stream. 233 * @see #getReader(java.io.InputStream) 234 */ wrapReader(Reader reader)235 private Reader wrapReader(Reader reader) { 236 return ExpandTabsReader.wrap(reader, project); 237 } 238 } 239