xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/plain/PlainAnalyzer.java (revision a160de5f846e5fdb484e6579c6f0855d9207181e)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2017, 2020, Chris Fraire <cfraire@me.com>.
 */
24 package org.opengrok.indexer.analysis.plain;
25 
26 import java.io.IOException;
27 import java.io.InputStream;
28 import java.io.Reader;
29 import java.io.Writer;
30 import java.util.concurrent.CompletableFuture;
31 import java.util.concurrent.ExecutionException;
32 import java.util.concurrent.TimeUnit;
33 import java.util.function.Supplier;
34 
35 import org.apache.lucene.document.Document;
36 import org.apache.lucene.document.StoredField;
37 import org.opengrok.indexer.analysis.AnalyzerFactory;
38 import org.opengrok.indexer.analysis.Definitions;
39 import org.opengrok.indexer.analysis.ExpandTabsReader;
40 import org.opengrok.indexer.analysis.JFlexTokenizer;
41 import org.opengrok.indexer.analysis.JFlexXref;
42 import org.opengrok.indexer.analysis.NumLinesLOC;
43 import org.opengrok.indexer.analysis.OGKTextField;
44 import org.opengrok.indexer.analysis.OGKTextVecField;
45 import org.opengrok.indexer.analysis.Scopes;
46 import org.opengrok.indexer.analysis.StreamSource;
47 import org.opengrok.indexer.analysis.TextAnalyzer;
48 import org.opengrok.indexer.analysis.WriteXrefArgs;
49 import org.opengrok.indexer.analysis.Xrefer;
50 import org.opengrok.indexer.configuration.RuntimeEnvironment;
51 import org.opengrok.indexer.search.QueryBuilder;
52 import org.opengrok.indexer.util.NullWriter;
53 
54 /**
55  * Analyzer for plain text files.
56  *
57  * Created on September 21, 2005
58  * @author Chandan
59  */
60 public class PlainAnalyzer extends TextAnalyzer {
61 
62     /**
63      * Creates a new instance of PlainAnalyzer.
64      * @param factory defined instance for the analyzer
65      */
PlainAnalyzer(AnalyzerFactory factory)66     protected PlainAnalyzer(AnalyzerFactory factory) {
67         super(factory);
68     }
69 
70     /**
71      * Creates a new instance of {@link PlainAnalyzer}.
72      * @param factory defined instance for the analyzer
73      * @param symbolTokenizerFactory defined instance for the analyzer
74      */
PlainAnalyzer(AnalyzerFactory factory, Supplier<JFlexTokenizer> symbolTokenizerFactory)75     protected PlainAnalyzer(AnalyzerFactory factory,
76             Supplier<JFlexTokenizer> symbolTokenizerFactory) {
77         super(factory, symbolTokenizerFactory);
78     }
79 
80     /**
81      * @return {@code null} as there is no aligned language
82      */
83     @Override
getCtagsLang()84     public String getCtagsLang() {
85         return null;
86     }
87 
88     /**
89      * Gets a version number to be used to tag processed documents so that
90      * re-analysis can be re-done later if a stored version number is different
91      * from the current implementation.
92      * @return 20180208_00
93      */
94     @Override
getSpecializedVersionNo()95     protected int getSpecializedVersionNo() {
96         return 20180208_00; // Edit comment above too!
97     }
98 
99     /**
100      * Creates a wrapped {@link PlainXref} instance.
101      * @param reader the data to produce xref for
102      * @return an xref instance
103      */
104     @Override
newXref(Reader reader)105     protected Xrefer newXref(Reader reader) {
106         return new JFlexXref(new PlainXref(reader));
107     }
108 
109     @Override
getReader(InputStream stream)110     protected Reader getReader(InputStream stream) throws IOException {
111         return ExpandTabsReader.wrap(super.getReader(stream), project);
112     }
113 
114     private static class XrefWork {
115         Xrefer xrefer;
116         Exception exception;
117 
XrefWork(Xrefer xrefer)118         XrefWork(Xrefer xrefer) {
119             this.xrefer = xrefer;
120         }
121 
XrefWork(Exception e)122         XrefWork(Exception e) {
123             this.exception = e;
124         }
125     }
126 
127     @Override
analyze(Document doc, StreamSource src, Writer xrefOut)128     public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException, InterruptedException {
129         Definitions defs = null;
130         NullWriter nullWriter = null;
131 
132         doc.add(new OGKTextField(QueryBuilder.FULL, getReader(src.getStream())));
133 
134         String fullPath = doc.get(QueryBuilder.FULLPATH);
135         if (fullPath != null && ctags != null) {
136             defs = ctags.doCtags(fullPath);
137             if (defs != null && defs.numberOfSymbols() > 0) {
138                 tryAddingDefs(doc, defs, src);
139                 byte[] tags = defs.serialize();
140                 doc.add(new StoredField(QueryBuilder.TAGS, tags));
141             }
142         }
143         /*
144          * This is to explicitly use appropriate analyzer's token stream to
145          * work around #1376: symbols search works like full text search.
146          */
147         JFlexTokenizer symbolTokenizer = symbolTokenizerFactory.get();
148         OGKTextField ref = new OGKTextField(QueryBuilder.REFS, symbolTokenizer);
149         symbolTokenizer.setReader(getReader(src.getStream()));
150         doc.add(ref);
151 
152         if (scopesEnabled && xrefOut == null) {
153             /*
154              * Scopes are generated during xref generation. If xrefs are
155              * turned off we still need to run writeXref() to produce scopes,
156              * we use a dummy writer that will throw away any xref output.
157              */
158             nullWriter = new NullWriter();
159             xrefOut = nullWriter;
160         }
161 
162         if (xrefOut != null) {
163             try (Reader in = getReader(src.getStream())) {
164                 RuntimeEnvironment env = RuntimeEnvironment.getInstance();
165                 WriteXrefArgs args = new WriteXrefArgs(in, xrefOut);
166                 args.setDefs(defs);
167                 args.setProject(project);
168                 CompletableFuture<XrefWork> future = CompletableFuture.supplyAsync(() -> {
169                     try {
170                         return new XrefWork(writeXref(args));
171                     } catch (IOException e) {
172                         return new XrefWork(e);
173                     }
174                 }, env.getIndexerParallelizer().getXrefWatcherExecutor()).
175                         orTimeout(env.getXrefTimeout(), TimeUnit.SECONDS);
176                 XrefWork xrefWork = future.get(); // Will throw ExecutionException wrapping TimeoutException on timeout.
177                 Xrefer xref = xrefWork.xrefer;
178 
179                 if (xref != null) {
180                     Scopes scopes = xref.getScopes();
181                     if (scopes.size() > 0) {
182                         byte[] scopesSerialized = scopes.serialize();
183                         doc.add(new StoredField(QueryBuilder.SCOPES,
184                                 scopesSerialized));
185                     }
186 
187                     String path = doc.get(QueryBuilder.PATH);
188                     addNumLinesLOC(doc, new NumLinesLOC(path, xref.getLineNumber(), xref.getLOC()));
189                 } else {
190                     // Re-throw the exception from writeXref().
191                     throw new IOException(xrefWork.exception);
192                 }
193             } catch (ExecutionException e) {
194                 throw new InterruptedException("failed to generate xref :" + e);
195             } finally {
196                 if (nullWriter != null) {
197                     nullWriter.close();
198                 }
199             }
200         }
201     }
202 
203     // DefinitionsTokenStream should not be used in try-with-resources
204     @SuppressWarnings("java:S2095")
tryAddingDefs(Document doc, Definitions defs, StreamSource src)205     private void tryAddingDefs(Document doc, Definitions defs, StreamSource src) throws IOException {
206 
207         DefinitionsTokenStream defstream = new DefinitionsTokenStream();
208         defstream.initialize(defs, src, this::wrapReader);
209 
210         /*
211          *     Testing showed that UnifiedHighlighter will fall back to
212          * ANALYSIS in the presence of multi-term queries (MTQs) such as
213          * prefixes and wildcards even for fields that are analyzed with
214          * POSTINGS -- i.e. with DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.
215          * This is despite UnifiedHighlighter seeming to indicate that
216          * postings should be sufficient in the comment for
217          * shouldHandleMultiTermQuery(String): "MTQ highlighting can be
218          * expensive, particularly when using offsets in postings."
219          *     DEFS re-analysis will not be correct, however, as the
220          * PlainAnalyzer which UnifiedHighlighter will use on-the-fly will
221          * not correctly integrate ctags Definitions.
222          *     Storing term vectors, however, allows UnifiedHighlighter to
223          * avoid re-analysis at the cost of a larger index. As DEFS are a
224          * small subset of source text, it seems worth the cost to get
225          * accurate highlighting for DEFS MTQs.
226          */
227         doc.add(new OGKTextVecField(QueryBuilder.DEFS, defstream));
228     }
229 
230     /**
231      * Identical to {@link #getReader(java.io.InputStream)} but overlaying an
232      * existing stream.
233      * @see #getReader(java.io.InputStream)
234      */
wrapReader(Reader reader)235     private Reader wrapReader(Reader reader) {
236         return ExpandTabsReader.wrap(reader, project);
237     }
238 }
239