xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/search/context/OGKUnifiedHighlighter.java (revision ff44f24ac10337e272510634599e287cb25d0e9c)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Portions Copyright (c) 2018, Chris Fraire <cfraire@me.com>.
 */

package org.opengrok.indexer.search.context;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.uhighlight.UHComponents;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.util.BytesRef;
import org.opengrok.indexer.analysis.AnalyzerGuru;
import org.opengrok.indexer.analysis.ExpandTabsReader;
import org.opengrok.indexer.analysis.StreamSource;
import org.opengrok.indexer.configuration.RuntimeEnvironment;
import org.opengrok.indexer.logger.LoggerFactory;
import org.opengrok.indexer.search.QueryBuilder;
import org.opengrok.indexer.util.IOUtils;
import org.opengrok.indexer.web.Util;

/**
 * Represents a subclass of {@link UnifiedHighlighter} with customizations for
 * OpenGrok.
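 * <p>
 * A minimal usage sketch, assuming {@code env}, {@code searcher},
 * {@code analyzer}, {@code query}, and {@code docId} are illustrative
 * locals already initialized by the caller:
 * <pre>{@code
 * OGKUnifiedHighlighter uhi = new OGKUnifiedHighlighter(env, searcher, analyzer);
 * uhi.setTabSize(8); // optional: expand tabs to eight-space stops
 * String html = uhi.highlightFieldsUnion(
 *         new String[]{QueryBuilder.FULL}, query, docId, 10);
 * }</pre>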
 */
public class OGKUnifiedHighlighter extends UnifiedHighlighter {

    private static final Logger LOGGER = LoggerFactory.getLogger(
        OGKUnifiedHighlighter.class);

    private final RuntimeEnvironment env;

    private int tabSize;

    private String fileTypeName;

    /**
     * Initializes an instance with
     * {@link UnifiedHighlighter#UnifiedHighlighter(org.apache.lucene.search.IndexSearcher, org.apache.lucene.analysis.Analyzer)}
     * for the specified {@code indexSearcher} and {@code indexAnalyzer}, and
     * stores the {@code env} for later use.
     * @param env a required instance
     * @param indexSearcher a required instance
     * @param indexAnalyzer a required instance
     * @throws IllegalArgumentException if any argument is null
     */
    public OGKUnifiedHighlighter(RuntimeEnvironment env,
            IndexSearcher indexSearcher, Analyzer indexAnalyzer) {
        super(indexSearcher, indexAnalyzer);

        if (env == null) {
            throw new IllegalArgumentException("env is null");
        }
        this.env = env;
    }

    /**
     * Gets a file type name-specific analyzer during the execution of
     * {@link #highlightFieldsUnion(java.lang.String[], org.apache.lucene.search.Query, int, int)},
     * or just gets the object passed in to the constructor at all other times.
     * @return a defined instance
     */
    @Override
    public Analyzer getIndexAnalyzer() {
        String ftname = fileTypeName;
        if (ftname == null) {
            return indexAnalyzer;
        }
        Analyzer fa = AnalyzerGuru.getAnalyzer(ftname);
        return fa == null ? indexAnalyzer : fa;
    }

    public int getTabSize() {
        return tabSize;
    }

    public void setTabSize(int value) {
        this.tabSize = value;
    }

    /**
     * Transiently arranges that {@link #getIndexAnalyzer()} returns a file type
     * name-specific analyzer during a subsequent call of
     * {@link #highlightFieldsUnionWork(java.lang.String[], org.apache.lucene.search.Query, int, int)}.
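     * <p>
     * For example, given an instance {@code uhi} of this class (with
     * {@code query} and {@code docId} as assumed, illustrative values),
     * passages from the full-text and symbol fields can be merged into one
     * formatted result:
     * <pre>{@code
     * String html = uhi.highlightFieldsUnion(
     *         new String[]{QueryBuilder.FULL, QueryBuilder.REFS},
     *         query, docId, 10);
     * }</pre>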
     * @param fields a defined instance
     * @param query a defined instance
     * @param docId a valid document ID
     * @param lineLimit the maximum number of lines to return
     * @return a defined instance or else {@code null} if there are no results
     * @throws IOException if accessing the Lucene document fails
     */
    public String highlightFieldsUnion(String[] fields, Query query,
            int docId, int lineLimit) throws IOException {
        /**
         * Setting fileTypeName has to happen before getFieldHighlighter() is
         * called by highlightFieldsAsObjects() so that the result of
         * getIndexAnalyzer() (if it is called due to requiring ANALYSIS) can be
         * influenced by fileTypeName.
         */
        Document doc = searcher.doc(docId);
        fileTypeName = doc == null ? null : doc.get(QueryBuilder.TYPE);
        try {
            return highlightFieldsUnionWork(fields, query, docId, lineLimit);
        } finally {
            fileTypeName = null;
        }
    }

    /**
     * Calls
     * {@link #highlightFieldsAsObjects(java.lang.String[], org.apache.lucene.search.Query, int[], int[])},
     * and merges multiple passages if the formatter returns
     * {@link FormattedLines}, or else returns the first formatted result.
     * @param fields a defined instance
     * @param query a defined instance
     * @param docId a valid document ID
     * @param lineLimit the maximum number of lines to return
     * @return a defined instance or else {@code null} if there are no results
     * @throws IOException if accessing the Lucene document fails
     */
    protected String highlightFieldsUnionWork(String[] fields, Query query,
            int docId, int lineLimit) throws IOException {
        int[] maxPassagesCopy = new int[fields.length];
        /**
         * N.b. lineLimit + 1 so that the ContextFormatter has an indication
         * when to display the "more..." link.
         */
        Arrays.fill(maxPassagesCopy, lineLimit + 1);

        FormattedLines res = null;
        Map<String, Object[]> mappedRes = highlightFieldsAsObjects(fields,
            query, new int[]{docId}, maxPassagesCopy);
        for (Object[] flinesz : mappedRes.values()) {
            for (Object obj : flinesz) {
                /**
                 * Empirical testing showed that the passage could be null if
                 * the original source text is not available to the highlighter.
                 */
                if (obj != null) {
                    if (!(obj instanceof FormattedLines)) {
                        return obj.toString();
                    }
                    FormattedLines flines = (FormattedLines) obj;
                    res = res == null ? flines : res.merge(flines);
                }
            }
        }
        if (res == null) {
            return null;
        }
        if (res.getCount() > lineLimit) {
            res.setLimited(true);
            while (res.getCount() > lineLimit) {
                res.pop();
            }
        }
        return res.toString();
    }

    /**
     * Produces original text by reading from OpenGrok source content relative
     * to {@link RuntimeEnvironment#getSourceRootPath()} and returns the content
     * for each document if the timestamp matches -- or else just {@code null}
     * for a missing file or a timestamp mismatch (as "the returned Strings must
     * be identical to what was indexed.")
     * <p>
     * "This method must load fields for at least one document from the given
     * {@link DocIdSetIterator} but need not return all of them; by default the
     * character lengths are summed and this method will return early when
     * {@code cacheCharsThreshold} is exceeded. Specifically if that number is
     * 0, then only one document is fetched no matter what. Values in the array
     * of {@link CharSequence} will be {@code null} if no value was found."
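     * <p>
     * To illustrate with assumed numbers: with {@code cacheCharsThreshold == 0},
     * exactly one document is loaded per call; with a threshold of 10000,
     * documents keep loading until the summed length of their contents first
     * exceeds 10000 characters, leaving any remaining documents for a later
     * call.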
     * @return a defined instance
     * @throws IOException if an I/O error occurs
     */
    @Override
    protected List<CharSequence[]> loadFieldValues(String[] fields,
        DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {

        List<CharSequence[]> docListOfFields = new ArrayList<>(
            cacheCharsThreshold == 0 ? 1 : (int) Math.min(64, docIter.cost()));

        int sumChars = 0;
        do {
            int docId = docIter.nextDoc();
            if (docId == DocIdSetIterator.NO_MORE_DOCS) {
                break;
            }
            Document doc = searcher.doc(docId);

            String path = doc.get(QueryBuilder.PATH);
            String storedU = doc.get(QueryBuilder.U);
            String content = getRepoFileContent(path, storedU);

            CharSequence[] seqs = new CharSequence[fields.length];
            Arrays.fill(seqs, content);
            docListOfFields.add(seqs);

            if (content != null) {
                sumChars += content.length();
            }
        } while (sumChars <= cacheCharsThreshold && cacheCharsThreshold != 0);

        return docListOfFields;
    }

    /**
     * Returns the value from the {@code super} implementation, with logging for
     * ANALYSIS of any field but {@link QueryBuilder#FULL} or
     * {@link QueryBuilder#REFS}.
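     * <p>
     * The FINE-level entry then takes a form like (illustrative field name and
     * offset sources) {@code "Field defs using ANALYSIS vs
     * POSTINGS_WITH_TERM_VECTORS"}.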
     * @return the value from the {@code super} implementation
     */
    @Override
    protected OffsetSource getOptimizedOffsetSource(UHComponents components) {

        OffsetSource res = super.getOptimizedOffsetSource(components);
        String field = components.getField();
        if (res == OffsetSource.ANALYSIS) {
            /**
             *     Testing showed that UnifiedHighlighter falls back to
             * ANALYSIS in the presence of multi-term queries (MTQs) such as
             * prefixes and wildcards even for fields that are analyzed with
             * POSTINGS -- i.e. with DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.
             * This is despite UnifiedHighlighter seeming to indicate that
             * postings should be sufficient in the comment for
             * shouldHandleMultiTermQuery(String): "MTQ highlighting can be
             * expensive, particularly when using offsets in postings."
             *     DEFS are stored with term vectors to avoid this problem,
             * since re-analysis would not accord at all with ctags Definitions.
             *     For FULL and REFS, highlightFieldsUnion() arranges that
             * getIndexAnalyzer() can return a TYPE-specific analyzer for use by
             * getOffsetStrategy() -- if re-ANALYSIS is required.
             */
            switch (field) {
                case QueryBuilder.FULL:
                case QueryBuilder.REFS:
                    // Acceptable -- as described above.
                    break;
                default:
                    if (LOGGER.isLoggable(Level.FINE)) {
                        OffsetSource defaultRes = getOffsetSource(field);
                        LOGGER.log(Level.FINE, "Field {0} using {1} vs {2}",
                            new Object[]{field, res, defaultRes});
                    }
                    break;
            }
        }
        return res;
    }

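    /**
     * Reads the content of {@code repoRelPath} under the source root, but only
     * if the file still exists and its current last-modified timestamp still
     * matches the UID ({@code storedU}) stored in the index; otherwise returns
     * {@code null} so that highlighting treats the original text as
     * unavailable.
     */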
    private String getRepoFileContent(String repoRelPath, String storedU)
            throws IOException {

        if (storedU == null) {
            LOGGER.log(Level.FINE, "Missing U[UID] for: {0}",
                repoRelPath);
            return null;
        }

        String repoAbsPath = env.getSourceRootPath() + repoRelPath;
        File repoAbsFile = new File(repoAbsPath);
        if (!repoAbsFile.exists()) {
            LOGGER.log(Level.FINE, "Missing file: {0}", repoAbsPath);
            return null;
        }

        repoRelPath = Util.fixPathIfWindows(repoRelPath);
        // Verify that the timestamp (U) is unchanged by comparing UIDs.
        String uid = Util.path2uid(repoRelPath,
            DateTools.timeToString(repoAbsFile.lastModified(),
            DateTools.Resolution.MILLISECOND));
        BytesRef buid = new BytesRef(uid);
        BytesRef storedBuid = new BytesRef(storedU);
        if (storedBuid.compareTo(buid) != 0) {
            LOGGER.log(Level.FINE, "Last-modified differs for: {0}",
                repoRelPath);
            return null;
        }

        StringBuilder bld = new StringBuilder();
        StreamSource src = StreamSource.fromFile(repoAbsFile);
        try (InputStream in = src.getStream();
            Reader rdr = getReader(in)) {
            int c;
            while ((c = rdr.read()) != -1) {
                bld.append((char) c);
            }
        }

        return bld.toString();
    }

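    /**
     * Wraps {@code in} as a buffered, BOM-stripped, UTF-8 reader whose tabs
     * are expanded per the configured {@link #getTabSize()}.
     */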
    private Reader getReader(InputStream in) throws IOException {
        Reader bsrdr = IOUtils.createBOMStrippedReader(in,
            StandardCharsets.UTF_8.name());
        BufferedReader bufrdr = new BufferedReader(bsrdr);
        return ExpandTabsReader.wrap(bufrdr, tabSize);
    }
}