/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Portions Copyright (c) 2018, Chris Fraire <cfraire@me.com>.
 */

package org.opengrok.indexer.search.context;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.uhighlight.UHComponents;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.util.BytesRef;
import org.opengrok.indexer.analysis.AnalyzerGuru;
import org.opengrok.indexer.analysis.ExpandTabsReader;
import org.opengrok.indexer.analysis.StreamSource;
import org.opengrok.indexer.configuration.RuntimeEnvironment;
import org.opengrok.indexer.logger.LoggerFactory;
import org.opengrok.indexer.search.QueryBuilder;
import org.opengrok.indexer.util.IOUtils;
import org.opengrok.indexer.web.Util;

/**
 * Represents a subclass of {@link UnifiedHighlighter} with customizations for
 * OpenGrok.
 */
public class OGKUnifiedHighlighter extends UnifiedHighlighter {

    private static final Logger LOGGER = LoggerFactory.getLogger(
            OGKUnifiedHighlighter.class);

    private final RuntimeEnvironment env;

    private int tabSize;

    private String fileTypeName;

    /**
     * Initializes an instance with
     * {@link UnifiedHighlighter#UnifiedHighlighter(org.apache.lucene.search.IndexSearcher, org.apache.lucene.analysis.Analyzer)}
     * for the specified {@code indexSearcher} and {@code indexAnalyzer}, and
     * stores the {@code env} for later use.
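     * <p>A minimal usage sketch (the {@code searcher}, {@code analyzer},
     * {@code query}, and {@code docId} here are assumed to come from an
     * ordinary Lucene search of an OpenGrok index):
     * <pre>{@code
     * OGKUnifiedHighlighter uhi = new OGKUnifiedHighlighter(
     *         RuntimeEnvironment.getInstance(), searcher, analyzer);
     * uhi.setTabSize(4);
     * String html = uhi.highlightFieldsUnion(
     *         new String[] {QueryBuilder.FULL}, query, docId, 10);
     * }</pre>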
     * @param env a required instance
     * @param indexSearcher a required instance
     * @param indexAnalyzer a required instance
     * @throws IllegalArgumentException if any argument is null
     */
    public OGKUnifiedHighlighter(RuntimeEnvironment env,
            IndexSearcher indexSearcher, Analyzer indexAnalyzer) {
        super(indexSearcher, indexAnalyzer);

        if (env == null) {
            throw new IllegalArgumentException("env is null");
        }
        this.env = env;
    }

    /**
     * Gets a file type name-specific analyzer during the execution of
     * {@link #highlightFieldsUnion(java.lang.String[], org.apache.lucene.search.Query, int, int)},
     * or just gets the object passed in to the constructor at all other times.
     * @return a defined instance
     */
    @Override
    public Analyzer getIndexAnalyzer() {
        String ftname = fileTypeName;
        if (ftname == null) {
            return indexAnalyzer;
        }
        Analyzer fa = AnalyzerGuru.getAnalyzer(ftname);
        return fa == null ? indexAnalyzer : fa;
    }

    public int getTabSize() {
        return tabSize;
    }

    public void setTabSize(int value) {
        this.tabSize = value;
    }

    /**
     * Transiently arranges that {@link #getIndexAnalyzer()} returns a file
     * type name-specific analyzer during a subsequent call of
     * {@link #highlightFieldsUnionWork(java.lang.String[], org.apache.lucene.search.Query, int, int)}.
     * @param fields a defined instance
     * @param query a defined instance
     * @param docId a valid document ID
     * @param lineLimit the maximum number of lines to return
     * @return a defined instance or else {@code null} if there are no results
     * @throws IOException if accessing the Lucene document fails
     */
    public String highlightFieldsUnion(String[] fields, Query query,
            int docId, int lineLimit) throws IOException {
        /**
         * Setting fileTypeName has to happen before getFieldHighlighter() is
         * called by highlightFieldsAsObjects() so that the result of
         * getIndexAnalyzer() (if it is called due to requiring ANALYSIS) can
         * be influenced by fileTypeName.
         */
        Document doc = searcher.doc(docId);
        fileTypeName = doc == null ? null : doc.get(QueryBuilder.TYPE);
        try {
            return highlightFieldsUnionWork(fields, query, docId, lineLimit);
        } finally {
            fileTypeName = null;
        }
    }

    /**
     * Calls
     * {@link #highlightFieldsAsObjects(java.lang.String[], org.apache.lucene.search.Query, int[], int[])},
     * and merges the passages if the formatter returns
     * {@link FormattedLines}, or else returns the first formatted result.
     * @param fields a defined instance
     * @param query a defined instance
     * @param docId a valid document ID
     * @param lineLimit the maximum number of lines to return
     * @return a defined instance or else {@code null} if there are no results
     * @throws IOException if accessing the Lucene document fails
     */
    protected String highlightFieldsUnionWork(String[] fields, Query query,
            int docId, int lineLimit) throws IOException {
        int[] maxPassagesCopy = new int[fields.length];
        /**
         * N.b. lineLimit + 1 so that the ContextFormatter has an indication
         * when to display the "more..." link.
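         * For example, with lineLimit = 10 the highlighter is asked for up
         * to eleven passages; if an eleventh line actually materializes, the
         * result is marked limited below and trimmed back down to ten lines.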
         */
        Arrays.fill(maxPassagesCopy, lineLimit + 1);

        FormattedLines res = null;
        Map<String, Object[]> mappedRes = highlightFieldsAsObjects(fields,
                query, new int[]{docId}, maxPassagesCopy);
        for (Object[] flinesz : mappedRes.values()) {
            for (Object obj : flinesz) {
                /**
                 * Empirical testing showed that the passage could be null if
                 * the original source text is not available to the
                 * highlighter.
                 */
                if (obj != null) {
                    if (!(obj instanceof FormattedLines)) {
                        return obj.toString();
                    }
                    FormattedLines flines = (FormattedLines) obj;
                    res = res == null ? flines : res.merge(flines);
                }
            }
        }
        if (res == null) {
            return null;
        }
        if (res.getCount() > lineLimit) {
            res.setLimited(true);
            while (res.getCount() > lineLimit) {
                res.pop();
            }
        }
        return res.toString();
    }

    /**
     * Produces original text by reading from OpenGrok source content relative
     * to {@link RuntimeEnvironment#getSourceRootPath()} and returns the
     * content for each document if the timestamp matches -- or else just
     * {@code null} for a missing file or a timestamp mismatch (as "the
     * returned Strings must be identical to what was indexed.")
     * <p>
     * "This method must load fields for at least one document from the given
     * {@link DocIdSetIterator} but need not return all of them; by default
     * the character lengths are summed and this method will return early when
     * {@code cacheCharsThreshold} is exceeded. Specifically if that number is
     * 0, then only one document is fetched no matter what. Values in the
     * array of {@link CharSequence} will be {@code null} if no value was
     * found."
     * @return a defined instance
     * @throws IOException if an I/O error occurs
     */
    @Override
    protected List<CharSequence[]> loadFieldValues(String[] fields,
            DocIdSetIterator docIter, int cacheCharsThreshold)
            throws IOException {

        List<CharSequence[]> docListOfFields = new ArrayList<>(
                cacheCharsThreshold == 0 ? 1 : (int) Math.min(64,
                docIter.cost()));

        int sumChars = 0;
        do {
            int docId = docIter.nextDoc();
            if (docId == DocIdSetIterator.NO_MORE_DOCS) {
                break;
            }
            Document doc = searcher.doc(docId);

            String path = doc.get(QueryBuilder.PATH);
            String storedU = doc.get(QueryBuilder.U);
            String content = getRepoFileContent(path, storedU);

            CharSequence[] seqs = new CharSequence[fields.length];
            Arrays.fill(seqs, content);
            docListOfFields.add(seqs);

            if (content != null) {
                sumChars += content.length();
            }
        } while (sumChars <= cacheCharsThreshold && cacheCharsThreshold != 0);

        return docListOfFields;
    }

    /**
     * Returns the value from the {@code super} implementation, with logging
     * for ANALYSIS of any field but {@link QueryBuilder#FULL} or
     * {@link QueryBuilder#REFS}.
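     * <p>For example, a wildcard term against {@link QueryBuilder#DEFS} that
     * unexpectedly resolved to ANALYSIS rather than to the field's stored
     * term vectors would be reported here at {@link Level#FINE}.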
     * @return the value from the {@code super} implementation
     */
    @Override
    protected OffsetSource getOptimizedOffsetSource(UHComponents components) {

        OffsetSource res = super.getOptimizedOffsetSource(components);
        String field = components.getField();
        if (res == OffsetSource.ANALYSIS) {
            /**
             * Testing showed that UnifiedHighlighter falls back to
             * ANALYSIS in the presence of multi-term queries (MTQs) such as
             * prefixes and wildcards even for fields that are analyzed with
             * POSTINGS -- i.e. with DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.
             * This is despite UnifiedHighlighter seeming to indicate that
             * postings should be sufficient in the comment for
             * shouldHandleMultiTermQuery(String): "MTQ highlighting can be
             * expensive, particularly when using offsets in postings."
             * DEFS are stored with term vectors to avoid this problem, since
             * re-analysis would not at all accord with ctags Definitions.
             * For FULL and REFS, highlightFieldsUnion() arranges that
             * getIndexAnalyzer() can return a TYPE-specific analyzer for use
             * by getOffsetStrategy() -- if re-ANALYSIS is required.
             */
            switch (field) {
                case QueryBuilder.FULL:
                case QueryBuilder.REFS:
                    // Acceptable -- as described above.
                    break;
                default:
                    if (LOGGER.isLoggable(Level.FINE)) {
                        OffsetSource defaultRes = getOffsetSource(field);
                        LOGGER.log(Level.FINE, "Field {0} using {1} vs {2}",
                                new Object[]{field, res, defaultRes});
                    }
                    break;
            }
        }
        return res;
    }

    private String getRepoFileContent(String repoRelPath, String storedU)
            throws IOException {

        if (storedU == null) {
            LOGGER.log(Level.FINE, "Missing U[UID] for: {0}",
                    repoRelPath);
            return null;
        }

        String repoAbsPath = env.getSourceRootPath() + repoRelPath;
        File repoAbsFile = new File(repoAbsPath);
        if (!repoAbsFile.exists()) {
            LOGGER.log(Level.FINE, "Missing file: {0}", repoAbsPath);
            return null;
        }

        repoRelPath = Util.fixPathIfWindows(repoRelPath);
        // Verify that the stored timestamp (U) is unchanged by comparing UIDs.
        String uid = Util.path2uid(repoRelPath,
                DateTools.timeToString(repoAbsFile.lastModified(),
                DateTools.Resolution.MILLISECOND));
        BytesRef buid = new BytesRef(uid);
        BytesRef storedBuid = new BytesRef(storedU);
        if (storedBuid.compareTo(buid) != 0) {
            LOGGER.log(Level.FINE, "Last-modified differs for: {0}",
                    repoRelPath);
            return null;
        }

        StringBuilder bld = new StringBuilder();
        StreamSource src = StreamSource.fromFile(repoAbsFile);
        try (InputStream in = src.getStream();
                Reader rdr = getReader(in)) {
            int c;
            while ((c = rdr.read()) != -1) {
                bld.append((char) c);
            }
        }

        return bld.toString();
    }

    private Reader getReader(InputStream in) throws IOException {
        Reader bsrdr = IOUtils.createBOMStrippedReader(in,
                StandardCharsets.UTF_8.name());
        BufferedReader bufrdr = new BufferedReader(bsrdr);
        return ExpandTabsReader.wrap(bufrdr, tabSize);
    }
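
    /**
     * A hypothetical, illustrative helper (a sketch, not part of OpenGrok):
     * rebuilds the UID that {@code getRepoFileContent()} compares against the
     * stored {@code U} value, i.e. the repo-relative path joined with the
     * file's last-modified time at millisecond resolution.
     */
    @SuppressWarnings("unused")
    private static String exampleUid(String repoRelPath, File repoAbsFile) {
        // Same construction as in getRepoFileContent() above.
        return Util.path2uid(repoRelPath, DateTools.timeToString(
                repoAbsFile.lastModified(), DateTools.Resolution.MILLISECOND));
    }
}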