1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * See LICENSE.txt included in this distribution for the specific 9 * language governing permissions and limitations under the License. 10 * 11 * When distributing Covered Code, include this CDDL HEADER in each 12 * file and include the License file at LICENSE.txt. 13 * If applicable, add the following below this CDDL HEADER, with the 14 * fields enclosed by brackets "[]" replaced with your own identifying 15 * information: Portions Copyright [yyyy] [name of copyright owner] 16 * 17 * CDDL HEADER END 18 */ 19 20 /* 21 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. 22 * Portions Copyright (c) 2018, Chris Fraire <cfraire@me.com>. 23 */ 24 package org.opengrok.indexer.search; 25 26 import java.io.BufferedReader; 27 import java.io.File; 28 import java.io.FileInputStream; 29 import java.io.FileNotFoundException; 30 import java.io.FileReader; 31 import java.io.IOException; 32 import java.io.InputStreamReader; 33 import java.io.Reader; 34 import java.nio.charset.StandardCharsets; 35 import java.util.ArrayList; 36 import java.util.List; 37 import java.util.SortedSet; 38 import java.util.TreeSet; 39 import java.util.logging.Level; 40 import java.util.logging.Logger; 41 import java.util.zip.GZIPInputStream; 42 import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; 43 import org.apache.lucene.document.Document; 44 import org.apache.lucene.index.DirectoryReader; 45 import org.apache.lucene.index.IndexReader; 46 import org.apache.lucene.index.IndexableField; 47 import org.apache.lucene.index.MultiReader; 48 import org.apache.lucene.queryparser.classic.ParseException; 49 import org.apache.lucene.search.IndexSearcher; 50 import org.apache.lucene.search.Query; 51 import org.apache.lucene.search.ScoreDoc; 52 import org.apache.lucene.search.TopScoreDocCollector; 53 import org.apache.lucene.store.FSDirectory; 54 import org.apache.lucene.util.Version; 55 import org.opengrok.indexer.analysis.AbstractAnalyzer; 56 import org.opengrok.indexer.analysis.CompatibleAnalyser; 57 import org.opengrok.indexer.analysis.Definitions; 58 import org.opengrok.indexer.analysis.Scopes; 59 import org.opengrok.indexer.configuration.Project; 60 import org.opengrok.indexer.configuration.RuntimeEnvironment; 61 import org.opengrok.indexer.configuration.SuperIndexSearcher; 62 import org.opengrok.indexer.history.HistoryException; 63 import org.opengrok.indexer.index.IndexDatabase; 64 import org.opengrok.indexer.logger.LoggerFactory; 65 import org.opengrok.indexer.search.Summary.Fragment; 66 import org.opengrok.indexer.search.context.Context; 67 import org.opengrok.indexer.search.context.HistoryContext; 68 import org.opengrok.indexer.util.Statistics; 69 import org.opengrok.indexer.util.TandemPath; 70 import org.opengrok.indexer.web.Prefix; 71 72 /** 73 * This is an encapsulation of the details on how to search in the index database. 74 * This is used for searching via the REST API. 75 * 76 * @author Trond Norbye 2005 77 * @author Lubos Kosco - upgrade to lucene 3.x, 4.x, 5.x 78 */ 79 public class SearchEngine { 80 81 private static final Logger LOGGER = LoggerFactory.getLogger(SearchEngine.class); 82 83 /** 84 * Message text used when logging exceptions thrown when searching. 85 */ 86 private static final String SEARCH_EXCEPTION_MSG = "Exception searching {0}"; 87 //NOTE below will need to be changed after new lucene upgrade, if they 88 //increase the version - every change of below makes us incompatible with the 89 //old index and we need to ask for reindex 90 /** 91 * Version of Lucene index common for the whole application. 92 */ 93 public static final Version LUCENE_VERSION = Version.LATEST; 94 public static final String LUCENE_VERSION_HELP = LUCENE_VERSION.major + "_" + LUCENE_VERSION.minor + "_" + LUCENE_VERSION.bugfix; 95 /** 96 * Holds value of property definition. 97 */ 98 private String definition; 99 /** 100 * Holds value of property file. 101 */ 102 private String file; 103 /** 104 * Holds value of property freetext. 105 */ 106 private String freetext; 107 /** 108 * Holds value of property history. 109 */ 110 private String history; 111 /** 112 * Holds value of property symbol. 113 */ 114 private String symbol; 115 /** 116 * Holds value of property type. 117 */ 118 private String type; 119 /** 120 * Holds value of property indexDatabase. 121 */ 122 private Query query; 123 private QueryBuilder queryBuilder; 124 private final CompatibleAnalyser analyzer = new CompatibleAnalyser(); 125 private Context sourceContext; 126 private HistoryContext historyContext; 127 private Summarizer summarizer; 128 // internal structure to hold the results from lucene 129 private final List<Document> docs; 130 private final char[] content = new char[1024 * 8]; 131 private String source; 132 private String data; 133 int hitsPerPage = RuntimeEnvironment.getInstance().getHitsPerPage(); 134 int cachePages = RuntimeEnvironment.getInstance().getCachePages(); 135 int totalHits = 0; 136 private ScoreDoc[] hits; 137 private TopScoreDocCollector collector; 138 private IndexSearcher searcher; 139 boolean allCollected; 140 private final ArrayList<SuperIndexSearcher> searcherList = new ArrayList<>(); 141 142 /** 143 * Creates a new instance of SearchEngine. 144 */ SearchEngine()145 public SearchEngine() { 146 docs = new ArrayList<>(); 147 } 148 149 /** 150 * Create a QueryBuilder using the fields that have been set on this 151 * SearchEngine. 152 * 153 * @return a query builder 154 */ createQueryBuilder()155 private QueryBuilder createQueryBuilder() { 156 return new QueryBuilder() 157 .setFreetext(freetext) 158 .setDefs(definition) 159 .setRefs(symbol) 160 .setPath(file) 161 .setHist(history) 162 .setType(type); 163 } 164 isValidQuery()165 public boolean isValidQuery() { 166 boolean ret; 167 try { 168 query = createQueryBuilder().build(); 169 ret = (query != null); 170 } catch (ParseException e) { 171 ret = false; 172 } 173 174 return ret; 175 } 176 177 /** 178 * Search one index. This is used if no projects are set up. 179 * @param paging whether to use paging (if yes, first X pages will load 180 * faster) 181 * @param root which db to search 182 * @throws IOException 183 */ searchSingleDatabase(File root, boolean paging)184 private void searchSingleDatabase(File root, boolean paging) throws IOException { 185 IndexReader ireader = DirectoryReader.open(FSDirectory.open(root.toPath())); 186 searcher = new IndexSearcher(ireader); 187 searchIndex(searcher, paging); 188 } 189 190 /** 191 * Perform search on multiple indexes in parallel. 192 * @param paging whether to use paging (if yes, first X pages will load 193 * faster) 194 * @param root list of projects to search 195 * @throws IOException 196 */ searchMultiDatabase(List<Project> root, boolean paging)197 private void searchMultiDatabase(List<Project> root, boolean paging) throws IOException { 198 SortedSet<String> projects = new TreeSet<>(); 199 for (Project p : root) { 200 projects.add(p.getName()); 201 } 202 203 // We use MultiReader even for single project. This should 204 // not matter given that MultiReader is just a cheap wrapper 205 // around set of IndexReader objects. 206 MultiReader searchables = RuntimeEnvironment.getInstance(). 207 getMultiReader(projects, searcherList); 208 searcher = new IndexSearcher(searchables); 209 searchIndex(searcher, paging); 210 } 211 searchIndex(IndexSearcher searcher, boolean paging)212 private void searchIndex(IndexSearcher searcher, boolean paging) throws IOException { 213 collector = TopScoreDocCollector.create(hitsPerPage * cachePages, Short.MAX_VALUE); 214 Statistics stat = new Statistics(); 215 searcher.search(query, collector); 216 totalHits = collector.getTotalHits(); 217 stat.report(LOGGER, Level.FINEST, "search via SearchEngine done", 218 "search.latency", new String[]{"category", "engine", 219 "outcome", totalHits > 0 ? "success" : "empty"}); 220 if (!paging && totalHits > 0) { 221 collector = TopScoreDocCollector.create(totalHits, Short.MAX_VALUE); 222 searcher.search(query, collector); 223 } 224 hits = collector.topDocs().scoreDocs; 225 for (ScoreDoc hit : hits) { 226 int docId = hit.doc; 227 Document d = searcher.doc(docId); 228 docs.add(d); 229 } 230 } 231 232 /** 233 * Gets the instance from {@code search(...)} if it was called. 234 * @return defined instance or {@code null} 235 */ getQuery()236 public String getQuery() { 237 return query != null ? query.toString() : null; 238 } 239 240 /** 241 * Gets the instance from {@code search(...)} if it was called. 242 * @return defined instance or {@code null} 243 */ getQueryObject()244 public Query getQueryObject() { 245 return query; 246 } 247 248 /** 249 * Gets the builder from {@code search(...)} if it was called. 250 * <p> 251 * (Modifying the builder will have no effect on this 252 * {@link SearchEngine}.) 253 * @return defined instance or {@code null} 254 */ getQueryBuilder()255 public QueryBuilder getQueryBuilder() { 256 return queryBuilder; 257 } 258 259 /** 260 * Gets the searcher from {@code search(...)} if it was called. 261 * @return defined instance or {@code null} 262 */ getSearcher()263 public IndexSearcher getSearcher() { 264 return searcher; 265 } 266 267 /** 268 * Execute a search aware of current request, limited to specific project names. 269 * 270 * This filters out all projects which are not allowed for the current request. 271 * 272 * Before calling this function, 273 * you must set the appropriate search criteria with the set-functions. Note 274 * that this search will return the first cachePages of hitsPerPage, for 275 * more you need to call more. 276 * 277 * Call to search() must be eventually followed by call to destroy() 278 * so that IndexSearcher objects are properly freed. 279 * 280 * @param projects projects to search 281 * @return The number of hits 282 */ search(List<Project> projects)283 public int search(List<Project> projects) { 284 return search(projects, new File(RuntimeEnvironment.getInstance().getDataRootFile(), IndexDatabase.INDEX_DIR)); 285 } 286 287 /** 288 * Execute a search without authorization. 289 * 290 * Before calling this function, you must set the 291 * appropriate search criteria with the set-functions. Note that this search 292 * will return the first cachePages of hitsPerPage, for more you need to 293 * call more. 294 * 295 * Call to search() must be eventually followed by call to destroy() 296 * so that IndexSearcher objects are properly freed. 297 * 298 * @return The number of hits 299 */ search()300 public int search() { 301 RuntimeEnvironment env = RuntimeEnvironment.getInstance(); 302 return search( 303 env.hasProjects() ? env.getProjectList() : new ArrayList<>(), 304 new File(env.getDataRootFile(), IndexDatabase.INDEX_DIR)); 305 } 306 307 /** 308 * Execute a search on projects or root file. 309 * 310 * If @param projects is an empty list it tries to search in @code 311 * searchSingleDatabase with root set to @param root 312 * 313 * Call to search() must be eventually followed by call to destroy() 314 * so that IndexSearcher objects are properly freed. 315 * 316 * @return The number of hits 317 */ search(List<Project> projects, File root)318 private int search(List<Project> projects, File root) { 319 source = RuntimeEnvironment.getInstance().getSourceRootPath(); 320 data = RuntimeEnvironment.getInstance().getDataRootPath(); 321 docs.clear(); 322 323 QueryBuilder newBuilder = createQueryBuilder(); 324 try { 325 query = newBuilder.build(); 326 if (query != null) { 327 328 if (projects.isEmpty()) { 329 // search the index database 330 //NOTE this assumes that src does not contain any project, just 331 // data files - so no authorization can be enforced 332 searchSingleDatabase(root, true); 333 } else { 334 // search all projects 335 //TODO support paging per project (in search.java) 336 //TODO optimize if only one project by falling back to SingleDatabase ? 337 //NOTE projects are already filtered if we accessed through web page @see search(HttpServletRequest) 338 searchMultiDatabase(projects, false); 339 } 340 } 341 } catch (Exception e) { 342 LOGGER.log( 343 Level.WARNING, SEARCH_EXCEPTION_MSG, e); 344 } 345 346 if (!docs.isEmpty()) { 347 sourceContext = null; 348 summarizer = null; 349 try { 350 sourceContext = new Context(query, newBuilder); 351 if (sourceContext.isEmpty()) { 352 sourceContext = null; 353 } 354 summarizer = new Summarizer(query, analyzer); 355 } catch (Exception e) { 356 LOGGER.log(Level.WARNING, "An error occurred while creating summary", e); 357 } 358 359 historyContext = null; 360 try { 361 historyContext = new HistoryContext(query); 362 if (historyContext.isEmpty()) { 363 historyContext = null; 364 } 365 } catch (Exception e) { 366 LOGGER.log(Level.WARNING, "An error occurred while getting history context", e); 367 } 368 } 369 int count = hits == null ? 0 : hits.length; 370 queryBuilder = newBuilder; 371 return count; 372 } 373 374 /** 375 * Gets the queried score docs from {@code search(...)} if it was called. 376 * @return a defined instance if a query succeeded, or {@code null} 377 */ scoreDocs()378 public ScoreDoc[] scoreDocs() { 379 return hits; 380 } 381 382 /** 383 * Gets the document of the specified {@code docId} from 384 * {@code search(...)} if it was called. 385 * 386 * @param docId document ID 387 * @return a defined instance if a query succeeded 388 * @throws java.io.IOException if an error occurs obtaining the Lucene 389 * document by ID 390 */ doc(int docId)391 public Document doc(int docId) throws IOException { 392 if (searcher == null) { 393 throw new IllegalStateException("search(...) did not succeed"); 394 } 395 return searcher.doc(docId); 396 } 397 398 /** 399 * Get results , if no search was started before, no results are returned. 400 * This method will requery if {@code end} is more than first query from search, 401 * hence performance hit applies, if you want results in later pages than 402 * number of cachePages. {@code end} has to be bigger than {@code start} ! 403 * 404 * @param start start of the hit list 405 * @param end end of the hit list 406 * @param ret list of results from start to end or null/empty if no search 407 * was started 408 */ results(int start, int end, List<Hit> ret)409 public void results(int start, int end, List<Hit> ret) { 410 411 //return if no start search() was done 412 if (hits == null || (end < start)) { 413 ret.clear(); 414 return; 415 } 416 417 ret.clear(); 418 419 // TODO check if below fits for if end=old hits.length, or it should include it 420 if (end > hits.length && !allCollected) { 421 //do the requery, we want more than 5 pages 422 collector = TopScoreDocCollector.create(totalHits, Short.MAX_VALUE); 423 try { 424 searcher.search(query, collector); 425 } catch (Exception e) { // this exception should never be hit, since search() will hit this before 426 LOGGER.log( 427 Level.WARNING, SEARCH_EXCEPTION_MSG, e); 428 } 429 hits = collector.topDocs().scoreDocs; 430 Document d = null; 431 for (int i = start; i < hits.length; i++) { 432 int docId = hits[i].doc; 433 try { 434 d = searcher.doc(docId); 435 } catch (Exception e) { 436 LOGGER.log( 437 Level.SEVERE, SEARCH_EXCEPTION_MSG, e); 438 } 439 docs.add(d); 440 } 441 allCollected = true; 442 } 443 444 //TODO generation of ret(results) could be cashed and consumers of engine would just print them in whatever 445 // form they need, this way we could get rid of docs 446 // the only problem is that count of docs is usually smaller than number of results 447 for (int ii = start; ii < end; ++ii) { 448 boolean alt = (ii % 2 == 0); 449 boolean hasContext = false; 450 try { 451 Document doc = docs.get(ii); 452 String filename = doc.get(QueryBuilder.PATH); 453 454 AbstractAnalyzer.Genre genre = AbstractAnalyzer.Genre.get(doc.get(QueryBuilder.T)); 455 Definitions tags = null; 456 IndexableField tagsField = doc.getField(QueryBuilder.TAGS); 457 if (tagsField != null) { 458 tags = Definitions.deserialize(tagsField.binaryValue().bytes); 459 } 460 Scopes scopes = null; 461 IndexableField scopesField = doc.getField(QueryBuilder.SCOPES); 462 if (scopesField != null) { 463 scopes = Scopes.deserialize(scopesField.binaryValue().bytes); 464 } 465 int nhits = docs.size(); 466 467 if (sourceContext != null) { 468 sourceContext.toggleAlt(); 469 try { 470 if (AbstractAnalyzer.Genre.PLAIN == genre && (source != null)) { 471 // SRCROOT is read with UTF-8 as a default. 472 hasContext = sourceContext.getContext( 473 new InputStreamReader(new FileInputStream( 474 source + filename), StandardCharsets.UTF_8), 475 null, null, null, filename, tags, nhits > 100, 476 getDefinition() != null, ret, scopes); 477 } else if (AbstractAnalyzer.Genre.XREFABLE == genre && data != null && summarizer != null) { 478 int l; 479 /** 480 * For backward compatibility, read the 481 * OpenGrok-produced document using the system 482 * default charset. 483 */ 484 try (Reader r = RuntimeEnvironment.getInstance().isCompressXref() 485 ? new HTMLStripCharFilter(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream( 486 TandemPath.join(data + Prefix.XREF_P + filename, ".gz")))))) 487 : new HTMLStripCharFilter(new BufferedReader(new FileReader(data + Prefix.XREF_P + filename)))) { 488 l = r.read(content); 489 } 490 //TODO FIX below fragmenter according to either summarizer or context 491 // (to get line numbers, might be hard, since xref writers will need to be fixed too, 492 // they generate just one line of html code now :( ) 493 Summary sum = summarizer.getSummary(new String(content, 0, l)); 494 Fragment[] fragments = sum.getFragments(); 495 for (Fragment fragment : fragments) { 496 String match = fragment.toString(); 497 if (match.length() > 0) { 498 if (!fragment.isEllipsis()) { 499 Hit hit = new Hit(filename, fragment.toString(), "", true, alt); 500 ret.add(hit); 501 } 502 hasContext = true; 503 } 504 } 505 } else { 506 LOGGER.log(Level.WARNING, "Unknown genre: {0} for {1}", new Object[]{genre, filename}); 507 hasContext |= sourceContext.getContext(null, null, null, null, filename, tags, false, false, ret, scopes); 508 } 509 } catch (FileNotFoundException exp) { 510 LOGGER.log(Level.WARNING, "Couldn''t read summary from {0} ({1})", new Object[]{filename, exp.getMessage()}); 511 hasContext |= sourceContext.getContext(null, null, null, null, filename, tags, false, false, ret, scopes); 512 } 513 } 514 if (historyContext != null) { 515 hasContext |= historyContext.getContext(source + filename, filename, ret); 516 } 517 if (!hasContext) { 518 ret.add(new Hit(filename, "...", "", false, alt)); 519 } 520 } catch (IOException | ClassNotFoundException | HistoryException e) { 521 LOGGER.log( 522 Level.WARNING, SEARCH_EXCEPTION_MSG, e); 523 } 524 } 525 } 526 destroy()527 public void destroy() { 528 for (SuperIndexSearcher is : searcherList) { 529 try { 530 is.getSearcherManager().release(is); 531 } catch (IOException ex) { 532 LOGGER.log(Level.WARNING, "cannot release indexSearcher", ex); 533 } 534 } 535 } 536 537 /** 538 * Getter for property definition. 539 * 540 * @return Value of property definition. 541 */ getDefinition()542 public String getDefinition() { 543 return this.definition; 544 } 545 546 /** 547 * Setter for property definition. 548 * 549 * @param definition New value of property definition. 550 */ setDefinition(String definition)551 public void setDefinition(String definition) { 552 this.definition = definition; 553 } 554 555 /** 556 * Getter for property file. 557 * 558 * @return Value of property file. 559 */ getFile()560 public String getFile() { 561 return this.file; 562 } 563 564 /** 565 * Setter for property file. 566 * 567 * @param file New value of property file. 568 */ setFile(String file)569 public void setFile(String file) { 570 this.file = file; 571 } 572 573 /** 574 * Getter for property freetext. 575 * 576 * @return Value of property freetext. 577 */ getFreetext()578 public String getFreetext() { 579 return this.freetext; 580 } 581 582 /** 583 * Setter for property freetext. 584 * 585 * @param freetext New value of property freetext. 586 */ setFreetext(String freetext)587 public void setFreetext(String freetext) { 588 this.freetext = freetext; 589 } 590 591 /** 592 * Getter for property history. 593 * 594 * @return Value of property history. 595 */ getHistory()596 public String getHistory() { 597 return this.history; 598 } 599 600 /** 601 * Setter for property history. 602 * 603 * @param history New value of property history. 604 */ setHistory(String history)605 public void setHistory(String history) { 606 this.history = history; 607 } 608 609 /** 610 * Getter for property symbol. 611 * 612 * @return Value of property symbol. 613 */ getSymbol()614 public String getSymbol() { 615 return this.symbol; 616 } 617 618 /** 619 * Setter for property symbol. 620 * 621 * @param symbol New value of property symbol. 622 */ setSymbol(String symbol)623 public void setSymbol(String symbol) { 624 this.symbol = symbol; 625 } 626 627 /** 628 * Getter for property type. 629 * 630 * @return Value of property type. 631 */ getType()632 public String getType() { 633 return this.type; 634 } 635 636 /** 637 * Setter for property type. 638 * 639 * @param fileType New value of property type. 640 */ setType(String fileType)641 public void setType(String fileType) { 642 this.type = fileType; 643 } 644 } 645