/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2018, Chris Fraire <cfraire@me.com>.
 */
package org.opengrok.indexer.search;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.opengrok.indexer.analysis.AbstractAnalyzer;
import org.opengrok.indexer.analysis.CompatibleAnalyser;
import org.opengrok.indexer.analysis.Definitions;
import org.opengrok.indexer.analysis.Scopes;
import org.opengrok.indexer.configuration.Project;
import org.opengrok.indexer.configuration.RuntimeEnvironment;
import org.opengrok.indexer.configuration.SuperIndexSearcher;
import org.opengrok.indexer.history.HistoryException;
import org.opengrok.indexer.index.IndexDatabase;
import org.opengrok.indexer.logger.LoggerFactory;
import org.opengrok.indexer.search.Summary.Fragment;
import org.opengrok.indexer.search.context.Context;
import org.opengrok.indexer.search.context.HistoryContext;
import org.opengrok.indexer.util.Statistics;
import org.opengrok.indexer.util.TandemPath;
import org.opengrok.indexer.web.Prefix;

/**
 * This is an encapsulation of the details on how to search in the index database.
 * This is used for searching via the REST API.
 * <p>
 * Typical usage: set the search criteria with the setters, call one of the
 * {@code search(...)} methods, read the hits with
 * {@link #results(int, int, List)} and finally call {@link #destroy()}.
 * <p>
 * An instance keeps mutable per-search state (including a shared read buffer),
 * so it is presumably meant to be used by one caller at a time.
 *
 * @author Trond Norbye 2005
 * @author Lubos Kosco - upgrade to lucene 3.x, 4.x, 5.x
 */
public class SearchEngine {

    private static final Logger LOGGER = LoggerFactory.getLogger(SearchEngine.class);

    /**
     * Message pattern ({@link MessageFormat} syntax) used when logging
     * exceptions thrown when searching; {@code {0}} is the query.
     */
    private static final String SEARCH_EXCEPTION_MSG = "Exception searching {0}";
    //NOTE below will need to be changed after new lucene upgrade, if they
    //increase the version - every change of below makes us incompatible with the
    //old index and we need to ask for reindex
    /**
     * Version of Lucene index common for the whole application.
     */
    public static final Version LUCENE_VERSION = Version.LATEST;
    /**
     * {@link #LUCENE_VERSION} rendered as {@code major_minor_bugfix}.
     */
    public static final String LUCENE_VERSION_HELP =
            LUCENE_VERSION.major + "_" + LUCENE_VERSION.minor + "_" + LUCENE_VERSION.bugfix;
    /**
     * Holds value of property definition.
     */
    private String definition;
    /**
     * Holds value of property file.
     */
    private String file;
    /**
     * Holds value of property freetext.
     */
    private String freetext;
    /**
     * Holds value of property history.
     */
    private String history;
    /**
     * Holds value of property symbol.
     */
    private String symbol;
    /**
     * Holds value of property type.
     */
    private String type;
    /**
     * The query built from the search criteria by the last call to
     * {@code search(...)} or {@link #isValidQuery()}.
     */
    private Query query;
    private QueryBuilder queryBuilder;
    private final CompatibleAnalyser analyzer = new CompatibleAnalyser();
    private Context sourceContext;
    private HistoryContext historyContext;
    private Summarizer summarizer;
    // internal structure to hold the results from lucene; index i corresponds to hits[i]
    private final List<Document> docs;
    // scratch buffer for reading the beginning of an xref file
    private final char[] content = new char[1024 * 8];
    private String source;
    private String data;
    int hitsPerPage = RuntimeEnvironment.getInstance().getHitsPerPage();
    int cachePages = RuntimeEnvironment.getInstance().getCachePages();
    int totalHits = 0;
    private ScoreDoc[] hits;
    private TopScoreDocCollector collector;
    private IndexSearcher searcher;
    boolean allCollected;
    private final ArrayList<SuperIndexSearcher> searcherList = new ArrayList<>();
    // readers/directories opened by searchSingleDatabase(); closed in destroy()
    private final List<Closeable> singleDatabaseResources = new ArrayList<>();

    /**
     * Creates a new instance of SearchEngine.
     */
    public SearchEngine() {
        docs = new ArrayList<>();
    }

    /**
     * Create a QueryBuilder using the fields that have been set on this
     * SearchEngine.
     *
     * @return a query builder
     */
    private QueryBuilder createQueryBuilder() {
        return new QueryBuilder()
                .setFreetext(freetext)
                .setDefs(definition)
                .setRefs(symbol)
                .setPath(file)
                .setHist(history)
                .setType(type);
    }

    /**
     * Checks whether the currently set search criteria produce a query.
     * <p>
     * Note that this stores the built query in this instance as a side effect.
     *
     * @return {@code true} if a non-null query could be built,
     * {@code false} if the builder returned {@code null} or parsing failed
     */
    public boolean isValidQuery() {
        try {
            query = createQueryBuilder().build();
            return query != null;
        } catch (ParseException e) {
            return false;
        }
    }

    /**
     * Logs an exception raised while searching, including the current query
     * in the message.
     * <p>
     * {@code Logger.log(Level, String, Throwable)} does not substitute
     * {@code {0}} placeholders, so the message is formatted here.
     *
     * @param level log level to use
     * @param thrown the exception to log
     */
    private void logSearchException(Level level, Throwable thrown) {
        if (LOGGER.isLoggable(level)) {
            LOGGER.log(level, MessageFormat.format(SEARCH_EXCEPTION_MSG, query), thrown);
        }
    }

    /**
     * Search one index. This is used if no projects are set up.
     * <p>
     * The opened reader and directory are remembered and closed by
     * {@link #destroy()}.
     *
     * @param root which db to search
     * @param paging whether to use paging (if yes, first X pages will load
     * faster)
     * @throws IOException if the index cannot be opened or read
     */
    private void searchSingleDatabase(File root, boolean paging) throws IOException {
        FSDirectory directory = FSDirectory.open(root.toPath());
        IndexReader ireader;
        try {
            ireader = DirectoryReader.open(directory);
        } catch (IOException | RuntimeException e) {
            // do not leak the directory if there is no usable index in it
            try {
                directory.close();
            } catch (IOException closeEx) {
                e.addSuppressed(closeEx);
            }
            throw e;
        }
        // close order in destroy(): reader first, then its directory
        singleDatabaseResources.add(ireader);
        singleDatabaseResources.add(directory);

        searcher = new IndexSearcher(ireader);
        searchIndex(searcher, paging);
    }

    /**
     * Perform search on multiple indexes in parallel.
     *
     * @param root list of projects to search
     * @param paging whether to use paging (if yes, first X pages will load
     * faster)
     * @throws IOException if the indexes cannot be opened or read
     */
    private void searchMultiDatabase(List<Project> root, boolean paging) throws IOException {
        SortedSet<String> projects = new TreeSet<>();
        for (Project p : root) {
            projects.add(p.getName());
        }

        // We use MultiReader even for single project. This should
        // not matter given that MultiReader is just a cheap wrapper
        // around set of IndexReader objects.
        MultiReader searchables = RuntimeEnvironment.getInstance().
                getMultiReader(projects, searcherList);
        searcher = new IndexSearcher(searchables);
        searchIndex(searcher, paging);
    }

    /**
     * Runs the current query against the given searcher and stores the
     * resulting hits and their documents.
     * <p>
     * With paging only the first {@code hitsPerPage * cachePages} hits are
     * fetched; without paging the query is run a second time sized to the
     * total number of hits.
     *
     * @param searcher searcher to query
     * @param paging whether to fetch only the first cached pages
     * @throws IOException if searching or loading a document fails
     */
    private void searchIndex(IndexSearcher searcher, boolean paging) throws IOException {
        collector = TopScoreDocCollector.create(hitsPerPage * cachePages, Short.MAX_VALUE);
        Statistics stat = new Statistics();
        searcher.search(query, collector);
        totalHits = collector.getTotalHits();
        stat.report(LOGGER, Level.FINEST, "search via SearchEngine done",
                "search.latency", new String[]{"category", "engine",
                        "outcome", totalHits > 0 ? "success" : "empty"});
        if (!paging && totalHits > 0) {
            collector = TopScoreDocCollector.create(totalHits, Short.MAX_VALUE);
            searcher.search(query, collector);
        }

        // Fetch into a local list first so that hits and docs are only
        // published together, i.e. they never get out of step on failure.
        ScoreDoc[] found = collector.topDocs().scoreDocs;
        List<Document> fetched = new ArrayList<>(found.length);
        for (ScoreDoc hit : found) {
            fetched.add(searcher.doc(hit.doc));
        }
        hits = found;
        docs.addAll(fetched);
        // nothing left to requery if we already hold every hit
        allCollected = found.length >= totalHits;
    }

    /**
     * Gets the string form of the query from {@code search(...)} if it was called.
     * @return the query as a string or {@code null} if there is no query
     */
    public String getQuery() {
        return query != null ? query.toString() : null;
    }

    /**
     * Gets the instance from {@code search(...)} if it was called.
     * @return defined instance or {@code null}
     */
    public Query getQueryObject() {
        return query;
    }

    /**
     * Gets the builder from {@code search(...)} if it was called.
     * <p>
     * (Modifying the builder will have no effect on this
     * {@link SearchEngine}.)
     * @return defined instance or {@code null}
     */
    public QueryBuilder getQueryBuilder() {
        return queryBuilder;
    }

    /**
     * Gets the searcher from {@code search(...)} if it was called.
     * @return defined instance or {@code null}
     */
    public IndexSearcher getSearcher() {
        return searcher;
    }

    /**
     * Execute a search aware of current request, limited to specific project names.
     *
     * This filters out all projects which are not allowed for the current request.
     *
     * Before calling this function,
     * you must set the appropriate search criteria with the set-functions. Note
     * that this search will return the first cachePages of hitsPerPage, for
     * more you need to call more.
     *
     * Call to search() must be eventually followed by call to destroy()
     * so that IndexSearcher objects are properly freed.
     *
     * @param projects projects to search
     * @return The number of hits
     */
    public int search(List<Project> projects) {
        return search(projects, new File(RuntimeEnvironment.getInstance().getDataRootFile(), IndexDatabase.INDEX_DIR));
    }

    /**
     * Execute a search without authorization.
     *
     * Before calling this function, you must set the
     * appropriate search criteria with the set-functions. Note that this search
     * will return the first cachePages of hitsPerPage, for more you need to
     * call more.
     *
     * Call to search() must be eventually followed by call to destroy()
     * so that IndexSearcher objects are properly freed.
     *
     * @return The number of hits
     */
    public int search() {
        RuntimeEnvironment env = RuntimeEnvironment.getInstance();
        return search(
                env.hasProjects() ? env.getProjectList() : new ArrayList<>(),
                new File(env.getDataRootFile(), IndexDatabase.INDEX_DIR));
    }

    /**
     * Execute a search on projects or root file.
     *
     * If {@code projects} is an empty list the single index under
     * {@code root} is searched, otherwise the indexes of the given projects.
     *
     * Call to search() must be eventually followed by call to destroy()
     * so that IndexSearcher objects are properly freed.
     *
     * @param projects projects to search, may be empty
     * @param root index directory used when {@code projects} is empty
     * @return The number of hits
     */
    private int search(List<Project> projects, File root) {
        RuntimeEnvironment env = RuntimeEnvironment.getInstance();
        source = env.getSourceRootPath();
        data = env.getDataRootPath();

        // Forget everything from a previous search so that a failed or empty
        // search cannot report the previous search's hits.
        docs.clear();
        hits = null;
        totalHits = 0;
        allCollected = false;
        sourceContext = null;
        summarizer = null;
        historyContext = null;

        QueryBuilder newBuilder = createQueryBuilder();
        try {
            query = newBuilder.build();
            if (query != null) {

                if (projects.isEmpty()) {
                    // search the index database
                    //NOTE this assumes that src does not contain any project, just
                    // data files - so no authorization can be enforced
                    searchSingleDatabase(root, true);
                } else {
                    // search all projects
                    //TODO support paging per project (in search.java)
                    //TODO optimize if only one project by falling back to SingleDatabase ?
                    //NOTE projects are already filtered if we accessed through web page @see search(HttpServletRequest)
                    searchMultiDatabase(projects, false);
                }
            }
        } catch (Exception e) {
            logSearchException(Level.WARNING, e);
        }

        if (!docs.isEmpty()) {
            try {
                sourceContext = new Context(query, newBuilder);
                if (sourceContext.isEmpty()) {
                    sourceContext = null;
                }
                summarizer = new Summarizer(query, analyzer);
            } catch (Exception e) {
                LOGGER.log(Level.WARNING, "An error occurred while creating summary", e);
            }

            try {
                historyContext = new HistoryContext(query);
                if (historyContext.isEmpty()) {
                    historyContext = null;
                }
            } catch (Exception e) {
                LOGGER.log(Level.WARNING, "An error occurred while getting history context", e);
            }
        }
        int count = hits == null ? 0 : hits.length;
        queryBuilder = newBuilder;
        return count;
    }

    /**
     * Gets the queried score docs from {@code search(...)} if it was called.
     * @return a defined instance if a query succeeded, or {@code null}
     */
    public ScoreDoc[] scoreDocs() {
        return hits;
    }

    /**
     * Gets the document of the specified {@code docId} from
     * {@code search(...)} if it was called.
     *
     * @param docId document ID
     * @return a defined instance if a query succeeded
     * @throws java.io.IOException if an error occurs obtaining the Lucene
     * document by ID
     * @throws IllegalStateException if no searcher is available
     */
    public Document doc(int docId) throws IOException {
        if (searcher == null) {
            throw new IllegalStateException("search(...) did not succeed");
        }
        return searcher.doc(docId);
    }

    /**
     * Get results , if no search was started before, no results are returned.
     * This method will requery if {@code end} is more than first query from search,
     * hence performance hit applies, if you want results in later pages than
     * number of cachePages. {@code end} has to be bigger than {@code start} !
     * <p>
     * The requested range is clamped to the documents actually available.
     *
     * @param start start of the hit list (inclusive)
     * @param end end of the hit list (exclusive)
     * @param ret list that is cleared and then filled with the results from
     * start to end; left empty if no search was started
     */
    public void results(int start, int end, List<Hit> ret) {
        ret.clear();

        //return if no start search() was done
        if (hits == null || end < start) {
            return;
        }

        if (end > hits.length && !allCollected) {
            //do the requery, we want more than the cached pages
            collectAllHits();
        }

        //TODO generation of ret(results) could be cashed and consumers of engine would just print them in whatever
        // form they need, this way we could get rid of docs
        // the only problem is that count of docs is usually smaller than number of results
        int first = Math.max(start, 0);
        int last = Math.min(end, docs.size());
        for (int ii = first; ii < last; ++ii) {
            Document doc = docs.get(ii);
            if (doc == null) {
                // the document could not be loaded during the requery
                continue;
            }
            try {
                addHitsForDocument(doc, ii % 2 == 0, ret);
            } catch (IOException | ClassNotFoundException | HistoryException e) {
                logSearchException(Level.WARNING, e);
            }
        }
    }

    /**
     * Re-runs the query sized to {@code totalHits} and loads the documents
     * that are not held in {@code docs} yet.
     * <p>
     * If the query fails, the previously collected hits are left untouched.
     */
    private void collectAllHits() {
        if (searcher == null || totalHits <= 0) {
            // nothing to requery; the collector rejects a non-positive size
            return;
        }

        TopScoreDocCollector fullCollector = TopScoreDocCollector.create(totalHits, Short.MAX_VALUE);
        try {
            searcher.search(query, fullCollector);
        } catch (Exception e) { // this exception should never be hit, since search() will hit this before
            logSearchException(Level.WARNING, e);
            return;
        }
        collector = fullCollector;
        hits = fullCollector.topDocs().scoreDocs;

        // docs already holds the documents of the first hits, so continue
        // right after them to keep docs[i] aligned with hits[i]
        for (int i = docs.size(); i < hits.length; i++) {
            Document d = null;
            try {
                d = searcher.doc(hits[i].doc);
            } catch (Exception e) {
                logSearchException(Level.SEVERE, e);
            }
            // a null placeholder keeps the indexes aligned; results() skips it
            docs.add(d);
        }
        allCollected = true;
    }

    /**
     * Produces the {@link Hit} objects for one matched document and appends
     * them to {@code ret}. If neither the source context nor the history
     * context yields anything, a single placeholder hit ({@code "..."}) is
     * added.
     *
     * @param doc the matched document
     * @param alt alternation flag passed on to the created hits
     * @param ret list to append the hits to
     * @throws IOException if reading the file or the stored fields fails
     * @throws ClassNotFoundException if deserializing tags or scopes fails
     * @throws HistoryException if getting the history context fails
     */
    private void addHitsForDocument(Document doc, boolean alt, List<Hit> ret)
            throws IOException, ClassNotFoundException, HistoryException {
        boolean hasContext = false;
        String filename = doc.get(QueryBuilder.PATH);
        AbstractAnalyzer.Genre genre = AbstractAnalyzer.Genre.get(doc.get(QueryBuilder.T));

        Definitions tags = null;
        IndexableField tagsField = doc.getField(QueryBuilder.TAGS);
        if (tagsField != null) {
            tags = Definitions.deserialize(tagsField.binaryValue().bytes);
        }
        Scopes scopes = null;
        IndexableField scopesField = doc.getField(QueryBuilder.SCOPES);
        if (scopesField != null) {
            scopes = Scopes.deserialize(scopesField.binaryValue().bytes);
        }
        int nhits = docs.size();

        if (sourceContext != null) {
            sourceContext.toggleAlt();
            try {
                if (AbstractAnalyzer.Genre.PLAIN == genre && (source != null)) {
                    // SRCROOT is read with UTF-8 as a default.
                    // NOTE(review): the reader is handed over to getContext();
                    // presumably it closes it - confirm.
                    hasContext = sourceContext.getContext(
                            new InputStreamReader(new FileInputStream(
                                    source + filename), StandardCharsets.UTF_8),
                            null, null, null, filename, tags, nhits > 100,
                            getDefinition() != null, ret, scopes);
                } else if (AbstractAnalyzer.Genre.XREFABLE == genre && data != null && summarizer != null) {
                    hasContext = addXrefSummaryHits(filename, alt, ret);
                } else {
                    LOGGER.log(Level.WARNING, "Unknown genre: {0} for {1}", new Object[]{genre, filename});
                    hasContext |= sourceContext.getContext(null, null, null, null, filename, tags, false, false, ret, scopes);
                }
            } catch (FileNotFoundException exp) {
                LOGGER.log(Level.WARNING, "Couldn''t read summary from {0} ({1})", new Object[]{filename, exp.getMessage()});
                hasContext |= sourceContext.getContext(null, null, null, null, filename, tags, false, false, ret, scopes);
            }
        }
        if (historyContext != null) {
            hasContext |= historyContext.getContext(source + filename, filename, ret);
        }
        if (!hasContext) {
            ret.add(new Hit(filename, "...", "", false, alt));
        }
    }

    /**
     * Summarizes the beginning of the xref file of {@code filename} and adds
     * one hit per non-empty, non-ellipsis fragment.
     *
     * @param filename path of the file relative to the source root
     * @param alt alternation flag passed on to the created hits
     * @param ret list to append the hits to
     * @return {@code true} if the summary contained any non-empty fragment
     * @throws IOException if the xref file cannot be read
     */
    private boolean addXrefSummaryHits(String filename, boolean alt, List<Hit> ret) throws IOException {
        int l;
        /*
         * For backward compatibility, read the
         * OpenGrok-produced document using the system
         * default charset.
         */
        try (Reader r = RuntimeEnvironment.getInstance().isCompressXref()
                ? new HTMLStripCharFilter(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(
                        TandemPath.join(data + Prefix.XREF_P + filename, ".gz"))))))
                : new HTMLStripCharFilter(new BufferedReader(new FileReader(data + Prefix.XREF_P + filename)))) {
            l = r.read(content);
        }
        if (l <= 0) {
            // read() returns -1 at end of stream, i.e. there is nothing to summarize
            return false;
        }

        //TODO FIX below fragmenter according to either summarizer or context
        // (to get line numbers, might be hard, since xref writers will need to be fixed too,
        // they generate just one line of html code now :( )
        boolean hasContext = false;
        Summary sum = summarizer.getSummary(new String(content, 0, l));
        for (Fragment fragment : sum.getFragments()) {
            String match = fragment.toString();
            if (!match.isEmpty()) {
                if (!fragment.isEllipsis()) {
                    ret.add(new Hit(filename, match, "", true, alt));
                }
                hasContext = true;
            }
        }
        return hasContext;
    }

    /**
     * Frees the resources acquired by {@code search(...)}: releases the
     * searchers obtained for multi-project searches and closes the reader
     * and directory opened for a single index. Failures are logged and do
     * not stop the remaining cleanup. Calling this more than once is harmless.
     */
    public void destroy() {
        for (SuperIndexSearcher is : searcherList) {
            try {
                is.getSearcherManager().release(is);
            } catch (IOException ex) {
                LOGGER.log(Level.WARNING, "cannot release indexSearcher", ex);
            }
        }
        // forget the released searchers so they are never released twice
        searcherList.clear();

        for (Closeable resource : singleDatabaseResources) {
            try {
                resource.close();
            } catch (IOException ex) {
                LOGGER.log(Level.WARNING, "cannot close index resource", ex);
            }
        }
        singleDatabaseResources.clear();
    }

    /**
     * Getter for property definition.
     *
     * @return Value of property definition.
     */
    public String getDefinition() {
        return this.definition;
    }

    /**
     * Setter for property definition.
     *
     * @param definition New value of property definition.
     */
    public void setDefinition(String definition) {
        this.definition = definition;
    }

    /**
     * Getter for property file.
     *
     * @return Value of property file.
     */
    public String getFile() {
        return this.file;
    }

    /**
     * Setter for property file.
     *
     * @param file New value of property file.
     */
    public void setFile(String file) {
        this.file = file;
    }

    /**
     * Getter for property freetext.
     *
     * @return Value of property freetext.
     */
    public String getFreetext() {
        return this.freetext;
    }

    /**
     * Setter for property freetext.
     *
     * @param freetext New value of property freetext.
     */
    public void setFreetext(String freetext) {
        this.freetext = freetext;
    }

    /**
     * Getter for property history.
     *
     * @return Value of property history.
     */
    public String getHistory() {
        return this.history;
    }

    /**
     * Setter for property history.
     *
     * @param history New value of property history.
     */
    public void setHistory(String history) {
        this.history = history;
    }

    /**
     * Getter for property symbol.
     *
     * @return Value of property symbol.
     */
    public String getSymbol() {
        return this.symbol;
    }

    /**
     * Setter for property symbol.
     *
     * @param symbol New value of property symbol.
     */
    public void setSymbol(String symbol) {
        this.symbol = symbol;
    }

    /**
     * Getter for property type.
     *
     * @return Value of property type.
     */
    public String getType() {
        return this.type;
    }

    /**
     * Setter for property type.
     *
     * @param fileType New value of property type.
     */
    public void setType(String fileType) {
        this.type = fileType;
    }
}