xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/search/SearchEngine.java (revision 70091cc094f5cc7eac3b7666324e952b7be992ab)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * See LICENSE.txt included in this distribution for the specific
9  * language governing permissions and limitations under the License.
10  *
11  * When distributing Covered Code, include this CDDL HEADER in each
12  * file and include the License file at LICENSE.txt.
13  * If applicable, add the following below this CDDL HEADER, with the
14  * fields enclosed by brackets "[]" replaced with your own identifying
15  * information: Portions Copyright [yyyy] [name of copyright owner]
16  *
17  * CDDL HEADER END
18  */
19 
20  /*
21  * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
22  * Portions Copyright (c) 2018, Chris Fraire <cfraire@me.com>.
23  */
24 package org.opengrok.indexer.search;
25 
26 import java.io.BufferedReader;
27 import java.io.File;
28 import java.io.FileInputStream;
29 import java.io.FileNotFoundException;
30 import java.io.FileReader;
31 import java.io.IOException;
32 import java.io.InputStreamReader;
33 import java.io.Reader;
34 import java.nio.charset.StandardCharsets;
35 import java.util.ArrayList;
36 import java.util.List;
37 import java.util.SortedSet;
38 import java.util.TreeSet;
39 import java.util.logging.Level;
40 import java.util.logging.Logger;
41 import java.util.zip.GZIPInputStream;
42 import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
43 import org.apache.lucene.document.Document;
44 import org.apache.lucene.index.DirectoryReader;
45 import org.apache.lucene.index.IndexReader;
46 import org.apache.lucene.index.IndexableField;
47 import org.apache.lucene.index.MultiReader;
48 import org.apache.lucene.queryparser.classic.ParseException;
49 import org.apache.lucene.search.IndexSearcher;
50 import org.apache.lucene.search.Query;
51 import org.apache.lucene.search.ScoreDoc;
52 import org.apache.lucene.search.TopScoreDocCollector;
53 import org.apache.lucene.store.FSDirectory;
54 import org.apache.lucene.util.Version;
55 import org.opengrok.indexer.analysis.AbstractAnalyzer;
56 import org.opengrok.indexer.analysis.CompatibleAnalyser;
57 import org.opengrok.indexer.analysis.Definitions;
58 import org.opengrok.indexer.analysis.Scopes;
59 import org.opengrok.indexer.configuration.Project;
60 import org.opengrok.indexer.configuration.RuntimeEnvironment;
61 import org.opengrok.indexer.configuration.SuperIndexSearcher;
62 import org.opengrok.indexer.history.HistoryException;
63 import org.opengrok.indexer.index.IndexDatabase;
64 import org.opengrok.indexer.logger.LoggerFactory;
65 import org.opengrok.indexer.search.Summary.Fragment;
66 import org.opengrok.indexer.search.context.Context;
67 import org.opengrok.indexer.search.context.HistoryContext;
68 import org.opengrok.indexer.util.Statistics;
69 import org.opengrok.indexer.util.TandemPath;
70 import org.opengrok.indexer.web.Prefix;
71 
72 /**
73  * This is an encapsulation of the details on how to search in the index database.
74  * This is used for searching via the REST API.
75  *
76  * @author Trond Norbye 2005
77  * @author Lubos Kosco - upgrade to lucene 3.x, 4.x, 5.x
78  */
79 public class SearchEngine {
80 
81     private static final Logger LOGGER = LoggerFactory.getLogger(SearchEngine.class);
82 
83     /**
84      * Message text used when logging exceptions thrown when searching.
85      */
86     private static final String SEARCH_EXCEPTION_MSG = "Exception searching {0}";
87     //NOTE below will need to be changed after new lucene upgrade, if they
88     //increase the version - every change of below makes us incompatible with the
89     //old index and we need to ask for reindex
90     /**
91      * Version of Lucene index common for the whole application.
92      */
93     public static final Version LUCENE_VERSION = Version.LATEST;
94     public static final String LUCENE_VERSION_HELP = LUCENE_VERSION.major + "_" + LUCENE_VERSION.minor + "_" + LUCENE_VERSION.bugfix;
95     /**
96      * Holds value of property definition.
97      */
98     private String definition;
99     /**
100      * Holds value of property file.
101      */
102     private String file;
103     /**
104      * Holds value of property freetext.
105      */
106     private String freetext;
107     /**
108      * Holds value of property history.
109      */
110     private String history;
111     /**
112      * Holds value of property symbol.
113      */
114     private String symbol;
115     /**
116      * Holds value of property type.
117      */
118     private String type;
119     /**
120      * Holds value of property indexDatabase.
121      */
122     private Query query;
123     private QueryBuilder queryBuilder;
124     private final CompatibleAnalyser analyzer = new CompatibleAnalyser();
125     private Context sourceContext;
126     private HistoryContext historyContext;
127     private Summarizer summarizer;
128     // internal structure to hold the results from lucene
129     private final List<Document> docs;
130     private final char[] content = new char[1024 * 8];
131     private String source;
132     private String data;
133     int hitsPerPage = RuntimeEnvironment.getInstance().getHitsPerPage();
134     int cachePages = RuntimeEnvironment.getInstance().getCachePages();
135     int totalHits = 0;
136     private ScoreDoc[] hits;
137     private TopScoreDocCollector collector;
138     private IndexSearcher searcher;
139     boolean allCollected;
140     private final ArrayList<SuperIndexSearcher> searcherList = new ArrayList<>();
141 
142     /**
143      * Creates a new instance of SearchEngine.
144      */
SearchEngine()145     public SearchEngine() {
146         docs = new ArrayList<>();
147     }
148 
149     /**
150      * Create a QueryBuilder using the fields that have been set on this
151      * SearchEngine.
152      *
153      * @return a query builder
154      */
createQueryBuilder()155     private QueryBuilder createQueryBuilder() {
156         return new QueryBuilder()
157                 .setFreetext(freetext)
158                 .setDefs(definition)
159                 .setRefs(symbol)
160                 .setPath(file)
161                 .setHist(history)
162                 .setType(type);
163     }
164 
isValidQuery()165     public boolean isValidQuery() {
166         boolean ret;
167         try {
168             query = createQueryBuilder().build();
169             ret = (query != null);
170         } catch (ParseException e) {
171             ret = false;
172         }
173 
174         return ret;
175     }
176 
177     /**
178      * Search one index. This is used if no projects are set up.
179      * @param paging whether to use paging (if yes, first X pages will load
180      * faster)
181      * @param root which db to search
182      * @throws IOException
183      */
searchSingleDatabase(File root, boolean paging)184     private void searchSingleDatabase(File root, boolean paging) throws IOException {
185         IndexReader ireader = DirectoryReader.open(FSDirectory.open(root.toPath()));
186         searcher = new IndexSearcher(ireader);
187         searchIndex(searcher, paging);
188     }
189 
190     /**
191      * Perform search on multiple indexes in parallel.
192      * @param paging whether to use paging (if yes, first X pages will load
193      * faster)
194      * @param root list of projects to search
195      * @throws IOException
196      */
searchMultiDatabase(List<Project> root, boolean paging)197     private void searchMultiDatabase(List<Project> root, boolean paging) throws IOException {
198         SortedSet<String> projects = new TreeSet<>();
199         for (Project p : root) {
200             projects.add(p.getName());
201         }
202 
203         // We use MultiReader even for single project. This should
204         // not matter given that MultiReader is just a cheap wrapper
205         // around set of IndexReader objects.
206         MultiReader searchables = RuntimeEnvironment.getInstance().
207             getMultiReader(projects, searcherList);
208         searcher = new IndexSearcher(searchables);
209         searchIndex(searcher, paging);
210     }
211 
searchIndex(IndexSearcher searcher, boolean paging)212     private void searchIndex(IndexSearcher searcher, boolean paging) throws IOException {
213         collector = TopScoreDocCollector.create(hitsPerPage * cachePages, Short.MAX_VALUE);
214         Statistics stat = new Statistics();
215         searcher.search(query, collector);
216         totalHits = collector.getTotalHits();
217         stat.report(LOGGER, Level.FINEST, "search via SearchEngine done",
218                 "search.latency", new String[]{"category", "engine",
219                         "outcome", totalHits > 0 ? "success" : "empty"});
220         if (!paging && totalHits > 0) {
221             collector = TopScoreDocCollector.create(totalHits, Short.MAX_VALUE);
222             searcher.search(query, collector);
223         }
224         hits = collector.topDocs().scoreDocs;
225         for (ScoreDoc hit : hits) {
226             int docId = hit.doc;
227             Document d = searcher.doc(docId);
228             docs.add(d);
229         }
230     }
231 
232     /**
233      * Gets the instance from {@code search(...)} if it was called.
234      * @return defined instance or {@code null}
235      */
getQuery()236     public String getQuery() {
237         return query != null ? query.toString() : null;
238     }
239 
240     /**
241      * Gets the instance from {@code search(...)} if it was called.
242      * @return defined instance or {@code null}
243      */
getQueryObject()244     public Query getQueryObject() {
245         return query;
246     }
247 
248     /**
249      * Gets the builder from {@code search(...)} if it was called.
250      * <p>
251      * (Modifying the builder will have no effect on this
252      * {@link SearchEngine}.)
253      * @return defined instance or {@code null}
254      */
getQueryBuilder()255     public QueryBuilder getQueryBuilder() {
256         return queryBuilder;
257     }
258 
259     /**
260      * Gets the searcher from {@code search(...)} if it was called.
261      * @return defined instance or {@code null}
262      */
getSearcher()263     public IndexSearcher getSearcher() {
264         return searcher;
265     }
266 
267     /**
268      * Execute a search aware of current request, limited to specific project names.
269      *
270      * This filters out all projects which are not allowed for the current request.
271      *
272      * Before calling this function,
273      * you must set the appropriate search criteria with the set-functions. Note
274      * that this search will return the first cachePages of hitsPerPage, for
275      * more you need to call more.
276      *
277      * Call to search() must be eventually followed by call to destroy()
278      * so that IndexSearcher objects are properly freed.
279      *
280      * @param projects projects to search
281      * @return The number of hits
282      */
search(List<Project> projects)283     public int search(List<Project> projects) {
284         return search(projects, new File(RuntimeEnvironment.getInstance().getDataRootFile(), IndexDatabase.INDEX_DIR));
285     }
286 
287     /**
288      * Execute a search without authorization.
289      *
290      * Before calling this function, you must set the
291      * appropriate search criteria with the set-functions. Note that this search
292      * will return the first cachePages of hitsPerPage, for more you need to
293      * call more.
294      *
295      * Call to search() must be eventually followed by call to destroy()
296      * so that IndexSearcher objects are properly freed.
297      *
298      * @return The number of hits
299      */
search()300     public int search() {
301         RuntimeEnvironment env = RuntimeEnvironment.getInstance();
302         return search(
303                 env.hasProjects() ? env.getProjectList() : new ArrayList<>(),
304                 new File(env.getDataRootFile(), IndexDatabase.INDEX_DIR));
305     }
306 
307     /**
308      * Execute a search on projects or root file.
309      *
310      * If @param projects is an empty list it tries to search in @code
311      * searchSingleDatabase with root set to @param root
312      *
313      * Call to search() must be eventually followed by call to destroy()
314      * so that IndexSearcher objects are properly freed.
315      *
316      * @return The number of hits
317      */
search(List<Project> projects, File root)318     private int search(List<Project> projects, File root) {
319         source = RuntimeEnvironment.getInstance().getSourceRootPath();
320         data = RuntimeEnvironment.getInstance().getDataRootPath();
321         docs.clear();
322 
323         QueryBuilder newBuilder = createQueryBuilder();
324         try {
325             query = newBuilder.build();
326             if (query != null) {
327 
328                 if (projects.isEmpty()) {
329                     // search the index database
330                     //NOTE this assumes that src does not contain any project, just
331                     // data files - so no authorization can be enforced
332                     searchSingleDatabase(root, true);
333                 } else {
334                     // search all projects
335                     //TODO support paging per project (in search.java)
336                     //TODO optimize if only one project by falling back to SingleDatabase ?
337                     //NOTE projects are already filtered if we accessed through web page @see search(HttpServletRequest)
338                     searchMultiDatabase(projects, false);
339                 }
340             }
341         } catch (Exception e) {
342             LOGGER.log(
343                     Level.WARNING, SEARCH_EXCEPTION_MSG, e);
344         }
345 
346         if (!docs.isEmpty()) {
347             sourceContext = null;
348             summarizer = null;
349             try {
350                 sourceContext = new Context(query, newBuilder);
351                 if (sourceContext.isEmpty()) {
352                     sourceContext = null;
353                 }
354                 summarizer = new Summarizer(query, analyzer);
355             } catch (Exception e) {
356                 LOGGER.log(Level.WARNING, "An error occurred while creating summary", e);
357             }
358 
359             historyContext = null;
360             try {
361                 historyContext = new HistoryContext(query);
362                 if (historyContext.isEmpty()) {
363                     historyContext = null;
364                 }
365             } catch (Exception e) {
366                 LOGGER.log(Level.WARNING, "An error occurred while getting history context", e);
367             }
368         }
369         int count = hits == null ? 0 : hits.length;
370         queryBuilder = newBuilder;
371         return count;
372     }
373 
374     /**
375      * Gets the queried score docs from {@code search(...)} if it was called.
376      * @return a defined instance if a query succeeded, or {@code null}
377      */
scoreDocs()378     public ScoreDoc[] scoreDocs() {
379         return hits;
380     }
381 
382     /**
383      * Gets the document of the specified {@code docId} from
384      * {@code search(...)} if it was called.
385      *
386      * @param docId document ID
387      * @return a defined instance if a query succeeded
388      * @throws java.io.IOException if an error occurs obtaining the Lucene
389      * document by ID
390      */
doc(int docId)391     public Document doc(int docId) throws IOException {
392         if (searcher == null) {
393             throw new IllegalStateException("search(...) did not succeed");
394         }
395         return searcher.doc(docId);
396     }
397 
398     /**
399      * Get results , if no search was started before, no results are returned.
400      * This method will requery if {@code end} is more than first query from search,
401      * hence performance hit applies, if you want results in later pages than
402      * number of cachePages. {@code end} has to be bigger than {@code start} !
403      *
404      * @param start start of the hit list
405      * @param end end of the hit list
406      * @param ret list of results from start to end or null/empty if no search
407      * was started
408      */
results(int start, int end, List<Hit> ret)409     public void results(int start, int end, List<Hit> ret) {
410 
411         //return if no start search() was done
412         if (hits == null || (end < start)) {
413             ret.clear();
414             return;
415         }
416 
417         ret.clear();
418 
419         // TODO check if below fits for if end=old hits.length, or it should include it
420         if (end > hits.length && !allCollected) {
421             //do the requery, we want more than 5 pages
422             collector = TopScoreDocCollector.create(totalHits, Short.MAX_VALUE);
423             try {
424                 searcher.search(query, collector);
425             } catch (Exception e) { // this exception should never be hit, since search() will hit this before
426                 LOGGER.log(
427                         Level.WARNING, SEARCH_EXCEPTION_MSG, e);
428             }
429             hits = collector.topDocs().scoreDocs;
430             Document d = null;
431             for (int i = start; i < hits.length; i++) {
432                 int docId = hits[i].doc;
433                 try {
434                     d = searcher.doc(docId);
435                 } catch (Exception e) {
436                     LOGGER.log(
437                             Level.SEVERE, SEARCH_EXCEPTION_MSG, e);
438                 }
439                 docs.add(d);
440             }
441             allCollected = true;
442         }
443 
444         //TODO generation of ret(results) could be cashed and consumers of engine would just print them in whatever
445         // form they need, this way we could get rid of docs
446         // the only problem is that count of docs is usually smaller than number of results
447         for (int ii = start; ii < end; ++ii) {
448             boolean alt = (ii % 2 == 0);
449             boolean hasContext = false;
450             try {
451                 Document doc = docs.get(ii);
452                 String filename = doc.get(QueryBuilder.PATH);
453 
454                 AbstractAnalyzer.Genre genre = AbstractAnalyzer.Genre.get(doc.get(QueryBuilder.T));
455                 Definitions tags = null;
456                 IndexableField tagsField = doc.getField(QueryBuilder.TAGS);
457                 if (tagsField != null) {
458                     tags = Definitions.deserialize(tagsField.binaryValue().bytes);
459                 }
460                 Scopes scopes = null;
461                 IndexableField scopesField = doc.getField(QueryBuilder.SCOPES);
462                 if (scopesField != null) {
463                     scopes = Scopes.deserialize(scopesField.binaryValue().bytes);
464                 }
465                 int nhits = docs.size();
466 
467                 if (sourceContext != null) {
468                     sourceContext.toggleAlt();
469                     try {
470                         if (AbstractAnalyzer.Genre.PLAIN == genre && (source != null)) {
471                             // SRCROOT is read with UTF-8 as a default.
472                             hasContext = sourceContext.getContext(
473                                 new InputStreamReader(new FileInputStream(
474                                 source + filename), StandardCharsets.UTF_8),
475                                 null, null, null, filename, tags, nhits > 100,
476                                 getDefinition() != null, ret, scopes);
477                         } else if (AbstractAnalyzer.Genre.XREFABLE == genre && data != null && summarizer != null) {
478                             int l;
479                             /**
480                              * For backward compatibility, read the
481                              * OpenGrok-produced document using the system
482                              * default charset.
483                              */
484                             try (Reader r = RuntimeEnvironment.getInstance().isCompressXref()
485                                     ? new HTMLStripCharFilter(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(
486                                             TandemPath.join(data + Prefix.XREF_P + filename, ".gz"))))))
487                                     : new HTMLStripCharFilter(new BufferedReader(new FileReader(data + Prefix.XREF_P + filename)))) {
488                                 l = r.read(content);
489                             }
490                             //TODO FIX below fragmenter according to either summarizer or context
491                             // (to get line numbers, might be hard, since xref writers will need to be fixed too,
492                             // they generate just one line of html code now :( )
493                             Summary sum = summarizer.getSummary(new String(content, 0, l));
494                             Fragment[] fragments = sum.getFragments();
495                             for (Fragment fragment : fragments) {
496                                 String match = fragment.toString();
497                                 if (match.length() > 0) {
498                                     if (!fragment.isEllipsis()) {
499                                         Hit hit = new Hit(filename, fragment.toString(), "", true, alt);
500                                         ret.add(hit);
501                                     }
502                                     hasContext = true;
503                                 }
504                             }
505                         } else {
506                             LOGGER.log(Level.WARNING, "Unknown genre: {0} for {1}", new Object[]{genre, filename});
507                             hasContext |= sourceContext.getContext(null, null, null, null, filename, tags, false, false, ret, scopes);
508                         }
509                     } catch (FileNotFoundException exp) {
510                         LOGGER.log(Level.WARNING, "Couldn''t read summary from {0} ({1})", new Object[]{filename, exp.getMessage()});
511                         hasContext |= sourceContext.getContext(null, null, null, null, filename, tags, false, false, ret, scopes);
512                     }
513                 }
514                 if (historyContext != null) {
515                     hasContext |= historyContext.getContext(source + filename, filename, ret);
516                 }
517                 if (!hasContext) {
518                     ret.add(new Hit(filename, "...", "", false, alt));
519                 }
520             } catch (IOException | ClassNotFoundException | HistoryException e) {
521                 LOGGER.log(
522                         Level.WARNING, SEARCH_EXCEPTION_MSG, e);
523             }
524         }
525     }
526 
destroy()527     public void destroy() {
528         for (SuperIndexSearcher is : searcherList) {
529             try {
530                 is.getSearcherManager().release(is);
531             } catch (IOException ex) {
532                 LOGGER.log(Level.WARNING, "cannot release indexSearcher", ex);
533             }
534         }
535     }
536 
537     /**
538      * Getter for property definition.
539      *
540      * @return Value of property definition.
541      */
getDefinition()542     public String getDefinition() {
543         return this.definition;
544     }
545 
546     /**
547      * Setter for property definition.
548      *
549      * @param definition New value of property definition.
550      */
setDefinition(String definition)551     public void setDefinition(String definition) {
552         this.definition = definition;
553     }
554 
555     /**
556      * Getter for property file.
557      *
558      * @return Value of property file.
559      */
getFile()560     public String getFile() {
561         return this.file;
562     }
563 
564     /**
565      * Setter for property file.
566      *
567      * @param file New value of property file.
568      */
setFile(String file)569     public void setFile(String file) {
570         this.file = file;
571     }
572 
573     /**
574      * Getter for property freetext.
575      *
576      * @return Value of property freetext.
577      */
getFreetext()578     public String getFreetext() {
579         return this.freetext;
580     }
581 
582     /**
583      * Setter for property freetext.
584      *
585      * @param freetext New value of property freetext.
586      */
setFreetext(String freetext)587     public void setFreetext(String freetext) {
588         this.freetext = freetext;
589     }
590 
591     /**
592      * Getter for property history.
593      *
594      * @return Value of property history.
595      */
getHistory()596     public String getHistory() {
597         return this.history;
598     }
599 
600     /**
601      * Setter for property history.
602      *
603      * @param history New value of property history.
604      */
setHistory(String history)605     public void setHistory(String history) {
606         this.history = history;
607     }
608 
609     /**
610      * Getter for property symbol.
611      *
612      * @return Value of property symbol.
613      */
getSymbol()614     public String getSymbol() {
615         return this.symbol;
616     }
617 
618     /**
619      * Setter for property symbol.
620      *
621      * @param symbol New value of property symbol.
622      */
setSymbol(String symbol)623     public void setSymbol(String symbol) {
624         this.symbol = symbol;
625     }
626 
627     /**
628      * Getter for property type.
629      *
630      * @return Value of property type.
631      */
getType()632     public String getType() {
633         return this.type;
634     }
635 
636     /**
637      * Setter for property type.
638      *
639      * @param fileType New value of property type.
640      */
setType(String fileType)641     public void setType(String fileType) {
642         this.type = fileType;
643     }
644 }
645