indexer/search/SearchEngine.java

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

 /*
 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2018, Chris Fraire <cfraire@me.com>.
 */
package org.opengrok.indexer.search;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.opengrok.indexer.analysis.AbstractAnalyzer;
import org.opengrok.indexer.analysis.CompatibleAnalyser;
import org.opengrok.indexer.analysis.Definitions;
import org.opengrok.indexer.analysis.Scopes;
import org.opengrok.indexer.configuration.Project;
import org.opengrok.indexer.configuration.RuntimeEnvironment;
import org.opengrok.indexer.configuration.SuperIndexSearcher;
import org.opengrok.indexer.history.HistoryException;
import org.opengrok.indexer.index.IndexDatabase;
import org.opengrok.indexer.logger.LoggerFactory;
import org.opengrok.indexer.search.Summary.Fragment;
import org.opengrok.indexer.search.context.Context;
import org.opengrok.indexer.search.context.HistoryContext;
import org.opengrok.indexer.util.Statistics;
import org.opengrok.indexer.util.TandemPath;
import org.opengrok.indexer.web.Prefix;

/**
 * This is an encapsulation of the details on how to search in the index database.
 * This is used for searching via the REST API.
 *
 * @author Trond Norbye 2005
 * @author Lubos Kosco - upgrade to lucene 3.x, 4.x, 5.x
 */
public class SearchEngine {

    private static final Logger LOGGER = LoggerFactory.getLogger(SearchEngine.class);

    /**
     * Message text used when logging exceptions thrown when searching.
     */
    private static final String SEARCH_EXCEPTION_MSG = "Exception searching {0}";
    //NOTE below will need to be changed after new lucene upgrade, if they
    //increase the version - every change of below makes us incompatible with the
    //old index and we need to ask for reindex
    /**
     * Version of Lucene index common for the whole application.
     */
    public static final Version LUCENE_VERSION = Version.LATEST;
    public static final String LUCENE_VERSION_HELP = LUCENE_VERSION.major + "_" + LUCENE_VERSION.minor + "_" + LUCENE_VERSION.bugfix;
    /**
     * Holds value of property definition.
     */
    private String definition;
    /**
     * Holds value of property file.
     */
    private String file;
    /**
     * Holds value of property freetext.
     */
    private String freetext;
    /**
     * Holds value of property history.
     */
    private String history;
    /**
     * Holds value of property symbol.
     */
    private String symbol;
    /**
     * Holds value of property type.
     */
    private String type;
    /**
     * Holds value of property indexDatabase.
     */
    private Query query;
    private QueryBuilder queryBuilder;
    private final CompatibleAnalyser analyzer = new CompatibleAnalyser();
    private Context sourceContext;
    private HistoryContext historyContext;
    private Summarizer summarizer;
    // internal structure to hold the results from lucene
    private final List<Document> docs;
    private final char[] content = new char[1024 * 8];
    private String source;
    private String data;
    int hitsPerPage = RuntimeEnvironment.getInstance().getHitsPerPage();
    int cachePages = RuntimeEnvironment.getInstance().getCachePages();
    int totalHits = 0;
    private ScoreDoc[] hits;
    private TopScoreDocCollector collector;
    private IndexSearcher searcher;
    boolean allCollected;
    private final ArrayList<SuperIndexSearcher> searcherList = new ArrayList<>();

    /**
     * Creates a new instance of SearchEngine.
     */
    public SearchEngine() {
        docs = new ArrayList<>();
    }

    /**
     * Create a QueryBuilder using the fields that have been set on this
     * SearchEngine.
     *
     * @return a query builder
     */
    private QueryBuilder createQueryBuilder() {
        return new QueryBuilder()
                .setFreetext(freetext)
                .setDefs(definition)
                .setRefs(symbol)
                .setPath(file)
                .setHist(history)
                .setType(type);
    }

    public boolean isValidQuery() {
        boolean ret;
        try {
            query = createQueryBuilder().build();
            ret = (query != null);
        } catch (ParseException e) {
            ret = false;
        }

        return ret;
    }

    /**
     * Search one index. This is used if no projects are set up.
     * @param paging whether to use paging (if yes, first X pages will load
     * faster)
     * @param root which db to search
     * @throws IOException
     */
    private void searchSingleDatabase(File root, boolean paging) throws IOException {
        IndexReader ireader = DirectoryReader.open(FSDirectory.open(root.toPath()));
        searcher = new IndexSearcher(ireader);
        searchIndex(searcher, paging);
    }

    /**
     * Perform search on multiple indexes in parallel.
     * @param paging whether to use paging (if yes, first X pages will load
     * faster)
     * @param root list of projects to search
     * @throws IOException
     */
    private void searchMultiDatabase(List<Project> root, boolean paging) throws IOException {
        SortedSet<String> projects = new TreeSet<>();
        for (Project p : root) {
            projects.add(p.getName());
        }

        // We use MultiReader even for single project. This should
        // not matter given that MultiReader is just a cheap wrapper
        // around set of IndexReader objects.
        MultiReader searchables = RuntimeEnvironment.getInstance().
            getMultiReader(projects, searcherList);
        searcher = new IndexSearcher(searchables);
        searchIndex(searcher, paging);
    }

    private void searchIndex(IndexSearcher searcher, boolean paging) throws IOException {
        collector = TopScoreDocCollector.create(hitsPerPage * cachePages, Short.MAX_VALUE);
        Statistics stat = new Statistics();
        searcher.search(query, collector);
        totalHits = collector.getTotalHits();
        stat.report(LOGGER, Level.FINEST, "search via SearchEngine done",
                "search.latency", new String[]{"category", "engine",
                        "outcome", totalHits > 0 ? "success" : "empty"});
        if (!paging && totalHits > 0) {
            collector = TopScoreDocCollector.create(totalHits, Short.MAX_VALUE);
            searcher.search(query, collector);
        }
        hits = collector.topDocs().scoreDocs;
        for (ScoreDoc hit : hits) {
            int docId = hit.doc;
            Document d = searcher.doc(docId);
            docs.add(d);
        }
    }

    /**
     * Gets the instance from {@code search(...)} if it was called.
     * @return defined instance or {@code null}
     */
    public String getQuery() {
        return query != null ? query.toString() : null;
    }

    /**
     * Gets the instance from {@code search(...)} if it was called.
     * @return defined instance or {@code null}
     */
    public Query getQueryObject() {
        return query;
    }

    /**
     * Gets the builder from {@code search(...)} if it was called.
     * <p>
     * (Modifying the builder will have no effect on this
     * {@link SearchEngine}.)
     * @return defined instance or {@code null}
     */
    public QueryBuilder getQueryBuilder() {
        return queryBuilder;
    }

    /**
     * Gets the searcher from {@code search(...)} if it was called.
     * @return defined instance or {@code null}
     */
    public IndexSearcher getSearcher() {
        return searcher;
    }

    /**
     * Execute a search aware of current request, limited to specific project names.
     *
     * This filters out all projects which are not allowed for the current request.
     *
     * Before calling this function,
     * you must set the appropriate search criteria with the set-functions. Note
     * that this search will return the first cachePages of hitsPerPage, for
     * more you need to call more.
     *
     * Call to search() must be eventually followed by call to destroy()
     * so that IndexSearcher objects are properly freed.
     *
     * @param projects projects to search
     * @return The number of hits
     */
    public int search(List<Project> projects) {
        return search(projects, new File(RuntimeEnvironment.getInstance().getDataRootFile(), IndexDatabase.INDEX_DIR));
    }

    /**
     * Execute a search without authorization.
     *
     * Before calling this function, you must set the
     * appropriate search criteria with the set-functions. Note that this search
     * will return the first cachePages of hitsPerPage, for more you need to
     * call more.
     *
     * Call to search() must be eventually followed by call to destroy()
     * so that IndexSearcher objects are properly freed.
     *
     * @return The number of hits
     */
    public int search() {
        RuntimeEnvironment env = RuntimeEnvironment.getInstance();
        return search(
                env.hasProjects() ? env.getProjectList() : new ArrayList<>(),
                new File(env.getDataRootFile(), IndexDatabase.INDEX_DIR));
    }

    /**
     * Execute a search on projects or root file.
     *
     * If @param projects is an empty list it tries to search in @code
     * searchSingleDatabase with root set to @param root
     *
     * Call to search() must be eventually followed by call to destroy()
     * so that IndexSearcher objects are properly freed.
     *
     * @return The number of hits
     */
    private int search(List<Project> projects, File root) {
        source = RuntimeEnvironment.getInstance().getSourceRootPath();
        data = RuntimeEnvironment.getInstance().getDataRootPath();
        docs.clear();

        QueryBuilder newBuilder = createQueryBuilder();
        try {
            query = newBuilder.build();
            if (query != null) {

                if (projects.isEmpty()) {
                    // search the index database
                    //NOTE this assumes that src does not contain any project, just
                    // data files - so no authorization can be enforced
                    searchSingleDatabase(root, true);
                } else {
                    // search all projects
                    //TODO support paging per project (in search.java)
                    //TODO optimize if only one project by falling back to SingleDatabase ?
                    //NOTE projects are already filtered if we accessed through web page @see search(HttpServletRequest)
                    searchMultiDatabase(projects, false);
                }
            }
        } catch (Exception e) {
            LOGGER.log(
                    Level.WARNING, SEARCH_EXCEPTION_MSG, e);
        }

        if (!docs.isEmpty()) {
            sourceContext = null;
            summarizer = null;
            try {
                sourceContext = new Context(query, newBuilder);
                if (sourceContext.isEmpty()) {
                    sourceContext = null;
                }
                summarizer = new Summarizer(query, analyzer);
            } catch (Exception e) {
                LOGGER.log(Level.WARNING, "An error occurred while creating summary", e);
            }

            historyContext = null;
            try {
                historyContext = new HistoryContext(query);
                if (historyContext.isEmpty()) {
                    historyContext = null;
                }
            } catch (Exception e) {
                LOGGER.log(Level.WARNING, "An error occurred while getting history context", e);
            }
        }
        int count = hits == null ? 0 : hits.length;
        queryBuilder = newBuilder;
        return count;
    }

    /**
     * Gets the queried score docs from {@code search(...)} if it was called.
     * @return a defined instance if a query succeeded, or {@code null}
     */
    public ScoreDoc[] scoreDocs() {
        return hits;
    }

    /**
     * Gets the document of the specified {@code docId} from
     * {@code search(...)} if it was called.
     *
     * @param docId document ID
     * @return a defined instance if a query succeeded
     * @throws java.io.IOException if an error occurs obtaining the Lucene
     * document by ID
     */
    public Document doc(int docId) throws IOException {
        if (searcher == null) {
            throw new IllegalStateException("search(...) did not succeed");
        }
        return searcher.doc(docId);
    }

    /**
     * Get results , if no search was started before, no results are returned.
     * This method will requery if {@code end} is more than first query from search,
     * hence performance hit applies, if you want results in later pages than
     * number of cachePages. {@code end} has to be bigger than {@code start} !
     *
     * @param start start of the hit list
     * @param end end of the hit list
     * @param ret list of results from start to end or null/empty if no search
     * was started
     */
    public void results(int start, int end, List<Hit> ret) {

        //return if no start search() was done
        if (hits == null || (end < start)) {
            ret.clear();
            return;
        }

        ret.clear();

        // TODO check if below fits for if end=old hits.length, or it should include it
        if (end > hits.length && !allCollected) {
            //do the requery, we want more than 5 pages
            collector = TopScoreDocCollector.create(totalHits, Short.MAX_VALUE);
            try {
                searcher.search(query, collector);
            } catch (Exception e) { // this exception should never be hit, since search() will hit this before
                LOGGER.log(
                        Level.WARNING, SEARCH_EXCEPTION_MSG, e);
            }
            hits = collector.topDocs().scoreDocs;
            Document d = null;
            for (int i = start; i < hits.length; i++) {
                int docId = hits[i].doc;
                try {
                    d = searcher.doc(docId);
                } catch (Exception e) {
                    LOGGER.log(
                            Level.SEVERE, SEARCH_EXCEPTION_MSG, e);
                }
                docs.add(d);
            }
            allCollected = true;
        }

        //TODO generation of ret(results) could be cashed and consumers of engine would just print them in whatever
        // form they need, this way we could get rid of docs
        // the only problem is that count of docs is usually smaller than number of results
        for (int ii = start; ii < end; ++ii) {
            boolean alt = (ii % 2 == 0);
            boolean hasContext = false;
            try {
                Document doc = docs.get(ii);
                String filename = doc.get(QueryBuilder.PATH);

                AbstractAnalyzer.Genre genre = AbstractAnalyzer.Genre.get(doc.get(QueryBuilder.T));
                Definitions tags = null;
                IndexableField tagsField = doc.getField(QueryBuilder.TAGS);
                if (tagsField != null) {
                    tags = Definitions.deserialize(tagsField.binaryValue().bytes);
                }
                Scopes scopes = null;
                IndexableField scopesField = doc.getField(QueryBuilder.SCOPES);
                if (scopesField != null) {
                    scopes = Scopes.deserialize(scopesField.binaryValue().bytes);
                }
                int nhits = docs.size();

                if (sourceContext != null) {
                    sourceContext.toggleAlt();
                    try {
                        if (AbstractAnalyzer.Genre.PLAIN == genre && (source != null)) {
                            // SRCROOT is read with UTF-8 as a default.
                            hasContext = sourceContext.getContext(
                                new InputStreamReader(new FileInputStream(
                                source + filename), StandardCharsets.UTF_8),
                                null, null, null, filename, tags, nhits > 100,
                                getDefinition() != null, ret, scopes);
                        } else if (AbstractAnalyzer.Genre.XREFABLE == genre && data != null && summarizer != null) {
                            int l;
                            /**
                             * For backward compatibility, read the
                             * OpenGrok-produced document using the system
                             * default charset.
                             */
                            try (Reader r = RuntimeEnvironment.getInstance().isCompressXref()
                                    ? new HTMLStripCharFilter(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(
                                            TandemPath.join(data + Prefix.XREF_P + filename, ".gz"))))))
                                    : new HTMLStripCharFilter(new BufferedReader(new FileReader(data + Prefix.XREF_P + filename)))) {
                                l = r.read(content);
                            }
                            //TODO FIX below fragmenter according to either summarizer or context
                            // (to get line numbers, might be hard, since xref writers will need to be fixed too,
                            // they generate just one line of html code now :( )
                            Summary sum = summarizer.getSummary(new String(content, 0, l));
                            Fragment[] fragments = sum.getFragments();
                            for (Fragment fragment : fragments) {
                                String match = fragment.toString();
                                if (match.length() > 0) {
                                    if (!fragment.isEllipsis()) {
                                        Hit hit = new Hit(filename, fragment.toString(), "", true, alt);
                                        ret.add(hit);
                                    }
                                    hasContext = true;
                                }
                            }
                        } else {
                            LOGGER.log(Level.WARNING, "Unknown genre: {0} for {1}", new Object[]{genre, filename});
                            hasContext |= sourceContext.getContext(null, null, null, null, filename, tags, false, false, ret, scopes);
                        }
                    } catch (FileNotFoundException exp) {
                        LOGGER.log(Level.WARNING, "Couldn''t read summary from {0} ({1})", new Object[]{filename, exp.getMessage()});
                        hasContext |= sourceContext.getContext(null, null, null, null, filename, tags, false, false, ret, scopes);
                    }
                }
                if (historyContext != null) {
                    hasContext |= historyContext.getContext(source + filename, filename, ret);
                }
                if (!hasContext) {
                    ret.add(new Hit(filename, "...", "", false, alt));
                }
            } catch (IOException | ClassNotFoundException | HistoryException e) {
                LOGGER.log(
                        Level.WARNING, SEARCH_EXCEPTION_MSG, e);
            }
        }
    }

    public void destroy() {
        for (SuperIndexSearcher is : searcherList) {
            try {
                is.getSearcherManager().release(is);
            } catch (IOException ex) {
                LOGGER.log(Level.WARNING, "cannot release indexSearcher", ex);
            }
        }
    }

    /**
     * Getter for property definition.
     *
     * @return Value of property definition.
     */
    public String getDefinition() {
        return this.definition;
    }

    /**
     * Setter for property definition.
     *
     * @param definition New value of property definition.
     */
    public void setDefinition(String definition) {
        this.definition = definition;
    }

    /**
     * Getter for property file.
     *
     * @return Value of property file.
     */
    public String getFile() {
        return this.file;
    }

    /**
     * Setter for property file.
     *
     * @param file New value of property file.
     */
    public void setFile(String file) {
        this.file = file;
    }

    /**
     * Getter for property freetext.
     *
     * @return Value of property freetext.
     */
    public String getFreetext() {
        return this.freetext;
    }

    /**
     * Setter for property freetext.
     *
     * @param freetext New value of property freetext.
     */
    public void setFreetext(String freetext) {
        this.freetext = freetext;
    }

    /**
     * Getter for property history.
     *
     * @return Value of property history.
     */
    public String getHistory() {
        return this.history;
    }

    /**
     * Setter for property history.
     *
     * @param history New value of property history.
     */
    public void setHistory(String history) {
        this.history = history;
    }

    /**
     * Getter for property symbol.
     *
     * @return Value of property symbol.
     */
    public String getSymbol() {
        return this.symbol;
    }

    /**
     * Setter for property symbol.
     *
     * @param symbol New value of property symbol.
     */
    public void setSymbol(String symbol) {
        this.symbol = symbol;
    }

    /**
     * Getter for property type.
     *
     * @return Value of property type.
     */
    public String getType() {
        return this.type;
    }

    /**
     * Setter for property type.
     *
     * @param fileType New value of property type.
     */
    public void setType(String fileType) {
        this.type = fileType;
    }
}