/*
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// modified by Lubos Kosco 2010 to upgrade lucene to 3.0.0
// modified by Lubos Kosco 2014 to upgrade lucene to 4.9.0
// TODO : rewrite this to use Highlighter from lucene contrib ...
package org.opengrok.indexer.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;

/**
 * Implements hit summarization.
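 * <p>
 * A minimal usage sketch (hypothetical example; {@code someAnalyzer} stands in
 * for whatever analyzer the caller uses to tokenize the document text):
 * <pre>{@code
 *     Query query = new TermQuery(new Term(QueryBuilder.FULL, "main"));
 *     Summarizer summarizer = new Summarizer(query, someAnalyzer);
 *     Summary summary = summarizer.getSummary(documentText);
 * }</pre>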
 */
public class Summarizer {

    /**
     * The number of context terms to display preceding and following matches.
     */
    private static final int SUM_CONTEXT = 10;

    /**
     * The total number of terms to display in a summary.
     */
    private static final int SUM_LENGTH = 20;

    /**
     * Converts text to tokens.
     */
    private final Analyzer analyzer;

    /**
     * The query terms to highlight in generated summaries.
     */
    private final Set<String> highlight = new HashSet<>();

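    /**
     * Creates a summarizer that highlights the terms of the given query in
     * summaries of text tokenized by the given analyzer.
     *
     * @param query the query whose terms should be highlighted
     * @param a the analyzer used to convert text to tokens
     */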
    public Summarizer(Query query, Analyzer a) {
        analyzer = a;
        getTerms(query);
    }

    /**
     * Class Excerpt represents a single passage found in the document, with
     * some appropriate regions highlighted.
     */
    static class Excerpt {

        List<Summary.Fragment> passages = new ArrayList<>();
        Set<String> tokenSet = new TreeSet<>();
        int numTerms = 0;

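        /**
         * Record a query term that occurs in this excerpt; the number of
         * unique terms recorded here is the primary ranking criterion when
         * excerpts are compared.
         */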
        public void addToken(String token) {
            tokenSet.add(token);
        }

        /**
         * Return how many unique tokens we have.
         */
        public int numUniqueTokens() {
            return tokenSet.size();
        }

        /**
         * Return how many fragments we have.
         */
        public int numFragments() {
            return passages.size();
        }

        public void setNumTerms(int numTerms) {
            this.numTerms = numTerms;
        }

        public int getNumTerms() {
            return numTerms;
        }

        /**
         * Add a fragment to the list.
         */
        public void add(Summary.Fragment fragment) {
            passages.add(fragment);
        }

        /**
         * Return the list of all fragments.
         */
        public List<Summary.Fragment> elements() {
            return passages;
        }
    }

    /**
     * Returns a summary for the given pre-tokenized text.
     *
     * @param text input text
     * @return summary of hits
     * @throws java.io.IOException I/O exception
     */
    public Summary getSummary(String text) throws IOException {
        if (text == null) {
            return null;
        }
        // Simplistic implementation.  Finds the first fragments in the document
        // containing any query terms.
        //
        // @TODO: check that phrases in the query are matched in the fragment

        SToken[] tokens = getTokens(text);      // parse text to token array

        if (tokens.length == 0) {
            return new Summary();
        }

        //
        // Create a SortedSet that ranks excerpts according to
        // how many query terms are present.  An excerpt is
        // a List full of Fragments and Highlights
        //
        SortedSet<Excerpt> excerptSet = new TreeSet<>((excerpt1, excerpt2) -> {
            if (excerpt1 == null) {
                return excerpt2 == null ? 0 : -1;
            } else if (excerpt2 == null) {
                return 1;
            } else {
                int numToks1 = excerpt1.numUniqueTokens();
                int numToks2 = excerpt2.numUniqueTokens();

                if (numToks1 < numToks2) {
                    return -1;
                } else if (numToks1 == numToks2) {
                    return excerpt1.numFragments() - excerpt2.numFragments();
                } else {
                    return 1;
                }
            }
        });

        //
        // Iterate through all terms in the document
        //
        int lastExcerptPos = 0;
        for (int i = 0; i < tokens.length; i++) {
            //
            // If we find a term that's in the query...
            //
            if (highlight.contains(tokens[i].toString())) {
                //
                // Start searching at a point SUM_CONTEXT terms back,
                // and move SUM_CONTEXT terms into the future.
                //
                int startToken = (i > SUM_CONTEXT) ? i - SUM_CONTEXT : 0;
                int endToken = Math.min(i + SUM_CONTEXT, tokens.length);
                int offset = tokens[startToken].startOffset();
                int j = startToken;

                //
                // Iterate from the start point to the finish, adding
                // terms all the way.  The end of the passage is always
                // SUM_CONTEXT beyond the last query-term.
                //
                Excerpt excerpt = new Excerpt();
                if (i != 0) {
                    excerpt.add(new Summary.Ellipsis());
                }

                //
                // Iterate through as long as we're before the end of
                // the document and we haven't hit the max-number-of-items
                // -in-a-summary.
                //
                while ((j < endToken) && (j - startToken < SUM_LENGTH)) {
                    //
                    // Now grab the hit-element, if present
                    //
                    SToken t = tokens[j];
                    if (highlight.contains(t.toString())) {
                        excerpt.addToken(t.toString());
                        excerpt.add(new Summary.Fragment(text.substring(offset, t.startOffset())));
                        excerpt.add(new Summary.Highlight(text.substring(t.startOffset(), t.endOffset())));
                        offset = t.endOffset();
                        endToken = Math.min(j + SUM_CONTEXT, tokens.length);
                    }

                    j++;
                }

                lastExcerptPos = endToken;

                //
                // We found the series of search-term hits and added
                // them (with intervening text) to the excerpt.  Now
                // we need to add the trailing edge of text.
                //
                // So if (j < tokens.length) then there is still trailing
                // text to add.  (We haven't hit the end of the source doc.)
                // Add the words since the last hit-term insert.
                //
                if (j < tokens.length) {
                    excerpt.add(new Summary.Fragment(text.substring(offset, tokens[j].endOffset())));
                }

                //
                // Remember how many terms are in this excerpt
                //
                excerpt.setNumTerms(j - startToken);

                //
                // Store the excerpt for later sorting
                //
                excerptSet.add(excerpt);

                //
                // Resume the scan SUM_CONTEXT places past the end of this
                // excerpt; the next search for query terms effectively
                // begins at j + SUM_CONTEXT.
                //
                i = j + SUM_CONTEXT;
            }
        }

        //
        // If the target text doesn't appear, then we just
        // excerpt the first SUM_LENGTH words from the document.
        //
        if (excerptSet.isEmpty()) {
            Excerpt excerpt = new Excerpt();
            int excerptLen = Math.min(SUM_LENGTH, tokens.length);
            lastExcerptPos = excerptLen;

            excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen - 1].startOffset())));
            excerpt.setNumTerms(excerptLen);
            excerptSet.add(excerpt);
        }

        //
        // Now choose the best items from the excerpt set.
        // Stop when our Summary grows too large.
        //
        double tokenCount = 0;
        Summary s = new Summary();
        while (tokenCount <= SUM_LENGTH && excerptSet.size() > 0) {
            Excerpt excerpt = excerptSet.last();
            excerptSet.remove(excerpt);

            double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
            for (Summary.Fragment f : excerpt.elements()) {
                // Don't add fragments if it takes us over the max-limit
                if (tokenCount + tokenFraction <= SUM_LENGTH) {
                    s.add(f);
                }
                tokenCount += tokenFraction;
            }
        }

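        //
        // If we produced any output and the document continues past the last
        // excerpt, append a trailing ellipsis to signal the truncation.
        //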
        if (tokenCount > 0 && lastExcerptPos < tokens.length) {
            s.add(new Summary.Ellipsis());
        }
        return s;
    }

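    /**
     * A single token of the summarized text: the term characters packed
     * together with their start and end character offsets in the original
     * string.
     */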
    private static class SToken extends PackedTokenAttributeImpl {

        SToken(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) {
            copyBuffer(startTermBuffer, termBufferOffset, termBufferLength);
            setOffset(start, end);
        }
    }

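    /**
     * Tokenizes the given text with the configured analyzer, recording each
     * term together with its character offsets.
     *
     * @param text the text to tokenize
     * @return the tokens in document order
     * @throws IOException if the token stream cannot be read
     */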
    private SToken[] getTokens(String text) throws IOException {
        // FIXME: integrate the loop below into getSummary() to avoid the extra copying and memory use;
        // creating tokens this way is suboptimal, and this whole class could be replaced by Lucene's highlighter.
        ArrayList<SToken> result = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream(QueryBuilder.FULL, text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
                result.add(t);
            }
            ts.end();
        }
        return result.toArray(new SToken[0]);
    }

    /**
     * Get the terms from a query and add them to the set of terms to
     * highlight. Query types other than boolean, phrase, wildcard, term and
     * prefix queries are ignored.
     */
    private void getTerms(Query query) {
        if (query instanceof BooleanQuery) {
            getBooleans((BooleanQuery) query);
        } else if (query instanceof PhraseQuery) {
            getPhrases((PhraseQuery) query);
        } else if (query instanceof WildcardQuery) {
            getWildTerm((WildcardQuery) query);
        } else if (query instanceof TermQuery) {
            getTerm((TermQuery) query);
        } else if (query instanceof PrefixQuery) {
            getPrefix((PrefixQuery) query);
        }
    }

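    /**
     * Recursively collects highlight terms from every clause of the boolean
     * query that is not prohibited (i.e. not a MUST_NOT clause).
     */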
    private void getBooleans(BooleanQuery query) {
        for (BooleanClause clause : query) {
            if (!clause.isProhibited()) {
                getTerms(clause.getQuery());
            }
        }
    }

    private void getPhrases(PhraseQuery query) {
        Term[] queryTerms = query.getTerms();
        for (Term queryTerm : queryTerms) {
            highlight.add(queryTerm.text());
        }
    }

    private void getTerm(TermQuery query) {
        highlight.add(query.getTerm().text());
    }

    private void getWildTerm(WildcardQuery query) {
        highlight.add(query.getTerm().text());
    }

    private void getPrefix(PrefixQuery query) {
        highlight.add(query.getPrefix().text());
    }
}