xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/search/Summarizer.java (revision 0e4c55544f8ea0a68e8bae37b0e502097e008ec1)
/*
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// modified by Lubos Kosco 2010 to upgrade lucene to 3.0.0
// modified by Lubos Kosco 2014 to upgrade lucene to 4.9.0
// TODO : rewrite this to use Highlighter from lucene contrib ...
package org.opengrok.indexer.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;

/**
 * Implements hit summarization.
 */
public class Summarizer {

    /**
     * The number of context terms to display preceding and following matches.
     */
    private static final int SUM_CONTEXT = 10;

    /**
     * The total number of terms to display in a summary.
     */
    private static final int SUM_LENGTH = 20;

    /**
     * Converts text to tokens.
     */
    private final Analyzer analyzer;

    private final Set<String> highlight = new HashSet<>();            // query terms to highlight

    public Summarizer(Query query, Analyzer a) {
        analyzer = a;
        getTerms(query);
    }
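
    // Usage sketch (illustrative only; the variable names below are
    // assumptions, not part of this class): build a Summarizer from the
    // parsed user query and the analyzer that tokenizes the FULL field,
    // then ask it for a summary of the raw document text:
    //
    //   Query query = new TermQuery(new Term(QueryBuilder.FULL, "main"));
    //   Summarizer summarizer = new Summarizer(query, analyzer);
    //   Summary summary = summarizer.getSummary(fileContents);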

    /**
     * Class Excerpt represents a single passage found in the document, with
     * the matching regions highlighted.
     */
    static class Excerpt {

        List<Summary.Fragment> passages = new ArrayList<>();
        Set<String> tokenSet = new TreeSet<>();
        int numTerms = 0;

        public void addToken(String token) {
            tokenSet.add(token);
        }

        /**
         * Return how many unique tokens we have.
         */
        public int numUniqueTokens() {
            return tokenSet.size();
        }

        /**
         * Return how many fragments we have.
         */
        public int numFragments() {
            return passages.size();
        }

        public void setNumTerms(int numTerms) {
            this.numTerms = numTerms;
        }

        public int getNumTerms() {
            return numTerms;
        }

        /**
         * Add a fragment to the list.
         */
        public void add(Summary.Fragment fragment) {
            passages.add(fragment);
        }

        /**
         * Return a list of all the fragments.
         */
        public List<Summary.Fragment> elements() {
            return passages;
        }
    }

    /**
     * Returns a summary for the given pre-tokenized text.
     *
     * @param text input text
     * @return summary of hits
     * @throws java.io.IOException I/O exception
     */
    public Summary getSummary(String text) throws IOException {
        if (text == null) {
            return null;
        }
        // Simplistic implementation.  Finds the first fragments in the document
        // containing any query terms.
        //
        // @TODO: check that phrases in the query are matched in the fragment

        SToken[] tokens = getTokens(text);             // parse text to token array

        if (tokens.length == 0) {
            return new Summary();
        }

        //
        // Create a SortedSet that ranks excerpts according to
        // how many query terms are present.  An excerpt is
        // a List full of Fragments and Highlights
        //
        SortedSet<Excerpt> excerptSet = new TreeSet<>((excerpt1, excerpt2) -> {
            if (excerpt1 == null) {
                return excerpt2 == null ? 0 : -1;
            } else if (excerpt2 == null) {
                return 1;
            } else {
                int numToks1 = excerpt1.numUniqueTokens();
                int numToks2 = excerpt2.numUniqueTokens();

                if (numToks1 < numToks2) {
                    return -1;
                } else if (numToks1 == numToks2) {
                    return excerpt1.numFragments() - excerpt2.numFragments();
                } else {
                    return 1;
                }
            }
        });
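        //
        // For example (illustration only): an excerpt containing three unique
        // query terms outranks one containing two, so excerptSet.last() below
        // always yields the richest remaining excerpt; ties on unique terms
        // are broken by the fragment count.
        //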

        //
        // Iterate through all terms in the document
        //
        int lastExcerptPos = 0;
        for (int i = 0; i < tokens.length; i++) {
            //
            // If we find a term that's in the query...
            //
            if (highlight.contains(tokens[i].toString())) {
                //
                // Start searching at a point SUM_CONTEXT terms back,
                // and move SUM_CONTEXT terms into the future.
                //
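                // For example (illustration only): with SUM_CONTEXT = 10, a
                // hit at token 15 gives startToken = 5 and
                // endToken = min(25, tokens.length).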
                int startToken = (i > SUM_CONTEXT) ? i - SUM_CONTEXT : 0;
                int endToken = Math.min(i + SUM_CONTEXT, tokens.length);
                int offset = tokens[startToken].startOffset();
                int j = startToken;

                //
                // Iterate from the start point to the finish, adding
                // terms all the way.  The end of the passage is always
                // SUM_CONTEXT beyond the last query-term.
                //
                Excerpt excerpt = new Excerpt();
                if (i != 0) {
                    excerpt.add(new Summary.Ellipsis());
                }

                //
                // Iterate through as long as we're before the end of
                // the document and we haven't hit the max-number-of-items
                // -in-a-summary.
                //
                while ((j < endToken) && (j - startToken < SUM_LENGTH)) {
                    //
                    // Now grab the hit-element, if present
                    //
                    SToken t = tokens[j];
                    if (highlight.contains(t.toString())) {
                        excerpt.addToken(t.toString());
                        excerpt.add(new Summary.Fragment(text.substring(offset, t.startOffset())));
                        excerpt.add(new Summary.Highlight(text.substring(t.startOffset(), t.endOffset())));
                        offset = t.endOffset();
                        endToken = Math.min(j + SUM_CONTEXT, tokens.length);
                    }

                    j++;
                }

                lastExcerptPos = endToken;

                //
                // We found the series of search-term hits and added
                // them (with intervening text) to the excerpt.  Now
                // we need to add the trailing edge of text.
                //
                // So if (j < tokens.length) then there is still trailing
                // text to add.  (We haven't hit the end of the source doc.)
                // Add the words since the last hit-term insert.
                //
                if (j < tokens.length) {
                    excerpt.add(new Summary.Fragment(text.substring(offset, tokens[j].endOffset())));
                }

                //
                // Remember how many terms are in this excerpt
                //
                excerpt.setNumTerms(j - startToken);

                //
                // Store the excerpt for later sorting
                //
                excerptSet.add(excerpt);
                //
                // Skip ahead so that the next search for a relevant
                // excerpt begins SUM_CONTEXT terms past the end of this one.
                //
                i = j + SUM_CONTEXT;
            }
        }

        //
        // If the target text doesn't appear, then we just
        // excerpt the first SUM_LENGTH words from the document.
        //
        if (excerptSet.isEmpty()) {
            Excerpt excerpt = new Excerpt();
            int excerptLen = Math.min(SUM_LENGTH, tokens.length);
            lastExcerptPos = excerptLen;

            excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen - 1].startOffset())));
            excerpt.setNumTerms(excerptLen);
            excerptSet.add(excerpt);
        }

        //
        // Now choose the best items from the excerpt set.
        // Stop when our Summary grows too large.
        //
        double tokenCount = 0;
        Summary s = new Summary();
        while (tokenCount <= SUM_LENGTH && excerptSet.size() > 0) {
            Excerpt excerpt = excerptSet.last();
            excerptSet.remove(excerpt);

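            //
            // Each fragment charges an equal share of the excerpt's term
            // count against the SUM_LENGTH budget. For example (illustration
            // only): an excerpt spanning 20 terms split into 5 fragments
            // charges 20 / 5 = 4.0 toward tokenCount per fragment added.
            //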
            double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
            for (Summary.Fragment f : excerpt.elements()) {
                // Don't add fragments if it takes us over the max-limit
                if (tokenCount + tokenFraction <= SUM_LENGTH) {
                    s.add(f);
                }
                tokenCount += tokenFraction;
            }
        }

        if (tokenCount > 0 && lastExcerptPos < tokens.length) {
            s.add(new Summary.Ellipsis());
        }
        return s;
    }

    private static class SToken extends PackedTokenAttributeImpl {

        SToken(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) {
            copyBuffer(startTermBuffer, termBufferOffset, termBufferLength);
            setOffset(start, end);
        }
    }

    private SToken[] getTokens(String text) throws IOException {
        // FIXME: integrate the loop below into getSummary to save the cloning
        // and memory; creating Tokens is also suboptimal with Lucene 3.0.0.
        // This whole class could be replaced by the Lucene highlighter.
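        // Each SToken carries the term text plus its start/end character
        // offsets into the original text; getSummary() uses those offsets to
        // substring the raw text. For example (assuming a whitespace-style
        // analyzer, which is the caller's choice): "foo bar" yields tokens
        // with offsets (0,3) and (4,7).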
        ArrayList<SToken> result = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream(QueryBuilder.FULL, text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
                result.add(t);
            }
            ts.end();
        }
        return result.toArray(new SToken[0]);
    }

    /**
     * Get the terms from a query and add them to the set of terms
     * to highlight.
     */
    private void getTerms(Query query) {
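        // For example (illustration only): a parsed query equivalent to
        // foo AND baz* arrives as a BooleanQuery; the recursion below
        // collects "foo" from the TermQuery clause and "baz" from the
        // PrefixQuery clause, so highlight ends up as {"foo", "baz"}.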
        if (query instanceof BooleanQuery) {
            getBooleans((BooleanQuery) query);
        } else if (query instanceof PhraseQuery) {
            getPhrases((PhraseQuery) query);
        } else if (query instanceof WildcardQuery) {
            getWildTerm((WildcardQuery) query);
        } else if (query instanceof TermQuery) {
            getTerm((TermQuery) query);
        } else if (query instanceof PrefixQuery) {
            getPrefix((PrefixQuery) query);
        }
    }

    private void getBooleans(BooleanQuery query) {
        for (BooleanClause clause : query) {
            if (!clause.isProhibited()) {
                getTerms(clause.getQuery());
            }
        }
    }

    private void getPhrases(PhraseQuery query) {
        Term[] queryTerms = query.getTerms();
        for (Term queryTerm : queryTerms) {
            highlight.add(queryTerm.text());
        }
    }

    private void getTerm(TermQuery query) {
        highlight.add(query.getTerm().text());
    }

    private void getWildTerm(WildcardQuery query) {
        highlight.add(query.getTerm().text());
    }

    private void getPrefix(PrefixQuery query) {
        highlight.add(query.getPrefix().text());
    }
}