/*
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// modified by Lubos Kosco 2010 to upgrade lucene to 3.0.0
// modified by Lubos Kosco 2014 to upgrade lucene to 4.9.0
// TODO : rewrite this to use Highlighter from lucene contrib ...
package org.opengrok.indexer.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;

/**
 * Implements hit summarization.
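 * <p>
 * A minimal usage sketch (hypothetical example; {@code someAnalyzer} stands in
 * for whatever analyzer the caller uses to tokenize the document text):
 * <pre>{@code
 *     Query query = new TermQuery(new Term(QueryBuilder.FULL, "main"));
 *     Summarizer summarizer = new Summarizer(query, someAnalyzer);
 *     Summary summary = summarizer.getSummary(documentText);
 * }</pre>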
 */
public class Summarizer {

    /**
     * The number of context terms to display preceding and following matches.
     */
    private static final int SUM_CONTEXT = 10;

    /**
     * The total number of terms to display in a summary.
     */
    private static final int SUM_LENGTH = 20;

    /**
     * Converts text to tokens.
     */
    private final Analyzer analyzer;

    /**
     * The query terms to highlight in generated summaries.
     */
    private final Set<String> highlight = new HashSet<>();

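    /**
     * Creates a summarizer that highlights the terms of the given query in
     * summaries of text tokenized by the given analyzer.
     *
     * @param query the query whose terms should be highlighted
     * @param a the analyzer used to convert text to tokens
     */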
    public Summarizer(Query query, Analyzer a) {
        analyzer = a;
        getTerms(query);
    }

    /**
     * Class Excerpt represents a single passage found in the document, with
     * some appropriate regions highlighted.
     */
    static class Excerpt {

        List<Summary.Fragment> passages = new ArrayList<>();
        Set<String> tokenSet = new TreeSet<>();
        int numTerms = 0;

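        /**
         * Record a query term that occurs in this excerpt; the number of
         * unique terms recorded here is the primary ranking criterion when
         * excerpts are compared.
         */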
        public void addToken(String token) {
            tokenSet.add(token);
        }

        /**
         * Return how many unique tokens we have.
         */
        public int numUniqueTokens() {
            return tokenSet.size();
        }

        /**
         * Return how many fragments we have.
         */
        public int numFragments() {
            return passages.size();
        }

        public void setNumTerms(int numTerms) {
            this.numTerms = numTerms;
        }

        public int getNumTerms() {
            return numTerms;
        }

        /**
         * Add a fragment to the list.
         */
        public void add(Summary.Fragment fragment) {
            passages.add(fragment);
        }

        /**
         * Return the list of all fragments.
         */
        public List<Summary.Fragment> elements() {
            return passages;
        }
    }

    /**
     * Returns a summary for the given pre-tokenized text.
     *
     * @param text input text
     * @return summary of hits
     * @throws java.io.IOException I/O exception
     */
    public Summary getSummary(String text) throws IOException {
        if (text == null) {
            return null;
        }
        // Simplistic implementation.  Finds the first fragments in the document
        // containing any query terms.
        //
        // @TODO: check that phrases in the query are matched in the fragment

        SToken[] tokens = getTokens(text);      // parse text to token array

        if (tokens.length == 0) {
            return new Summary();
        }

        //
        // Create a SortedSet that ranks excerpts according to
        // how many query terms are present.  An excerpt is
        // a List full of Fragments and Highlights
        //
        SortedSet<Excerpt> excerptSet = new TreeSet<>((excerpt1, excerpt2) -> {
            if (excerpt1 == null) {
                return excerpt2 == null ? 0 : -1;
            } else if (excerpt2 == null) {
                return 1;
            } else {
                int numToks1 = excerpt1.numUniqueTokens();
                int numToks2 = excerpt2.numUniqueTokens();

                if (numToks1 < numToks2) {
                    return -1;
                } else if (numToks1 == numToks2) {
                    return excerpt1.numFragments() - excerpt2.numFragments();
                } else {
                    return 1;
                }
            }
        });

        //
        // Iterate through all terms in the document
        //
        int lastExcerptPos = 0;
        for (int i = 0; i < tokens.length; i++) {
            //
            // If we find a term that's in the query...
            //
            if (highlight.contains(tokens[i].toString())) {
                //
                // Start searching at a point SUM_CONTEXT terms back,
                // and move SUM_CONTEXT terms into the future.
                //
                int startToken = (i > SUM_CONTEXT) ? i - SUM_CONTEXT : 0;
                int endToken = Math.min(i + SUM_CONTEXT, tokens.length);
                int offset = tokens[startToken].startOffset();
                int j = startToken;

                //
                // Iterate from the start point to the finish, adding
                // terms all the way.  The end of the passage is always
                // SUM_CONTEXT beyond the last query-term.
                //
                Excerpt excerpt = new Excerpt();
                if (i != 0) {
                    excerpt.add(new Summary.Ellipsis());
                }

                //
                // Iterate through as long as we're before the end of
                // the document and we haven't hit the max-number-of-items
                // -in-a-summary.
                //
                while ((j < endToken) && (j - startToken < SUM_LENGTH)) {
                    //
                    // Now grab the hit-element, if present
                    //
                    SToken t = tokens[j];
                    if (highlight.contains(t.toString())) {
                        excerpt.addToken(t.toString());
                        excerpt.add(new Summary.Fragment(text.substring(offset, t.startOffset())));
                        excerpt.add(new Summary.Highlight(text.substring(t.startOffset(), t.endOffset())));
                        offset = t.endOffset();
                        endToken = Math.min(j + SUM_CONTEXT, tokens.length);
                    }

                    j++;
                }

                lastExcerptPos = endToken;

                //
                // We found the series of search-term hits and added
                // them (with intervening text) to the excerpt.  Now
                // we need to add the trailing edge of text.
                //
                // So if (j < tokens.length) then there is still trailing
                // text to add.  (We haven't hit the end of the source doc.)
                // Add the words since the last hit-term insert.
                //
                if (j < tokens.length) {
                    excerpt.add(new Summary.Fragment(text.substring(offset, tokens[j].endOffset())));
                }

                //
                // Remember how many terms are in this excerpt
                //
                excerpt.setNumTerms(j - startToken);

                //
                // Store the excerpt for later sorting
                //
                excerptSet.add(excerpt);

                //
                // Resume the scan SUM_CONTEXT places past the end of this
                // excerpt; the next search for query terms effectively
                // begins at j + SUM_CONTEXT.
                //
                i = j + SUM_CONTEXT;
            }
        }

        //
        // If the target text doesn't appear, then we just
        // excerpt the first SUM_LENGTH words from the document.
        //
        if (excerptSet.isEmpty()) {
            Excerpt excerpt = new Excerpt();
            int excerptLen = Math.min(SUM_LENGTH, tokens.length);
            lastExcerptPos = excerptLen;

            excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen - 1].startOffset())));
            excerpt.setNumTerms(excerptLen);
            excerptSet.add(excerpt);
        }

        //
        // Now choose the best items from the excerpt set.
        // Stop when our Summary grows too large.
        //
        double tokenCount = 0;
        Summary s = new Summary();
        while (tokenCount <= SUM_LENGTH && excerptSet.size() > 0) {
            Excerpt excerpt = excerptSet.last();
            excerptSet.remove(excerpt);

            double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
            for (Summary.Fragment f : excerpt.elements()) {
                // Don't add fragments if it takes us over the max-limit
                if (tokenCount + tokenFraction <= SUM_LENGTH) {
                    s.add(f);
                }
                tokenCount += tokenFraction;
            }
        }

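        //
        // If we produced any output and the document continues past the last
        // excerpt, append a trailing ellipsis to signal the truncation.
        //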
        if (tokenCount > 0 && lastExcerptPos < tokens.length) {
            s.add(new Summary.Ellipsis());
        }
        return s;
    }

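    /**
     * A single token of the summarized text: the term characters packed
     * together with their start and end character offsets in the original
     * string.
     */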
    private static class SToken extends PackedTokenAttributeImpl {

        SToken(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) {
            copyBuffer(startTermBuffer, termBufferOffset, termBufferLength);
            setOffset(start, end);
        }
    }

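    /**
     * Tokenizes the given text with the configured analyzer, recording each
     * term together with its character offsets.
     *
     * @param text the text to tokenize
     * @return the tokens in document order
     * @throws IOException if the token stream cannot be read
     */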
    private SToken[] getTokens(String text) throws IOException {
        // FIXME: integrate the loop below into getSummary() to avoid the extra copying and memory use;
        // creating tokens this way is suboptimal, and this whole class could be replaced by Lucene's highlighter.
        ArrayList<SToken> result = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream(QueryBuilder.FULL, text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
                result.add(t);
            }
            ts.end();
        }
        return result.toArray(new SToken[0]);
    }

    /**
     * Get the terms from a query and add them to the set of terms to
     * highlight. Query types other than boolean, phrase, wildcard, term and
     * prefix queries are ignored.
     */
    private void getTerms(Query query) {
        if (query instanceof BooleanQuery) {
            getBooleans((BooleanQuery) query);
        } else if (query instanceof PhraseQuery) {
            getPhrases((PhraseQuery) query);
        } else if (query instanceof WildcardQuery) {
            getWildTerm((WildcardQuery) query);
        } else if (query instanceof TermQuery) {
            getTerm((TermQuery) query);
        } else if (query instanceof PrefixQuery) {
            getPrefix((PrefixQuery) query);
        }
    }

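    /**
     * Recursively collects highlight terms from every clause of the boolean
     * query that is not prohibited (i.e. not a MUST_NOT clause).
     */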
    private void getBooleans(BooleanQuery query) {
        for (BooleanClause clause : query) {
            if (!clause.isProhibited()) {
                getTerms(clause.getQuery());
            }
        }
    }

    private void getPhrases(PhraseQuery query) {
        Term[] queryTerms = query.getTerms();
        for (Term queryTerm : queryTerms) {
            highlight.add(queryTerm.text());
        }
    }

    private void getTerm(TermQuery query) {
        highlight.add(query.getTerm().text());
    }

    private void getWildTerm(WildcardQuery query) {
        highlight.add(query.getTerm().text());
    }

    private void getPrefix(PrefixQuery query) {
        highlight.add(query.getPrefix().text());
    }
}