1b5840353SAdam Hornáček /* 2b5840353SAdam Hornáček * Copyright 2005 The Apache Software Foundation 3b5840353SAdam Hornáček * 4b5840353SAdam Hornáček * Licensed under the Apache License, Version 2.0 (the "License"); 5b5840353SAdam Hornáček * you may not use this file except in compliance with the License. 6b5840353SAdam Hornáček * You may obtain a copy of the License at 7b5840353SAdam Hornáček * 8b5840353SAdam Hornáček * http://www.apache.org/licenses/LICENSE-2.0 9b5840353SAdam Hornáček * 10b5840353SAdam Hornáček * Unless required by applicable law or agreed to in writing, software 11b5840353SAdam Hornáček * distributed under the License is distributed on an "AS IS" BASIS, 12b5840353SAdam Hornáček * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13b5840353SAdam Hornáček * See the License for the specific language governing permissions and 14b5840353SAdam Hornáček * limitations under the License. 15b5840353SAdam Hornáček */ 16b5840353SAdam Hornáček // modified by Lubos Kosco 2010 to upgrade lucene to 3.0.0 17b5840353SAdam Hornáček // modified by Lubos Kosco 2014 to upgrade lucene to 4.9.0 18b5840353SAdam Hornáček // TODO : rewrite this to use Highlighter from lucene contrib ... 199805b761SAdam Hornáček package org.opengrok.indexer.search; 20b5840353SAdam Hornáček 21b5840353SAdam Hornáček import java.io.IOException; 22b5840353SAdam Hornáček import java.util.ArrayList; 23b5840353SAdam Hornáček import java.util.HashSet; 24b5840353SAdam Hornáček import java.util.List; 25b5840353SAdam Hornáček import java.util.Set; 26b5840353SAdam Hornáček import java.util.SortedSet; 27b5840353SAdam Hornáček import java.util.TreeSet; 28b5840353SAdam Hornáček import org.apache.lucene.analysis.Analyzer; 29b5840353SAdam Hornáček import org.apache.lucene.analysis.TokenStream; 30b5840353SAdam Hornáček import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 31b5840353SAdam Hornáček import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 32b5840353SAdam Hornáček import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl; 33b5840353SAdam Hornáček import org.apache.lucene.index.Term; 34b5840353SAdam Hornáček import org.apache.lucene.search.BooleanClause; 35b5840353SAdam Hornáček import org.apache.lucene.search.BooleanQuery; 36b5840353SAdam Hornáček import org.apache.lucene.search.PhraseQuery; 37b5840353SAdam Hornáček import org.apache.lucene.search.PrefixQuery; 38b5840353SAdam Hornáček import org.apache.lucene.search.Query; 39b5840353SAdam Hornáček import org.apache.lucene.search.TermQuery; 40b5840353SAdam Hornáček import org.apache.lucene.search.WildcardQuery; 41b5840353SAdam Hornáček 42b5840353SAdam Hornáček /** 43b5840353SAdam Hornáček * Implements hit summarization. 44b5840353SAdam Hornáček */ 45b5840353SAdam Hornáček public class Summarizer { 46b5840353SAdam Hornáček 47b5840353SAdam Hornáček /** 48b5840353SAdam Hornáček * The number of context terms to display preceding and following matches. 49b5840353SAdam Hornáček */ 50b5840353SAdam Hornáček private static final int SUM_CONTEXT = 10; 51b5840353SAdam Hornáček 52b5840353SAdam Hornáček /** 53b5840353SAdam Hornáček * The total number of terms to display in a summary. 54b5840353SAdam Hornáček */ 55b5840353SAdam Hornáček private static final int SUM_LENGTH = 20; 56b5840353SAdam Hornáček 57b5840353SAdam Hornáček /** 58b5840353SAdam Hornáček * Converts text to tokens. 59b5840353SAdam Hornáček */ 60b5840353SAdam Hornáček private final Analyzer analyzer; 61b5840353SAdam Hornáček 62b5840353SAdam Hornáček private final Set<String> highlight = new HashSet<>(); // put query terms in table 63b5840353SAdam Hornáček Summarizer(Query query, Analyzer a)64b5840353SAdam Hornáček public Summarizer(Query query, Analyzer a) { 65b5840353SAdam Hornáček analyzer = a; 66b5840353SAdam Hornáček getTerms(query); 67b5840353SAdam Hornáček } 68b5840353SAdam Hornáček 69b5840353SAdam Hornáček /** 70b5840353SAdam Hornáček * Class Excerpt represents a single passage found in the document, with 71b5840353SAdam Hornáček * some appropriate regions highlight. 72b5840353SAdam Hornáček */ 73b5840353SAdam Hornáček static class Excerpt { 74b5840353SAdam Hornáček 75b5840353SAdam Hornáček List<Summary.Fragment> passages = new ArrayList<>(); 76b5840353SAdam Hornáček Set<String> tokenSet = new TreeSet<>(); 77b5840353SAdam Hornáček int numTerms = 0; 78b5840353SAdam Hornáček addToken(String token)79b5840353SAdam Hornáček public void addToken(String token) { 80b5840353SAdam Hornáček tokenSet.add(token); 81b5840353SAdam Hornáček } 82b5840353SAdam Hornáček 83b5840353SAdam Hornáček /** 84ff44f24aSAdam Hornáček * Return how many unique tokens we have. 85b5840353SAdam Hornáček */ numUniqueTokens()86b5840353SAdam Hornáček public int numUniqueTokens() { 87b5840353SAdam Hornáček return tokenSet.size(); 88b5840353SAdam Hornáček } 89b5840353SAdam Hornáček 90b5840353SAdam Hornáček /** 91b5840353SAdam Hornáček * How many fragments we have. 92b5840353SAdam Hornáček */ numFragments()93b5840353SAdam Hornáček public int numFragments() { 94b5840353SAdam Hornáček return passages.size(); 95b5840353SAdam Hornáček } 96b5840353SAdam Hornáček setNumTerms(int numTerms)97b5840353SAdam Hornáček public void setNumTerms(int numTerms) { 98b5840353SAdam Hornáček this.numTerms = numTerms; 99b5840353SAdam Hornáček } 100b5840353SAdam Hornáček getNumTerms()101b5840353SAdam Hornáček public int getNumTerms() { 102b5840353SAdam Hornáček return numTerms; 103b5840353SAdam Hornáček } 104b5840353SAdam Hornáček 105b5840353SAdam Hornáček /** 106b5840353SAdam Hornáček * Add a frag to the list. 107b5840353SAdam Hornáček */ add(Summary.Fragment fragment)108b5840353SAdam Hornáček public void add(Summary.Fragment fragment) { 109b5840353SAdam Hornáček passages.add(fragment); 110b5840353SAdam Hornáček } 111b5840353SAdam Hornáček 112b5840353SAdam Hornáček /** 113ff44f24aSAdam Hornáček * Return an Enum for all the fragments. 114b5840353SAdam Hornáček */ elements()115b5840353SAdam Hornáček public List<Summary.Fragment> elements() { 116b5840353SAdam Hornáček return passages; 117b5840353SAdam Hornáček } 118b5840353SAdam Hornáček } 119b5840353SAdam Hornáček 120b5840353SAdam Hornáček /** 121b5840353SAdam Hornáček * Returns a summary for the given pre-tokenized text. 122b5840353SAdam Hornáček * 123b5840353SAdam Hornáček * @param text input text 124b5840353SAdam Hornáček * @return summary of hits 12581b586e6SVladimir Kotal * @throws java.io.IOException I/O exception 126b5840353SAdam Hornáček */ getSummary(String text)127b5840353SAdam Hornáček public Summary getSummary(String text) throws IOException { 128b5840353SAdam Hornáček if (text == null) { 129b5840353SAdam Hornáček return null; 130b5840353SAdam Hornáček } 131b5840353SAdam Hornáček // Simplistic implementation. Finds the first fragments in the document 132b5840353SAdam Hornáček // containing any query terms. 133b5840353SAdam Hornáček // 134b5840353SAdam Hornáček // @TODO: check that phrases in the query are matched in the fragment 135b5840353SAdam Hornáček 136b5840353SAdam Hornáček SToken[] tokens = getTokens(text); // parse text to token array 137b5840353SAdam Hornáček 138b5840353SAdam Hornáček if (tokens.length == 0) { 139b5840353SAdam Hornáček return new Summary(); 140b5840353SAdam Hornáček } 141b5840353SAdam Hornáček 142b5840353SAdam Hornáček // 143b5840353SAdam Hornáček // Create a SortedSet that ranks excerpts according to 144b5840353SAdam Hornáček // how many query terms are present. An excerpt is 145b5840353SAdam Hornáček // a List full of Fragments and Highlights 146b5840353SAdam Hornáček // 147*c6f0939bSAdam Hornacek SortedSet<Excerpt> excerptSet = new TreeSet<>((excerpt1, excerpt2) -> { 148b5840353SAdam Hornáček if (excerpt1 == null) { 149b5840353SAdam Hornáček return excerpt2 == null ? 0 : -1; 150b5840353SAdam Hornáček } else if (excerpt2 == null) { 151b5840353SAdam Hornáček return 1; 152b5840353SAdam Hornáček } else { 153b5840353SAdam Hornáček int numToks1 = excerpt1.numUniqueTokens(); 154b5840353SAdam Hornáček int numToks2 = excerpt2.numUniqueTokens(); 155b5840353SAdam Hornáček 156b5840353SAdam Hornáček if (numToks1 < numToks2) { 157b5840353SAdam Hornáček return -1; 158b5840353SAdam Hornáček } else if (numToks1 == numToks2) { 159b5840353SAdam Hornáček return excerpt1.numFragments() - excerpt2.numFragments(); 160b5840353SAdam Hornáček } else { 161b5840353SAdam Hornáček return 1; 162b5840353SAdam Hornáček } 163b5840353SAdam Hornáček } 164b5840353SAdam Hornáček }); 165b5840353SAdam Hornáček 166b5840353SAdam Hornáček // 167b5840353SAdam Hornáček // Iterate through all terms in the document 168b5840353SAdam Hornáček // 169b5840353SAdam Hornáček int lastExcerptPos = 0; 170b5840353SAdam Hornáček for (int i = 0; i < tokens.length; i++) { 171b5840353SAdam Hornáček // 172b5840353SAdam Hornáček // If we find a term that's in the query... 173b5840353SAdam Hornáček // 174b5840353SAdam Hornáček if (highlight.contains(tokens[i].toString())) { 175b5840353SAdam Hornáček // 176b5840353SAdam Hornáček // Start searching at a point SUM_CONTEXT terms back, 177b5840353SAdam Hornáček // and move SUM_CONTEXT terms into the future. 178b5840353SAdam Hornáček // 179b5840353SAdam Hornáček int startToken = (i > SUM_CONTEXT) ? i - SUM_CONTEXT : 0; 180b5840353SAdam Hornáček int endToken = Math.min(i + SUM_CONTEXT, tokens.length); 181b5840353SAdam Hornáček int offset = tokens[startToken].startOffset(); 182b5840353SAdam Hornáček int j = startToken; 183b5840353SAdam Hornáček 184b5840353SAdam Hornáček // 185b5840353SAdam Hornáček // Iterate from the start point to the finish, adding 186b5840353SAdam Hornáček // terms all the way. The end of the passage is always 187b5840353SAdam Hornáček // SUM_CONTEXT beyond the last query-term. 188b5840353SAdam Hornáček // 189b5840353SAdam Hornáček Excerpt excerpt = new Excerpt(); 190b5840353SAdam Hornáček if (i != 0) { 191b5840353SAdam Hornáček excerpt.add(new Summary.Ellipsis()); 192b5840353SAdam Hornáček } 193b5840353SAdam Hornáček 194b5840353SAdam Hornáček // 195b5840353SAdam Hornáček // Iterate through as long as we're before the end of 196b5840353SAdam Hornáček // the document and we haven't hit the max-number-of-items 197b5840353SAdam Hornáček // -in-a-summary. 198b5840353SAdam Hornáček // 199b5840353SAdam Hornáček while ((j < endToken) && (j - startToken < SUM_LENGTH)) { 200b5840353SAdam Hornáček // 201b5840353SAdam Hornáček // Now grab the hit-element, if present 202b5840353SAdam Hornáček // 203b5840353SAdam Hornáček SToken t = tokens[j]; 204b5840353SAdam Hornáček if (highlight.contains(t.toString())) { 205b5840353SAdam Hornáček excerpt.addToken(t.toString()); 206b5840353SAdam Hornáček excerpt.add(new Summary.Fragment(text.substring(offset, t.startOffset()))); 207b5840353SAdam Hornáček excerpt.add(new Summary.Highlight(text.substring(t.startOffset(), t.endOffset()))); 208b5840353SAdam Hornáček offset = t.endOffset(); 209b5840353SAdam Hornáček endToken = Math.min(j + SUM_CONTEXT, tokens.length); 210b5840353SAdam Hornáček } 211b5840353SAdam Hornáček 212b5840353SAdam Hornáček j++; 213b5840353SAdam Hornáček } 214b5840353SAdam Hornáček 215b5840353SAdam Hornáček lastExcerptPos = endToken; 216b5840353SAdam Hornáček 217b5840353SAdam Hornáček // 218b5840353SAdam Hornáček // We found the series of search-term hits and added 219b5840353SAdam Hornáček // them (with intervening text) to the excerpt. Now 220b5840353SAdam Hornáček // we need to add the trailing edge of text. 221b5840353SAdam Hornáček // 222b5840353SAdam Hornáček // So if (j < tokens.length) then there is still trailing 223b5840353SAdam Hornáček // text to add. (We haven't hit the end of the source doc.) 224b5840353SAdam Hornáček // Add the words since the last hit-term insert. 225b5840353SAdam Hornáček // 226b5840353SAdam Hornáček if (j < tokens.length) { 227b5840353SAdam Hornáček excerpt.add(new Summary.Fragment(text.substring(offset, tokens[j].endOffset()))); 228b5840353SAdam Hornáček } 229b5840353SAdam Hornáček 230b5840353SAdam Hornáček // 231b5840353SAdam Hornáček // Remember how many terms are in this excerpt 232b5840353SAdam Hornáček // 233b5840353SAdam Hornáček excerpt.setNumTerms(j - startToken); 234b5840353SAdam Hornáček 235b5840353SAdam Hornáček // 236b5840353SAdam Hornáček // Store the excerpt for later sorting 237b5840353SAdam Hornáček // 238b5840353SAdam Hornáček excerptSet.add(excerpt); 239b5840353SAdam Hornáček 240b5840353SAdam Hornáček // 241b5840353SAdam Hornáček // Start SUM_CONTEXT places away. The next 242b5840353SAdam Hornáček // search for relevant excerpts begins at i-SUM_CONTEXT 243b5840353SAdam Hornáček // 244b5840353SAdam Hornáček i = j + SUM_CONTEXT; 245b5840353SAdam Hornáček } 246b5840353SAdam Hornáček } 247b5840353SAdam Hornáček 248b5840353SAdam Hornáček // 249b5840353SAdam Hornáček // If the target text doesn't appear, then we just 250b5840353SAdam Hornáček // excerpt the first SUM_LENGTH words from the document. 251b5840353SAdam Hornáček // 252*c6f0939bSAdam Hornacek if (excerptSet.isEmpty()) { 253b5840353SAdam Hornáček Excerpt excerpt = new Excerpt(); 254b5840353SAdam Hornáček int excerptLen = Math.min(SUM_LENGTH, tokens.length); 255b5840353SAdam Hornáček lastExcerptPos = excerptLen; 256b5840353SAdam Hornáček 257b5840353SAdam Hornáček excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen - 1].startOffset()))); 258b5840353SAdam Hornáček excerpt.setNumTerms(excerptLen); 259b5840353SAdam Hornáček excerptSet.add(excerpt); 260b5840353SAdam Hornáček } 261b5840353SAdam Hornáček 262b5840353SAdam Hornáček // 263b5840353SAdam Hornáček // Now choose the best items from the excerpt set. 264b5840353SAdam Hornáček // Stop when our Summary grows too large. 265b5840353SAdam Hornáček // 266b5840353SAdam Hornáček double tokenCount = 0; 267b5840353SAdam Hornáček Summary s = new Summary(); 268b5840353SAdam Hornáček while (tokenCount <= SUM_LENGTH && excerptSet.size() > 0) { 269b5840353SAdam Hornáček Excerpt excerpt = excerptSet.last(); 270b5840353SAdam Hornáček excerptSet.remove(excerpt); 271b5840353SAdam Hornáček 272b5840353SAdam Hornáček double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments(); 273b5840353SAdam Hornáček for (Summary.Fragment f : excerpt.elements()) { 274b5840353SAdam Hornáček // Don't add fragments if it takes us over the max-limit 275b5840353SAdam Hornáček if (tokenCount + tokenFraction <= SUM_LENGTH) { 276b5840353SAdam Hornáček s.add(f); 277b5840353SAdam Hornáček } 278b5840353SAdam Hornáček tokenCount += tokenFraction; 279b5840353SAdam Hornáček } 280b5840353SAdam Hornáček } 281b5840353SAdam Hornáček 282b5840353SAdam Hornáček if (tokenCount > 0 && lastExcerptPos < tokens.length) { 283b5840353SAdam Hornáček s.add(new Summary.Ellipsis()); 284b5840353SAdam Hornáček } 285b5840353SAdam Hornáček return s; 286b5840353SAdam Hornáček } 287b5840353SAdam Hornáček 288*c6f0939bSAdam Hornacek private static class SToken extends PackedTokenAttributeImpl { 289b5840353SAdam Hornáček SToken(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end)290d1e826faSAdam Hornáček SToken(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) { 291b5840353SAdam Hornáček copyBuffer(startTermBuffer, termBufferOffset, termBufferLength); 292b5840353SAdam Hornáček setOffset(start, end); 293b5840353SAdam Hornáček } 294b5840353SAdam Hornáček } 295b5840353SAdam Hornáček getTokens(String text)296b5840353SAdam Hornáček private SToken[] getTokens(String text) throws IOException { 297b5840353SAdam Hornáček //FIXME somehow integrate below cycle to getSummary to save the cloning and memory, 298b5840353SAdam Hornáček //also creating Tokens is suboptimal with 3.0.0 , this whole class could be replaced by highlighter 299b5840353SAdam Hornáček ArrayList<SToken> result = new ArrayList<>(); 3001c830032SChris Fraire try (TokenStream ts = analyzer.tokenStream(QueryBuilder.FULL, text)) { 301b5840353SAdam Hornáček CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); 302b5840353SAdam Hornáček OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); 303b5840353SAdam Hornáček ts.reset(); 304b5840353SAdam Hornáček while (ts.incrementToken()) { 305b5840353SAdam Hornáček SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset()); 306b5840353SAdam Hornáček result.add(t); 307b5840353SAdam Hornáček } 308b5840353SAdam Hornáček ts.end(); 309b5840353SAdam Hornáček } 310*c6f0939bSAdam Hornacek return result.toArray(new SToken[0]); 311b5840353SAdam Hornáček } 312b5840353SAdam Hornáček 313b5840353SAdam Hornáček /** 314ff44f24aSAdam Hornáček * Get the terms from a query and adds them to highlight a stream of tokens. 315b5840353SAdam Hornáček */ getTerms(Query query)316b5840353SAdam Hornáček private void getTerms(Query query) { 317b5840353SAdam Hornáček if (query instanceof BooleanQuery) { 318b5840353SAdam Hornáček getBooleans((BooleanQuery) query); 319b5840353SAdam Hornáček } else if (query instanceof PhraseQuery) { 320b5840353SAdam Hornáček getPhrases((PhraseQuery) query); 321b5840353SAdam Hornáček } else if (query instanceof WildcardQuery) { 322b5840353SAdam Hornáček getWildTerm((WildcardQuery) query); 323b5840353SAdam Hornáček } else if (query instanceof TermQuery) { 324b5840353SAdam Hornáček getTerm((TermQuery) query); 325b5840353SAdam Hornáček } else if (query instanceof PrefixQuery) { 326b5840353SAdam Hornáček getPrefix((PrefixQuery) query); 327b5840353SAdam Hornáček } 328b5840353SAdam Hornáček } 329b5840353SAdam Hornáček getBooleans(BooleanQuery query)330b5840353SAdam Hornáček private void getBooleans(BooleanQuery query) { 331fa525f6dSVladimir Kotal for (BooleanClause clause : query) { 332b5840353SAdam Hornáček if (!clause.isProhibited()) { 333b5840353SAdam Hornáček getTerms(clause.getQuery()); 334b5840353SAdam Hornáček } 335b5840353SAdam Hornáček } 336b5840353SAdam Hornáček } 337b5840353SAdam Hornáček getPhrases(PhraseQuery query)338b5840353SAdam Hornáček private void getPhrases(PhraseQuery query) { 339b5840353SAdam Hornáček Term[] queryTerms = query.getTerms(); 340b5840353SAdam Hornáček for (Term queryTerm : queryTerms) { 341b5840353SAdam Hornáček highlight.add(queryTerm.text()); 342b5840353SAdam Hornáček } 343b5840353SAdam Hornáček } 344b5840353SAdam Hornáček getTerm(TermQuery query)345b5840353SAdam Hornáček private void getTerm(TermQuery query) { 346b5840353SAdam Hornáček highlight.add(query.getTerm().text()); 347b5840353SAdam Hornáček } 348b5840353SAdam Hornáček getWildTerm(WildcardQuery query)349b5840353SAdam Hornáček private void getWildTerm(WildcardQuery query) { 350b5840353SAdam Hornáček highlight.add(query.getTerm().text()); 351b5840353SAdam Hornáček } 352b5840353SAdam Hornáček getPrefix(PrefixQuery query)353b5840353SAdam Hornáček private void getPrefix(PrefixQuery query) { 354b5840353SAdam Hornáček highlight.add(query.getPrefix().text()); 355b5840353SAdam Hornáček } 356b5840353SAdam Hornáček } 357