/*
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// modified by Lubos Kosco 2010 to upgrade lucene to 3.0.0
// modified by Lubos Kosco 2014 to upgrade lucene to 4.9.0
// TODO : rewrite this to use Highlighter from lucene contrib ...
package org.opengrok.indexer.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;

/**
 * Implements hit summarization.
 */
public class Summarizer {

    /**
     * The number of context terms to display preceding and following matches.
     */
    private static final int SUM_CONTEXT = 10;

    /**
     * The total number of terms to display in a summary.
     */
    private static final int SUM_LENGTH = 20;

    /**
     * Converts text to tokens.
     */
    private final Analyzer analyzer;

    private final Set<String> highlight = new HashSet<>(); // put query terms in table

    public Summarizer(Query query, Analyzer a) {
        analyzer = a;
        getTerms(query);
    }
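
    // Illustrative usage (not part of the original class): callers are assumed
    // to hold a parsed Lucene Query and the Analyzer that matches the indexed
    // field, as elsewhere in OpenGrok's search code.
    //
    //   Summarizer summarizer = new Summarizer(query, analyzer);
    //   Summary summary = summarizer.getSummary(documentText);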

    /**
     * Class Excerpt represents a single passage found in the document, with
     * some appropriate regions highlighted.
     */
    static class Excerpt {

        List<Summary.Fragment> passages = new ArrayList<>();
        Set<String> tokenSet = new TreeSet<>();
        int numTerms = 0;

        public void addToken(String token) {
            tokenSet.add(token);
        }

        /**
         * Return how many unique tokens we have.
         */
        public int numUniqueTokens() {
            return tokenSet.size();
        }

        /**
         * How many fragments we have.
         */
        public int numFragments() {
            return passages.size();
        }

        public void setNumTerms(int numTerms) {
            this.numTerms = numTerms;
        }

        public int getNumTerms() {
            return numTerms;
        }

        /**
         * Add a fragment to the list.
         */
        public void add(Summary.Fragment fragment) {
            passages.add(fragment);
        }

        /**
         * Return a list of all the fragments.
         */
        public List<Summary.Fragment> elements() {
            return passages;
        }
    }

    /**
     * Returns a summary for the given pre-tokenized text.
     *
     * @param text input text
     * @return summary of hits
     * @throws java.io.IOException I/O exception
     */
    public Summary getSummary(String text) throws IOException {
        if (text == null) {
            return null;
        }
        // Simplistic implementation. Finds the first fragments in the document
        // containing any query terms.
        //
        // @TODO: check that phrases in the query are matched in the fragment

        SToken[] tokens = getTokens(text);     // parse text to token array

        if (tokens.length == 0) {
            return new Summary();
        }

        //
        // Create a SortedSet that ranks excerpts according to
        // how many query terms are present. An excerpt is
        // a List full of Fragments and Highlights.
        //
        SortedSet<Excerpt> excerptSet = new TreeSet<>((excerpt1, excerpt2) -> {
            if (excerpt1 == null) {
                return excerpt2 == null ? 0 : -1;
            } else if (excerpt2 == null) {
                return 1;
            } else {
                int numToks1 = excerpt1.numUniqueTokens();
                int numToks2 = excerpt2.numUniqueTokens();

                if (numToks1 < numToks2) {
                    return -1;
                } else if (numToks1 == numToks2) {
                    return excerpt1.numFragments() - excerpt2.numFragments();
                } else {
                    return 1;
                }
            }
        });
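
        // Ordering example: an excerpt whose tokenSet is {"foo", "bar"} sorts
        // after (i.e. ranks above) one whose tokenSet is just {"foo"}; on a
        // tie in unique tokens, the excerpt with more fragments wins. The
        // assembly loop below therefore pulls excerptSet.last() first.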

        //
        // Iterate through all terms in the document
        //
        int lastExcerptPos = 0;
        for (int i = 0; i < tokens.length; i++) {
            //
            // If we find a term that's in the query...
            //
            if (highlight.contains(tokens[i].toString())) {
                //
                // Start searching at a point SUM_CONTEXT terms back,
                // and move SUM_CONTEXT terms into the future.
                //
                int startToken = (i > SUM_CONTEXT) ? i - SUM_CONTEXT : 0;
                int endToken = Math.min(i + SUM_CONTEXT, tokens.length);
                int offset = tokens[startToken].startOffset();
                int j = startToken;

                //
                // Iterate from the start point to the finish, adding
                // terms all the way. The end of the passage is always
                // SUM_CONTEXT beyond the last query-term.
                //
                Excerpt excerpt = new Excerpt();
                if (i != 0) {
                    excerpt.add(new Summary.Ellipsis());
                }

                //
                // Iterate through as long as we're before the end of
                // the document and we haven't hit the max-number-of-items
                // -in-a-summary.
                //
                while ((j < endToken) && (j - startToken < SUM_LENGTH)) {
                    //
                    // Now grab the hit-element, if present
                    //
                    SToken t = tokens[j];
                    if (highlight.contains(t.toString())) {
                        excerpt.addToken(t.toString());
                        excerpt.add(new Summary.Fragment(text.substring(offset, t.startOffset())));
                        excerpt.add(new Summary.Highlight(text.substring(t.startOffset(), t.endOffset())));
                        offset = t.endOffset();
                        endToken = Math.min(j + SUM_CONTEXT, tokens.length);
                    }

                    j++;
                }

                lastExcerptPos = endToken;

                //
                // We found the series of search-term hits and added
                // them (with intervening text) to the excerpt. Now
                // we need to add the trailing edge of text.
                //
                // So if (j < tokens.length) then there is still trailing
                // text to add. (We haven't hit the end of the source doc.)
                // Add the words since the last hit-term insert.
                //
                if (j < tokens.length) {
                    excerpt.add(new Summary.Fragment(text.substring(offset, tokens[j].endOffset())));
                }

                //
                // Remember how many terms are in this excerpt
                //
                excerpt.setNumTerms(j - startToken);

                //
                // Store the excerpt for later sorting
                //
                excerptSet.add(excerpt);

                //
                // Skip ahead, so the scan for the next excerpt
                // resumes SUM_CONTEXT terms past the end of this one.
                //
                i = j + SUM_CONTEXT;
            }
        }

        //
        // If the target text doesn't appear, then we just
        // excerpt the first SUM_LENGTH words from the document.
        //
        if (excerptSet.isEmpty()) {
            Excerpt excerpt = new Excerpt();
            int excerptLen = Math.min(SUM_LENGTH, tokens.length);
            lastExcerptPos = excerptLen;

            excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen - 1].startOffset())));
            excerpt.setNumTerms(excerptLen);
            excerptSet.add(excerpt);
        }

        //
        // Now choose the best items from the excerpt set.
        // Stop when our Summary grows too large.
        //
        double tokenCount = 0;
        Summary s = new Summary();
        while (tokenCount <= SUM_LENGTH && excerptSet.size() > 0) {
            Excerpt excerpt = excerptSet.last();
            excerptSet.remove(excerpt);

            double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
            for (Summary.Fragment f : excerpt.elements()) {
                // Don't add fragments if it takes us over the max-limit
                if (tokenCount + tokenFraction <= SUM_LENGTH) {
                    s.add(f);
                }
                tokenCount += tokenFraction;
            }
        }

        if (tokenCount > 0 && lastExcerptPos < tokens.length) {
            s.add(new Summary.Ellipsis());
        }
        return s;
    }

    private static class SToken extends PackedTokenAttributeImpl {

        SToken(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) {
            copyBuffer(startTermBuffer, termBufferOffset, termBufferLength);
            setOffset(start, end);
        }
    }

    private SToken[] getTokens(String text) throws IOException {
        // FIXME: somehow integrate the cycle below into getSummary to save the cloning and memory;
        // also, creating Tokens is suboptimal with 3.0.0, and this whole class could be replaced by a highlighter.
        ArrayList<SToken> result = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream(QueryBuilder.FULL, text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
                result.add(t);
            }
            ts.end();
        }
        return result.toArray(new SToken[0]);
    }
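
    // A sketch of the rewrite suggested by the TODO/FIXME notes above, using
    // Lucene's highlighter module (lucene-highlighter). This is untested here,
    // and the returned String fragments would still have to be mapped onto
    // Summary objects:
    //
    //   Highlighter highlighter = new Highlighter(new QueryScorer(query));
    //   String fragment = highlighter.getBestFragment(analyzer, QueryBuilder.FULL, text);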

    /**
     * Extracts the terms from a query and adds them to the set of terms to highlight.
     */
    private void getTerms(Query query) {
        if (query instanceof BooleanQuery) {
            getBooleans((BooleanQuery) query);
        } else if (query instanceof PhraseQuery) {
            getPhrases((PhraseQuery) query);
        } else if (query instanceof WildcardQuery) {
            getWildTerm((WildcardQuery) query);
        } else if (query instanceof TermQuery) {
            getTerm((TermQuery) query);
        } else if (query instanceof PrefixQuery) {
            getPrefix((PrefixQuery) query);
        }
    }

    private void getBooleans(BooleanQuery query) {
        for (BooleanClause clause : query) {
            if (!clause.isProhibited()) {
                getTerms(clause.getQuery());
            }
        }
    }

    private void getPhrases(PhraseQuery query) {
        Term[] queryTerms = query.getTerms();
        for (Term queryTerm : queryTerms) {
            highlight.add(queryTerm.text());
        }
    }

    private void getTerm(TermQuery query) {
        highlight.add(query.getTerm().text());
    }

    private void getWildTerm(WildcardQuery query) {
        highlight.add(query.getTerm().text());
    }

    private void getPrefix(PrefixQuery query) {
        highlight.add(query.getPrefix().text());
    }
}