/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, 2019, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2011, Jens Elkner.
 * Portions Copyright (c) 2017, 2020, Chris Fraire <cfraire@me.com>.
 */
package org.opengrok.indexer.search;

import java.io.File;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;

/**
 * Helper class that builds a Lucene query based on provided search terms for
 * the different fields.
 */
public class QueryBuilder {

    /**
     * Fields we use in Lucene: public ones.
     */
    public static final String FULL = "full";
    public static final String DEFS = "defs";
    public static final String REFS = "refs";
    public static final String PATH = "path";
    public static final String HIST = "hist";
    public static final String TYPE = "type";
    public static final String SCOPES = "scopes";
    public static final String NUML = "numl";
    public static final String LOC = "loc";
    public static final String LASTREV = "lastrev"; // last revision
    /**
     * Fields we use in Lucene: internal ones.
     */
    public static final String D = "d"; // Directory key
    public static final String U = "u"; // File and timestamp key
    public static final String TAGS = "tags";
    public static final String T = "t";
    public static final String FULLPATH = "fullpath";
    public static final String DIRPATH = "dirpath";
    public static final String PROJECT = "project";
    public static final String DATE = "date";
    public static final String OBJUID = "objuid"; // object UID
    public static final String OBJSER = "objser"; // object serialized
    public static final String OBJVER = "objver"; // object version

    protected static final List<String> searchFields = Arrays.asList(FULL, DEFS, REFS, PATH, HIST);
    private static final HashSet<String> searchFieldsSet = new HashSet<>(searchFields);

    /** Used for paths, so SHA-1 is completely sufficient. */
    private static final String DIRPATH_HASH_ALGORITHM = "SHA-1";

    /**
     * A map containing the query text for each field. (We use a sorted map here
     * only because we have tests that check the generated query string. If we
     * had used a hash map, the order of the terms could have varied between
     * platforms and it would be harder to test.)
     */
    private final Map<String, String> queries = new TreeMap<>();

    /**
     * Gets the names of the fields that can be searched.
     *
     * @return an unmodifiable view of the search field names
     */
    public static List<String> getSearchFields() {
        return Collections.unmodifiableList(searchFields);
    }

    /**
     * Gets a value indicating if the specified {@code fieldName} is a valid
     * search field.
     *
     * @param fieldName the field name to check
     * @return {@code true} if {@code fieldName} is one of the search fields
     */
    public static boolean isSearchField(String fieldName) {
        return searchFieldsSet.contains(fieldName);
    }

    /**
     * Sets the instance to the state of {@code other}.
     * @param other a defined instance
     * @return {@code this}
     * @throws IllegalArgumentException if {@code other} is {@code null}
     */
    public QueryBuilder reset(QueryBuilder other) {
        if (other == null) {
            throw new IllegalArgumentException("other is null");
        }
        // Copying from ourselves would clear the map before reading it.
        if (this != other) {
            queries.clear();
            queries.putAll(other.queries);
        }
        return this;
    }

    /**
     * Set search string for the {@link #FULL} field.
     *
     * @param freetext query string to set
     * @return this instance
     */
    public QueryBuilder setFreetext(String freetext) {
        return addQueryText(FULL, freetext);
    }

    /**
     * Get search string for the {@link #FULL} field.
     *
     * @return {@code null} if not set, the query string otherwise.
     */
    public String getFreetext() {
        return getQueryText(FULL);
    }

    /**
     * Set search string for the {@link #DEFS} field.
     *
     * @param defs query string to set
     * @return this instance
     */
    public QueryBuilder setDefs(String defs) {
        return addQueryText(DEFS, defs);
    }

    /**
     * Get search string for the {@link #DEFS} field.
     *
     * @return {@code null} if not set, the query string otherwise.
     */
    public String getDefs() {
        return getQueryText(DEFS);
    }

    /**
     * Set search string for the {@link #REFS} field.
     *
     * @param refs query string to set
     * @return this instance
     */
    public QueryBuilder setRefs(String refs) {
        return addQueryText(REFS, refs);
    }

    /**
     * Get search string for the {@link #REFS} field.
     *
     * @return {@code null} if not set, the query string otherwise.
     */
    public String getRefs() {
        return getQueryText(REFS);
    }

    /**
     * Set search string for the {@link #PATH} field.
     *
     * @param path query string to set
     * @return this instance
     */
    public QueryBuilder setPath(String path) {
        return addQueryText(PATH, path);
    }

    /**
     * Get search string for the {@link #PATH} field.
     *
     * @return {@code null} if not set, the query string otherwise.
     */
    public String getPath() {
        return getQueryText(PATH);
    }

    /**
     * Set search string for the {@link #DIRPATH} field. The value stored is
     * the result of {@link #normalizeDirPath(String)}, not {@code path}
     * itself.
     * @param path query string to set
     * @return this instance
     */
    public QueryBuilder setDirPath(String path) {
        String normalizedPath = normalizeDirPath(path);
        return addQueryText(DIRPATH, normalizedPath);
    }

    /**
     * Get search string for the {@link #DIRPATH} field.
     * @return {@code null} if not set; the query string otherwise.
     */
    public String getDirPath() {
        return getQueryText(DIRPATH);
    }

    /**
     * Transform {@code path} to ensure any {@link File#separatorChar} is
     * represented as '/', that there is a trailing '/' if {@code path} is not
     * empty, and then hash using SHA-1 and formatted in a private encoding
     * using only letters [g-v] (two letters per hash byte, one per nibble).
     * @param path a defined value
     * @return a defined, transformed value
     */
    public static String normalizeDirPath(String path) {
        String norm2;
        if (!path.isEmpty()) {
            String norm1 = path.replace(File.separatorChar, '/');
            norm2 = norm1.endsWith("/") ? norm1 : norm1 + "/";
        } else {
            norm2 = path;
        }

        MessageDigest digest;
        try {
            digest = MessageDigest.getInstance(DIRPATH_HASH_ALGORITHM);
        } catch (NoSuchAlgorithmException e) {
            /*
             * This will not happen since "Every implementation of the Java
             * platform is required to support the following standard
             * MessageDigest algorithms: MD5, SHA-1, SHA-256."
             */
            throw new RuntimeException(e);
        }
        byte[] hash = digest.digest(norm2.getBytes(StandardCharsets.UTF_8));

        StringBuilder encodedString = new StringBuilder(hash.length * 2);
        for (byte b : hash) {
            // Map the high and low nibble (0..15) onto 'g'..'v'.
            int v0 = (0xF0 & b) >> 4;
            int v1 = 0xF & b;
            char c0 = (char) ('g' + v0);
            char c1 = (char) ('g' + v1);
            encodedString.append(c0);
            encodedString.append(c1);
        }
        return encodedString.toString();
    }

    /**
     * Set search string for the {@link #HIST} field.
     *
     * @param hist query string to set
     * @return this instance
     */
    public QueryBuilder setHist(String hist) {
        return addQueryText(HIST, hist);
    }

    /**
     * Get search string for the {@link #HIST} field.
     *
     * @return {@code null} if not set, the query string otherwise.
     */
    public String getHist() {
        return getQueryText(HIST);
    }

    /**
     * Set search string for the {@link #TYPE} field.
     *
     * @param type query string to set
     * @return this instance
     */
    public QueryBuilder setType(String type) {
        return addQueryText(TYPE, type);
    }

    /**
     * Get search string for the {@link #TYPE} field.
     *
     * @return {@code null} if not set, the query string otherwise.
     */
    public String getType() {
        return getQueryText(TYPE);
    }

    /**
     * Get a map containing the query text for each of the fields that have been
     * set.
     *
     * @return a possibly empty, unmodifiable map.
     */
    public Map<String, String> getQueries() {
        return Collections.unmodifiableMap(queries);
    }

    /**
     * Gets a list of fields from {@link #getQueries()} which are extracted
     * from source text and which therefore can be used for context
     * presentations -- in the order of most specific to least.
     * <p>
     * An empty list is returned if no query text has been set or if the
     * query text cannot be parsed.
     * @return a defined, possibly-empty list
     */
    public List<String> getContextFields() {
        List<String> fields = new ArrayList<>(queries.size());
        /*
         * setFreetext() allows query fragments that specify a field name with
         * a colon (e.g., "defs:ensure_cache" in the "Full Search" box), so the
         * context fields (i.e., the result of this method) are not just the
         * keys of `queries' but need a full parsing to be determined.
         */
        Query query;
        try {
            query = build();
        } catch (ParseException ex) {
            // An unparsable query has no determinable context fields.
            return fields;
        }
        if (query == null) {
            // build() returns null when no query text has been set.
            return fields;
        }
        String queryString = query.toString("");
        if (queryString.contains(DEFS + ":")) {
            fields.add(DEFS);
        }
        if (queryString.contains(REFS + ":")) {
            fields.add(REFS);
        }
        if (queryString.contains(FULL + ":")) {
            fields.add(FULL);
        }
        return fields;
    }

    /**
     * Get the number of query fields set.
     *
     * @return the current number of fields with a non-empty query string.
     */
    public int getSize() {
        return queries.size();
    }

    /**
     * Used to tell if this search only has the {@link #DEFS} field filled in.
     * Only {@link #FULL}, {@link #REFS}, {@link #PATH}, {@link #HIST},
     * {@link #DIRPATH} and {@link #DEFS} are consulted.
     *
     * @return whether above statement is true or false
     */
    public boolean isDefSearch() {
        return getQueryText(FULL) == null
                && getQueryText(REFS) == null
                && getQueryText(PATH) == null
                && getQueryText(HIST) == null
                && getQueryText(DIRPATH) == null
                && getQueryText(DEFS) != null;
    }

    /**
     * Gets a value indicating if this search only has defined the {@link #PATH}
     * query field. Only {@link #FULL}, {@link #REFS}, {@link #PATH},
     * {@link #HIST}, {@link #DIRPATH} and {@link #DEFS} are consulted.
     *
     * @return {@code true} if {@link #PATH} is set and the other consulted
     * fields are not
     */
    public boolean isPathSearch() {
        return getQueryText(FULL) == null
                && getQueryText(REFS) == null
                && getQueryText(PATH) != null
                && getQueryText(HIST) == null
                && getQueryText(DIRPATH) == null
                && getQueryText(DEFS) == null;
    }

    /**
     * Build a new query based on the query text that has been passed in to this
     * builder.
     *
     * @return a query, or {@code null} if no query text is available.
     * @throws ParseException if the query text cannot be parsed
     */
    public Query build() throws ParseException {
        if (queries.isEmpty()) {
            // We don't have any text to parse
            return null;
        }
        // Parse each of the query texts separately
        List<Query> queryList = new ArrayList<>(queries.size());
        for (Map.Entry<String, String> entry : queries.entrySet()) {
            String field = entry.getKey();
            String queryText = entry.getValue();
            queryList.add(buildQuery(field, escapeQueryString(field, queryText)));
        }
        // If we only have one sub-query, return it directly
        if (queryList.size() == 1) {
            return queryList.get(0);
        }
        // We have multiple subqueries, so let's combine them into a
        // BooleanQuery.
        //
        // If the subquery is a BooleanQuery, we pull out each clause and
        // add it to the outer BooleanQuery so that any negations work on
        // the query as a whole. One exception to this rule: If the query
        // contains one or more Occur.SHOULD clauses and no Occur.MUST
        // clauses, we keep it in a subquery so that the requirement that
        // at least one of the Occur.SHOULD clauses must match is preserved
        // (pulling them out would make all of them optional).
        //
        // All other types of subqueries are added directly to the outer
        // query with Occur.MUST.
        BooleanQuery.Builder combinedQuery = new BooleanQuery.Builder();
        for (Query query : queryList) {
            if (query instanceof BooleanQuery) {
                BooleanQuery boolQuery = (BooleanQuery) query;
                if (hasClause(boolQuery, Occur.SHOULD)
                        && !hasClause(boolQuery, Occur.MUST)) {
                    combinedQuery.add(query, Occur.MUST);
                } else {
                    for (BooleanClause clause : boolQuery) {
                        combinedQuery.add(clause);
                    }
                }
            } else {
                combinedQuery.add(query, Occur.MUST);
            }
        }
        return combinedQuery.build();
    }

    /**
     * Add query text for the specified field. A {@code null} or empty
     * {@code query} removes any text previously set for {@code field}.
     *
     * @param field the field to add query text for
     * @param query the query text to set
     * @return this object
     */
    private QueryBuilder addQueryText(String field, String query) {
        if (query == null || query.isEmpty()) {
            queries.remove(field);
        } else {
            queries.put(field, query);
        }
        return this;
    }

    /**
     * Get the query text stored for the specified field.
     *
     * @param field the field to look up
     * @return the stored query text, or {@code null} if none is set
     */
    private String getQueryText(String field) {
        return queries.get(field);
    }

    /**
     * Escape special characters in a query string.
     *
     * @param field the field for which the query string is provided
     * @param query the query string to escape
     * @return the escaped query string
     */
    private String escapeQueryString(String field, String query) {
        StringReader reader = new StringReader(query);
        StringBuilder res = new StringBuilder();
        switch (field) {
            case FULL:
                FullQueryEscaper fesc = new FullQueryEscaper(reader);
                fesc.setOut(res);
                fesc.consume();
                break;
            case PATH:
                // A path query wrapped in slashes skips the path escaper and
                // is handled by the default escaper below.
                if (!(query.startsWith("/") && query.endsWith("/"))) {
                    PathQueryEscaper pesc = new PathQueryEscaper(reader);
                    pesc.setOut(res);
                    pesc.consume();
                    break;
                }
                // FALLTHROUGH
            default:
                DefaultQueryEscaper desc = new DefaultQueryEscaper(reader);
                desc.setOut(res);
                desc.consume();
        }
        return res.toString();
    }

    /**
     * Build a subquery against one of the fields.
     *
     * @param field the field to build the query against
     * @param queryText the query text
     * @return a parsed query
     * @throws ParseException if the query text cannot be parsed
     */
    protected Query buildQuery(String field, String queryText)
            throws ParseException {
        return new CustomQueryParser(field).parse(queryText);
    }

    /**
     * Check if a BooleanQuery contains a clause of a given occur type.
     *
     * @param query the query to check
     * @param occur the occur type to check for
     * @return whether or not the query contains a clause of the specified type
     */
    private boolean hasClause(BooleanQuery query, Occur occur) {
        for (BooleanClause clause : query) {
            if (clause.getOccur().equals(occur)) {
                return true;
            }
        }
        return false;
    }
}