xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/search/QueryBuilder.java (revision 0e4c55544f8ea0a68e8bae37b0e502097e008ec1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * See LICENSE.txt included in this distribution for the specific
9  * language governing permissions and limitations under the License.
10  *
11  * When distributing Covered Code, include this CDDL HEADER in each
12  * file and include the License file at LICENSE.txt.
13  * If applicable, add the following below this CDDL HEADER, with the
14  * fields enclosed by brackets "[]" replaced with your own identifying
15  * information: Portions Copyright [yyyy] [name of copyright owner]
16  *
17  * CDDL HEADER END
18  */
19 
20 /*
21  * Copyright (c) 2010, 2019, Oracle and/or its affiliates. All rights reserved.
22  * Portions Copyright (c) 2011, Jens Elkner.
23  * Portions Copyright (c) 2017, 2020, Chris Fraire <cfraire@me.com>.
24  */
25 package org.opengrok.indexer.search;
26 
27 import java.io.File;
28 import java.io.StringReader;
29 import java.nio.charset.StandardCharsets;
30 import java.security.MessageDigest;
31 import java.security.NoSuchAlgorithmException;
32 import java.util.ArrayList;
33 import java.util.Arrays;
34 import java.util.Collections;
35 import java.util.HashSet;
36 import java.util.List;
37 import java.util.Map;
38 import java.util.TreeMap;
39 import org.apache.lucene.queryparser.classic.ParseException;
40 import org.apache.lucene.search.BooleanClause;
41 import org.apache.lucene.search.BooleanClause.Occur;
42 import org.apache.lucene.search.BooleanQuery;
43 import org.apache.lucene.search.Query;
44 
45 /**
46  * Helper class that builds a Lucene query based on provided search terms for
47  * the different fields.
48  */
49 public class QueryBuilder {
50 
51     /**
52      * Fields we use in lucene: public ones.
53      */
54     public static final String FULL = "full";
55     public static final String DEFS = "defs";
56     public static final String REFS = "refs";
57     public static final String PATH = "path";
58     public static final String HIST = "hist";
59     public static final String TYPE = "type";
60     public static final String SCOPES = "scopes";
61     public static final String NUML = "numl";
62     public static final String LOC = "loc";
63     public static final String LASTREV = "lastrev"; // last revision
64     /**
65      * Fields we use in Lucene: internal ones.
66      */
67     public static final String D = "d"; // Directory key
68     public static final String U = "u"; // File and timestamp key
69     public static final String TAGS = "tags";
70     public static final String T = "t";
71     public static final String FULLPATH = "fullpath";
72     public static final String DIRPATH = "dirpath";
73     public static final String PROJECT = "project";
74     public static final String DATE = "date";
75     public static final String OBJUID = "objuid"; // object UID
76     public static final String OBJSER = "objser"; // object serialized
77     public static final String OBJVER = "objver"; // object version
78 
79     protected static final List<String> searchFields = Arrays.asList(FULL, DEFS, REFS, PATH, HIST);
80     private static final HashSet<String> searchFieldsSet = new HashSet<>(searchFields);
81 
82     /** Used for paths, so SHA-1 is completely sufficient. */
83     private static final String DIRPATH_HASH_ALGORITHM = "SHA-1";
84 
85     /**
86      * A map containing the query text for each field. (We use a sorted map here
87      * only because we have tests that check the generated query string. If we
88      * had used a hash map, the order of the terms could have varied between
89      * platforms and it would be harder to test.)
90      */
91     private final Map<String, String> queries = new TreeMap<>();
92 
getSearchFields()93     public static List<String> getSearchFields() {
94         return Collections.unmodifiableList(searchFields);
95     }
96 
97     /**
98      * Gets a value indicating if the specified {@code fieldName} is a valid
99      * search field.
100      */
isSearchField(String fieldName)101     public static boolean isSearchField(String fieldName) {
102         return searchFieldsSet.contains(fieldName);
103     }
104 
105     /**
106      * Sets the instance to the state of {@code other}.
107      * @param other a defined instance
108      * @return {@code this}
109      */
reset(QueryBuilder other)110     public QueryBuilder reset(QueryBuilder other) {
111         if (other == null) {
112             throw new IllegalArgumentException("other is null");
113         }
114         if (this != other) {
115             queries.clear();
116             queries.putAll(other.queries);
117         }
118         return this;
119     }
120 
121     /**
122      * Set search string for the {@link #FULL} field.
123      *
124      * @param freetext query string to set
125      * @return this instance
126      */
setFreetext(String freetext)127     public QueryBuilder setFreetext(String freetext) {
128         return addQueryText(FULL, freetext);
129     }
130 
131     /**
132      * Get search string for the {@link #FULL} field.
133      *
134      * @return {@code null} if not set, the query string otherwise.
135      */
getFreetext()136     public String getFreetext() {
137         return getQueryText(FULL);
138     }
139 
140     /**
141      * Set search string for the {@link #DEFS} field.
142      *
143      * @param defs query string to set
144      * @return this instance
145      */
setDefs(String defs)146     public QueryBuilder setDefs(String defs) {
147         return addQueryText(DEFS, defs);
148     }
149 
150     /**
151      * Get search string for the {@link #FULL} field.
152      *
153      * @return {@code null} if not set, the query string otherwise.
154      */
getDefs()155     public String getDefs() {
156         return getQueryText(DEFS);
157     }
158 
159     /**
160      * Set search string for the {@link #REFS} field.
161      *
162      * @param refs query string to set
163      * @return this instance
164      */
setRefs(String refs)165     public QueryBuilder setRefs(String refs) {
166         return addQueryText(REFS, refs);
167     }
168 
169     /**
170      * Get search string for the {@link #REFS} field.
171      *
172      * @return {@code null} if not set, the query string otherwise.
173      */
getRefs()174     public String getRefs() {
175         return getQueryText(REFS);
176     }
177 
178     /**
179      * Set search string for the {@link #PATH} field.
180      *
181      * @param path query string to set
182      * @return this instance
183      */
setPath(String path)184     public QueryBuilder setPath(String path) {
185         return addQueryText(PATH, path);
186     }
187 
188     /**
189      * Get search string for the {@link #PATH} field.
190      *
191      * @return {@code null} if not set, the query string otherwise.
192      */
getPath()193     public String getPath() {
194         return getQueryText(PATH);
195     }
196 
197     /**
198      * Set search string for the {@link #DIRPATH} field.
199      * @param path query string to set
200      * @return this instance
201      */
setDirPath(String path)202     public QueryBuilder setDirPath(String path) {
203         String normalizedPath = normalizeDirPath(path);
204         return addQueryText(DIRPATH, normalizedPath);
205     }
206 
207     /**
208      * Get search string for the {@link #DIRPATH} field.
209      * @return {@code null} if not set; the query string otherwise.
210      */
getDirPath()211     public String getDirPath() {
212         return getQueryText(DIRPATH);
213     }
214 
215     /**
216      * Transform {@code path} to ensure any {@link File#separatorChar} is
217      * represented as '/', that there is a trailing '/' if {@code path} is not
218      * empty, and then hash using SHA-1 and formatted in a private encoding
219      * using only letters [g-u].
220      * @param path a defined value
221      * @return a defined, transformed value
222      */
normalizeDirPath(String path)223     public static String normalizeDirPath(String path) {
224         String norm2;
225         if (path.length() > 0) {
226             String norm1 = path.replace(File.separatorChar, '/');
227             norm2 = norm1.endsWith("/") ? norm1 : norm1 + "/";
228         } else {
229             norm2 = path;
230         }
231 
232         MessageDigest digest;
233         try {
234             digest = MessageDigest.getInstance(DIRPATH_HASH_ALGORITHM);
235         } catch (NoSuchAlgorithmException e) {
236             /*
237              * This will not happen since "Every implementation of the Java
238              * platform is required to support the following standard
239              * MessageDigest algorithms: MD5, SHA-1, SHA-256."
240              */
241             throw new RuntimeException(e);
242         }
243         byte[] hash = digest.digest(norm2.getBytes(StandardCharsets.UTF_8));
244 
245         StringBuilder encodedString = new StringBuilder();
246         for (byte b : hash) {
247             int v0 = (0xF0 & b) >> 4;
248             int v1 = 0xF & b;
249             char c0 = (char) ('g' + v0);
250             char c1 = (char) ('g' + v1);
251             encodedString.append(c0);
252             encodedString.append(c1);
253         }
254         return encodedString.toString();
255     }
256 
257     /**
258      * Set search string for the {@link #HIST} field.
259      *
260      * @param hist query string to set
261      * @return this instance
262      */
setHist(String hist)263     public QueryBuilder setHist(String hist) {
264         return addQueryText(HIST, hist);
265     }
266 
267     /**
268      * Get search string for the {@link #HIST} field.
269      *
270      * @return {@code null} if not set, the query string otherwise.
271      */
getHist()272     public String getHist() {
273         return getQueryText(HIST);
274     }
275 
276     /**
277      * Set search string for the {@link #TYPE} field.
278      *
279      * @param type query string to set
280      * @return this instance
281      */
setType(String type)282     public QueryBuilder setType(String type) {
283         return addQueryText(TYPE, type);
284     }
285 
286     /**
287      * Get search string for the {@link #TYPE} field.
288      *
289      * @return {@code null} if not set, the query string otherwise.
290      */
getType()291     public String getType() {
292         return getQueryText(TYPE);
293     }
294 
295     /**
296      * Get a map containing the query text for each of the fields that have been
297      * set.
298      *
299      * @return a possible empty map.
300      */
getQueries()301     public Map<String, String> getQueries() {
302         return Collections.unmodifiableMap(queries);
303     }
304 
305     /**
306      * Gets a list of fields from {@link #getQueries()} which are extracted
307      * from source text and which therefore can be used for context
308      * presentations -- in the order of most specific to least.
309      * @return a defined, possibly-empty list
310      */
getContextFields()311     public List<String> getContextFields() {
312         List<String> fields = new ArrayList<>(queries.size());
313         /**
314          * setFreetext() allows query fragments that specify a field name with
315          * a colon (e.g., "defs:ensure_cache" in the "Full Search" box), so the
316          * context fields (i.e., the result of this method) are not just the
317          * keys of `queries' but need a full parsing to be determined.
318          */
319         Query query;
320         try {
321             query = build();
322         } catch (ParseException ex) {
323             return fields;
324         }
325         String queryString = query.toString("");
326         if (queryString.contains(DEFS + ":")) {
327             fields.add(DEFS);
328         }
329         if (queryString.contains(REFS + ":")) {
330             fields.add(REFS);
331         }
332         if (queryString.contains(FULL + ":")) {
333             fields.add(FULL);
334         }
335         return fields;
336     }
337 
338     /**
339      * Get the number of query fields set.
340      *
341      * @return the current number of fields with a none-empty query string.
342      */
getSize()343     public int getSize() {
344         return queries.size();
345     }
346 
347     /**
348      * Used to tell if this search only has the {@link #DEFS} field filled in.
349      *
350      * @return whether above statement is true or false
351      */
isDefSearch()352     public boolean isDefSearch() {
353 
354         return ((getQueryText(FULL) == null)
355                 && (getQueryText(REFS) == null)
356                 && (getQueryText(PATH) == null)
357                 && (getQueryText(HIST) == null)
358                 && (getQueryText(DIRPATH) == null)
359                 && (getQueryText(DEFS) != null));
360     }
361 
362     /**
363      * Gets a value indicating if this search only has defined the {@link #PATH}
364      * query field.
365      */
isPathSearch()366     public boolean isPathSearch() {
367         return ((getQueryText(FULL) == null)
368                 && (getQueryText(REFS) == null)
369                 && (getQueryText(PATH) != null)
370                 && (getQueryText(HIST) == null)
371                 && (getQueryText(DIRPATH) == null)
372                 && (getQueryText(DEFS) == null));
373     }
374 
375     /**
376      * Build a new query based on the query text that has been passed in to this
377      * builder.
378      *
379      * @return a query, or {@code null} if no query text is available.
380      * @throws ParseException if the query text cannot be parsed
381      */
build()382     public Query build() throws ParseException {
383         if (queries.isEmpty()) {
384             // We don't have any text to parse
385             return null;
386         }
387         // Parse each of the query texts separately
388         ArrayList<Query> queryList = new ArrayList<>(queries.size());
389         for (Map.Entry<String, String> entry : queries.entrySet()) {
390             String field = entry.getKey();
391             String queryText = entry.getValue();
392             queryList.add(buildQuery(field, escapeQueryString(field, queryText)));
393         }
394         // If we only have one sub-query, return it directly
395         if (queryList.size() == 1) {
396             return queryList.get(0);
397         }
398         // We have multiple subqueries, so let's combine them into a
399         // BooleanQuery.
400         //
401         // If the subquery is a BooleanQuery, we pull out each clause and
402         // add it to the outer BooleanQuery so that any negations work on
403         // the query as a whole. One exception to this rule: If the query
404         // contains one or more Occur.SHOULD clauses and no Occur.MUST
405         // clauses, we keep it in a subquery so that the requirement that
406         // at least one of the Occur.SHOULD clauses must match (pulling them
407         // out would make all of them optional).
408         //
409         // All other types of subqueries are added directly to the outer
410         // query with Occur.MUST.
411         BooleanQuery.Builder combinedQuery = new BooleanQuery.Builder();
412         for (Query query : queryList) {
413             if (query instanceof BooleanQuery) {
414                 BooleanQuery boolQuery = (BooleanQuery) query;
415                 if (hasClause(boolQuery, Occur.SHOULD)
416                         && !hasClause(boolQuery, Occur.MUST)) {
417                     combinedQuery.add(query, Occur.MUST);
418                 } else {
419                     for (BooleanClause clause : boolQuery) {
420                         combinedQuery.add(clause);
421                     }
422                 }
423             } else {
424                 combinedQuery.add(query, Occur.MUST);
425             }
426         }
427         return combinedQuery.build();
428     }
429 
430     /**
431      * Add query text for the specified field.
432      *
433      * @param field the field to add query text for
434      * @param query the query text to set
435      * @return this object
436      */
addQueryText(String field, String query)437     private QueryBuilder addQueryText(String field, String query) {
438         if (query == null || query.isEmpty()) {
439             queries.remove(field);
440         } else {
441             queries.put(field, query);
442         }
443         return this;
444     }
445 
getQueryText(String field)446     private String getQueryText(String field) {
447         return queries.get(field);
448     }
449 
450     /**
451      * Escape special characters in a query string.
452      *
453      * @param field the field for which the query string is provided
454      * @param query the query string to escape
455      * @return the escaped query string
456      */
escapeQueryString(String field, String query)457     private String escapeQueryString(String field, String query) {
458         StringReader reader = new StringReader(query);
459         StringBuilder res = new StringBuilder();
460         switch (field) {
461             case FULL:
462                 FullQueryEscaper fesc = new FullQueryEscaper(reader);
463                 fesc.setOut(res);
464                 fesc.consume();
465                 break;
466             case PATH:
467                 if (!(query.startsWith("/") && query.endsWith("/"))) {
468                     PathQueryEscaper pesc = new PathQueryEscaper(reader);
469                     pesc.setOut(res);
470                     pesc.consume();
471                     break;
472                 }
473                 // FALLTHROUGH
474             default:
475                 DefaultQueryEscaper desc = new DefaultQueryEscaper(reader);
476                 desc.setOut(res);
477                 desc.consume();
478         }
479         return res.toString();
480     }
481 
482     /**
483      * Build a subquery against one of the fields.
484      *
485      * @param field the field to build the query against
486      * @param queryText the query text
487      * @return a parsed query
488      * @throws ParseException if the query text cannot be parsed
489      */
buildQuery(String field, String queryText)490     protected Query buildQuery(String field, String queryText)
491             throws ParseException {
492         return new CustomQueryParser(field).parse(queryText);
493     }
494 
495     /**
496      * Check if a BooleanQuery contains a clause of a given occur type.
497      *
498      * @param query the query to check
499      * @param occur the occur type to check for
500      * @return whether or not the query contains a clause of the specified type
501      */
hasClause(BooleanQuery query, Occur occur)502     private boolean hasClause(BooleanQuery query, Occur occur) {
503         for (BooleanClause clause : query) {
504             if (clause.getOccur().equals(occur)) {
505                 return true;
506             }
507         }
508         return false;
509     }
510 }
511