xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java (revision 750b3115a5b8976536ee4dccce497eb97b7a4c9b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * See LICENSE.txt included in this distribution for the specific
9  * language governing permissions and limitations under the License.
10  *
11  * When distributing Covered Code, include this CDDL HEADER in each
12  * file and include the License file at LICENSE.txt.
13  * If applicable, add the following below this CDDL HEADER, with the
14  * fields enclosed by brackets "[]" replaced with your own identifying
15  * information: Portions Copyright [yyyy] [name of copyright owner]
16  *
17  * CDDL HEADER END
18  */
19 
20 /*
21  * Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.
22  * Portions Copyright (c) 2017, 2021, Chris Fraire <cfraire@me.com>.
23  */
24 package org.opengrok.indexer.analysis;
25 
26 import java.io.BufferedReader;
27 import java.io.File;
28 import java.io.FileWriter;
29 import java.io.IOException;
30 import java.io.InputStream;
31 import java.io.InputStreamReader;
32 import java.io.Reader;
33 import java.io.Writer;
34 import java.lang.reflect.InvocationTargetException;
35 import java.nio.charset.StandardCharsets;
36 import java.nio.file.Files;
37 import java.util.ArrayList;
38 import java.util.Arrays;
39 import java.util.Collections;
40 import java.util.Comparator;
41 import java.util.HashMap;
42 import java.util.List;
43 import java.util.Locale;
44 import java.util.Map;
45 import java.util.Objects;
46 import java.util.SortedMap;
47 import java.util.TreeMap;
48 import java.util.TreeSet;
49 import java.util.logging.Level;
50 import java.util.logging.Logger;
51 import org.apache.lucene.document.DateTools;
52 import org.apache.lucene.document.Document;
53 import org.apache.lucene.document.Field;
54 import org.apache.lucene.document.Field.Store;
55 import org.apache.lucene.document.FieldType;
56 import org.apache.lucene.document.SortedDocValuesField;
57 import org.apache.lucene.document.StringField;
58 import org.apache.lucene.document.TextField;
59 import org.apache.lucene.util.BytesRef;
60 import org.opengrok.indexer.analysis.FileAnalyzerFactory.Matcher;
61 import org.opengrok.indexer.analysis.ada.AdaAnalyzerFactory;
62 import org.opengrok.indexer.analysis.archive.BZip2AnalyzerFactory;
63 import org.opengrok.indexer.analysis.archive.GZIPAnalyzerFactory;
64 import org.opengrok.indexer.analysis.archive.TarAnalyzerFactory;
65 import org.opengrok.indexer.analysis.archive.ZipAnalyzerFactory;
66 import org.opengrok.indexer.analysis.asm.AsmAnalyzerFactory;
67 import org.opengrok.indexer.analysis.c.CAnalyzerFactory;
68 import org.opengrok.indexer.analysis.c.CxxAnalyzerFactory;
69 import org.opengrok.indexer.analysis.clojure.ClojureAnalyzerFactory;
70 import org.opengrok.indexer.analysis.csharp.CSharpAnalyzerFactory;
71 import org.opengrok.indexer.analysis.data.IgnorantAnalyzerFactory;
72 import org.opengrok.indexer.analysis.data.ImageAnalyzerFactory;
73 import org.opengrok.indexer.analysis.document.MandocAnalyzerFactory;
74 import org.opengrok.indexer.analysis.document.TroffAnalyzerFactory;
75 import org.opengrok.indexer.analysis.eiffel.EiffelAnalyzerFactory;
76 import org.opengrok.indexer.analysis.erlang.ErlangAnalyzerFactory;
77 import org.opengrok.indexer.analysis.executables.ELFAnalyzerFactory;
78 import org.opengrok.indexer.analysis.executables.JarAnalyzerFactory;
79 import org.opengrok.indexer.analysis.executables.JavaClassAnalyzerFactory;
80 import org.opengrok.indexer.analysis.fortran.FortranAnalyzerFactory;
81 import org.opengrok.indexer.analysis.golang.GolangAnalyzerFactory;
82 import org.opengrok.indexer.analysis.haskell.HaskellAnalyzerFactory;
83 import org.opengrok.indexer.analysis.hcl.HCLAnalyzerFactory;
84 import org.opengrok.indexer.analysis.java.JavaAnalyzerFactory;
85 import org.opengrok.indexer.analysis.javascript.JavaScriptAnalyzerFactory;
86 import org.opengrok.indexer.analysis.json.JsonAnalyzerFactory;
87 import org.opengrok.indexer.analysis.kotlin.KotlinAnalyzerFactory;
88 import org.opengrok.indexer.analysis.lisp.LispAnalyzerFactory;
89 import org.opengrok.indexer.analysis.lua.LuaAnalyzerFactory;
90 import org.opengrok.indexer.analysis.pascal.PascalAnalyzerFactory;
91 import org.opengrok.indexer.analysis.perl.PerlAnalyzerFactory;
92 import org.opengrok.indexer.analysis.php.PhpAnalyzerFactory;
93 import org.opengrok.indexer.analysis.plain.PlainAnalyzerFactory;
94 import org.opengrok.indexer.analysis.plain.XMLAnalyzerFactory;
95 import org.opengrok.indexer.analysis.powershell.PowershellAnalyzerFactory;
96 import org.opengrok.indexer.analysis.python.PythonAnalyzerFactory;
97 import org.opengrok.indexer.analysis.r.RAnalyzerFactory;
98 import org.opengrok.indexer.analysis.ruby.RubyAnalyzerFactory;
99 import org.opengrok.indexer.analysis.rust.RustAnalyzerFactory;
100 import org.opengrok.indexer.analysis.scala.ScalaAnalyzerFactory;
101 import org.opengrok.indexer.analysis.sh.ShAnalyzerFactory;
102 import org.opengrok.indexer.analysis.sql.PLSQLAnalyzerFactory;
103 import org.opengrok.indexer.analysis.sql.SQLAnalyzerFactory;
104 import org.opengrok.indexer.analysis.swift.SwiftAnalyzerFactory;
105 import org.opengrok.indexer.analysis.tcl.TclAnalyzerFactory;
106 import org.opengrok.indexer.analysis.terraform.TerraformAnalyzerFactory;
107 import org.opengrok.indexer.analysis.typescript.TypeScriptAnalyzerFactory;
108 import org.opengrok.indexer.analysis.uue.UuencodeAnalyzerFactory;
109 import org.opengrok.indexer.analysis.vb.VBAnalyzerFactory;
110 import org.opengrok.indexer.analysis.verilog.VerilogAnalyzerFactory;
111 import org.opengrok.indexer.configuration.Project;
112 import org.opengrok.indexer.configuration.RuntimeEnvironment;
113 import org.opengrok.indexer.history.Annotation;
114 import org.opengrok.indexer.history.History;
115 import org.opengrok.indexer.history.HistoryEntry;
116 import org.opengrok.indexer.history.HistoryException;
117 import org.opengrok.indexer.history.HistoryGuru;
118 import org.opengrok.indexer.history.HistoryReader;
119 import org.opengrok.indexer.logger.LoggerFactory;
120 import org.opengrok.indexer.search.QueryBuilder;
121 import org.opengrok.indexer.util.IOUtils;
122 import org.opengrok.indexer.web.Util;
123 
124 /**
125  * Manages and provides Analyzers as needed. Please see
126  * <a href="https://github.com/oracle/opengrok/wiki/Internals">
127  * this</a> page for a great description of the purpose of the AnalyzerGuru.
128  *
129  * Created on September 22, 2005
130  *
131  * @author Chandan
132  */
133 public class AnalyzerGuru {
134 
135     /**
136      * The maximum number of characters (multi-byte if a BOM is identified) to
137      * read from the input stream to be used for magic string matching.
138      */
139     private static final int OPENING_MAX_CHARS = 100;
140 
141     /**
142      * Set to 16K -- though debugging shows it would do with only 8K+3
143      * (standard buffer for Java BufferedInputStream plus 3 bytes for largest UTF BOM).
144      */
145     private static final int MARK_READ_LIMIT = 1024 * 16;
146 
147     /**
148      * The number of bytes read from the start of the file for magic number or
149      * string analysis. Some {@link FileAnalyzerFactory.Matcher}
150      * implementations may read more data subsequently, but this field defines
151      * the number of bytes initially read for general matching.
152      */
153     private static final int MAGIC_BYTES_NUM = 8;
154 
155     private static final Logger LOGGER = LoggerFactory.getLogger(AnalyzerGuru.class);
156 
157     /**
158      * The default {@code FileAnalyzerFactory} instance.
159      */
160     private static final AnalyzerFactory DEFAULT_ANALYZER_FACTORY = new FileAnalyzerFactory();
161 
162     /**
163      * Map from file names to analyzer factories.
164      */
165     private static final Map<String, AnalyzerFactory> FILE_NAMES = new HashMap<>();
166 
167     /**
168      * Map from file extensions to analyzer factories.
169      */
170     private static final Map<String, AnalyzerFactory> ext = new HashMap<>();
171 
172     /**
173      * Map from file prefixes to analyzer factories.
174      */
175     private static final Map<String, AnalyzerFactory> pre = new HashMap<>();
176 
177     /**
178      * Appended when
179      * {@link #addExtension(java.lang.String, AnalyzerFactory)}
180      * or
181      * {@link #addPrefix(java.lang.String, AnalyzerFactory)}
182      * are called to augment the value in {@link #getVersionNo()}.
183      */
184     private static final TreeSet<String> CUSTOMIZATION_KEYS = new TreeSet<>();
185 
186     private static int customizationHashCode;
187 
188     /**
189      * Descending string length comparator for magics.
190      */
191     private static final Comparator<String> descStrlenComparator = (s1, s2) -> {
192         // DESC: s2 length <=> s1 length
193         int cmp = Integer.compare(s2.length(), s1.length());
194         if (cmp != 0) {
195             return cmp;
196         }
197 
198         // the Comparator must also be "consistent with equals", so check
199         // string contents too when (length)cmp == 0. (ASC: s1 <=> s2.)
200         cmp = s1.compareTo(s2);
201         return cmp;
202     };
203 
204     /**
205      * Map from magic strings to analyzer factories.
206      */
207     private static final SortedMap<String, AnalyzerFactory> magics =
208         new TreeMap<>(descStrlenComparator);
209 
210     /**
211      * List of matcher objects which can be used to determine which analyzer
212      * factory to use.
213      */
214     private static final List<FileAnalyzerFactory.Matcher> matchers = new ArrayList<>();
215 
216     /**
217      * List of all registered {@code FileAnalyzerFactory} instances.
218      */
219     private static final List<AnalyzerFactory> factories = new ArrayList<>();
220 
221     /**
222      * Names of all analysis packages.
223      */
224     private static final List<String> analysisPkgNames = new ArrayList<>();
225 
226     public static final FieldType string_ft_stored_nanalyzed_norms = new FieldType(StringField.TYPE_STORED);
227     public static final FieldType string_ft_nstored_nanalyzed_norms = new FieldType(StringField.TYPE_NOT_STORED);
228 
229     private static final Map<String, String> fileTypeDescriptions = new TreeMap<>();
230 
231     /**
232      * Maps from {@link FileAnalyzer#getFileTypeName()} to
233      * {@link FileAnalyzerFactory}.
234      */
235     private static final Map<String, AnalyzerFactory> FILETYPE_FACTORIES =
236             new HashMap<>();
237 
238     /**
239      * Maps from {@link FileAnalyzer#getFileTypeName()} to
240      * {@link FileAnalyzer#getVersionNo()}.
241      */
242     private static final Map<String, Long> ANALYZER_VERSIONS = new HashMap<>();
243 
244     private static final LangTreeMap langMap = new LangTreeMap();
245     private static final LangTreeMap defaultLangMap = new LangTreeMap();
246 
247     /*
248      * If you write your own analyzer please register it here. The order is
249      * important for any factory that uses a FileAnalyzerFactory.Matcher
250      * implementation, as those are run in the same order as defined below --
251      * though precise Matchers are run before imprecise ones.
252      */
253     static {
254         try {
255             AnalyzerFactory[] analyzers = {
256                 DEFAULT_ANALYZER_FACTORY,
257                 new IgnorantAnalyzerFactory(),
258                 new BZip2AnalyzerFactory(),
259                 new XMLAnalyzerFactory(),
260                 MandocAnalyzerFactory.DEFAULT_INSTANCE,
261                 TroffAnalyzerFactory.DEFAULT_INSTANCE,
262                 new ELFAnalyzerFactory(),
263                 JavaClassAnalyzerFactory.DEFAULT_INSTANCE,
264                 new ImageAnalyzerFactory(),
265                 JarAnalyzerFactory.DEFAULT_INSTANCE,
266                 ZipAnalyzerFactory.DEFAULT_INSTANCE,
267                 new TarAnalyzerFactory(),
268                 new CAnalyzerFactory(),
269                 new CSharpAnalyzerFactory(),
270                 new VBAnalyzerFactory(),
271                 new CxxAnalyzerFactory(),
272                 new ErlangAnalyzerFactory(),
273                 new ShAnalyzerFactory(),
274                 new PowershellAnalyzerFactory(),
275                 new UuencodeAnalyzerFactory(),
276                 new GZIPAnalyzerFactory(),
277                 new JavaAnalyzerFactory(),
278                 new JavaScriptAnalyzerFactory(),
279                 new KotlinAnalyzerFactory(),
280                 new SwiftAnalyzerFactory(),
281                 new JsonAnalyzerFactory(),
282                 new PythonAnalyzerFactory(),
283                 new RustAnalyzerFactory(),
284                 new PerlAnalyzerFactory(),
285                 new PhpAnalyzerFactory(),
286                 new LispAnalyzerFactory(),
287                 new TclAnalyzerFactory(),
288                 new ScalaAnalyzerFactory(),
289                 new ClojureAnalyzerFactory(),
290                 new SQLAnalyzerFactory(),
291                 new PLSQLAnalyzerFactory(),
292                 new FortranAnalyzerFactory(),
293                 new HaskellAnalyzerFactory(),
294                 new GolangAnalyzerFactory(),
295                 new LuaAnalyzerFactory(),
296                 new PascalAnalyzerFactory(),
297                 new AdaAnalyzerFactory(),
298                 new RubyAnalyzerFactory(),
299                 new EiffelAnalyzerFactory(),
300                 new VerilogAnalyzerFactory(),
301                 new TypeScriptAnalyzerFactory(),
302                 new AsmAnalyzerFactory(),
303                 new HCLAnalyzerFactory(),
304                 new TerraformAnalyzerFactory(),
305                 new RAnalyzerFactory(),
306                 // Keep PlainAnalyzer last, with its lone, quite fuzzy matcher.
307                 PlainAnalyzerFactory.DEFAULT_INSTANCE
308             };
309 
310             for (AnalyzerFactory analyzer : analyzers) {
311                 registerAnalyzer(analyzer);
312             }
313 
314             for (AnalyzerFactory analyzer : analyzers) {
315                 if (analyzer.getName() != null && !analyzer.getName().isEmpty()) {
analyzer.getName()316                     fileTypeDescriptions.put(analyzer.getAnalyzer().getFileTypeName(), analyzer.getName());
317                 }
318             }
319 
320             string_ft_stored_nanalyzed_norms.setOmitNorms(false);
321             string_ft_nstored_nanalyzed_norms.setOmitNorms(false);
322         } catch (Throwable t) {
323             LOGGER.log(Level.SEVERE,
324                     "exception hit when constructing AnalyzerGuru static", t);
325             throw t;
326         }
327     }
328 
329     /**
330      * Gets a version number to be used to tag documents examined by the guru so
331      * that {@link AbstractAnalyzer} selection can be re-done later if a stored
332      * version number is different from the current implementation or if guru
333      * factory registrations are modified by the user to change the guru
334      * operation.
335      * <p>
336      * The static part of the version is bumped in a release when e.g. new
337      * {@link FileAnalyzerFactory} subclasses are registered or when existing
338      * {@link FileAnalyzerFactory} subclasses are revised to target more or
339      * different files.
340      * @return a value whose lower 32-bits are a static value
341      * 20201003_00
342      * for the current implementation and whose higher-32 bits are non-zero if
343      * {@link #addExtension(java.lang.String, AnalyzerFactory)}
344      * or
345      * {@link #addPrefix(java.lang.String, AnalyzerFactory)}
346      * has been called.
347      */
getVersionNo()348     public static long getVersionNo() {
349         final int ver32 = 20201003_00; // Edit comment above too!
350         long ver = ver32;
351         if (customizationHashCode != 0) {
352             ver |= (long) customizationHashCode << 32;
353         }
354         return ver;
355     }
356 
357     /**
358      * Gets a version number according to a registered
359      * {@link FileAnalyzer#getVersionNo()} for a {@code fileTypeName} according
360      * to {@link FileAnalyzer#getFileTypeName()}.
361      * @param fileTypeName a defined instance
362      * @return a registered value or {@link Long#MIN_VALUE} if
363      * {@code fileTypeName} is unknown
364      */
getAnalyzerVersionNo(String fileTypeName)365     public static long getAnalyzerVersionNo(String fileTypeName) {
366         return ANALYZER_VERSIONS.getOrDefault(fileTypeName, Long.MIN_VALUE);
367     }
368 
getAnalyzersVersionNos()369     public static Map<String, Long> getAnalyzersVersionNos() {
370         return Collections.unmodifiableMap(ANALYZER_VERSIONS);
371     }
372 
getExtensionsMap()373     public static Map<String, AnalyzerFactory> getExtensionsMap() {
374         return Collections.unmodifiableMap(ext);
375     }
376 
getPrefixesMap()377     public static Map<String, AnalyzerFactory> getPrefixesMap() {
378         return Collections.unmodifiableMap(pre);
379     }
380 
getMagicsMap()381     public static Map<String, AnalyzerFactory> getMagicsMap() {
382         return Collections.unmodifiableMap(magics);
383     }
384 
getAnalyzerFactoryMatchers()385     public static List<Matcher> getAnalyzerFactoryMatchers() {
386         return Collections.unmodifiableList(matchers);
387     }
388 
getfileTypeDescriptions()389     public static Map<String, String> getfileTypeDescriptions() {
390         return Collections.unmodifiableMap(fileTypeDescriptions);
391     }
392 
getAnalyzerFactories()393     public static List<AnalyzerFactory> getAnalyzerFactories() {
394         return Collections.unmodifiableList(factories);
395     }
396 
397     private static final String USED_IN_MULTIPLE_MSG = "' used in multiple analyzers";
398 
399     /**
400      * Register a {@code FileAnalyzerFactory} instance.
401      */
registerAnalyzer(AnalyzerFactory factory)402     private static void registerAnalyzer(AnalyzerFactory factory) {
403         for (String name : factory.getFileNames()) {
404             AnalyzerFactory old = FILE_NAMES.put(name, factory);
405             assert old == null :
406                     "name '" + name + USED_IN_MULTIPLE_MSG;
407         }
408         for (String prefix : factory.getPrefixes()) {
409             AnalyzerFactory old = pre.put(prefix, factory);
410             assert old == null :
411                     "prefix '" + prefix + USED_IN_MULTIPLE_MSG;
412         }
413         for (String suffix : factory.getSuffixes()) {
414             AnalyzerFactory old = ext.put(suffix, factory);
415             assert old == null :
416                     "suffix '" + suffix + USED_IN_MULTIPLE_MSG;
417         }
418         for (String magic : factory.getMagicStrings()) {
419             AnalyzerFactory old = magics.put(magic, factory);
420             assert old == null :
421                     "magic '" + magic + USED_IN_MULTIPLE_MSG;
422         }
423         matchers.addAll(factory.getMatchers());
424         factories.add(factory);
425 
426         AbstractAnalyzer fa = factory.getAnalyzer();
427         String fileTypeName = fa.getFileTypeName();
428         FILETYPE_FACTORIES.put(fileTypeName, factory);
429         ANALYZER_VERSIONS.put(fileTypeName, fa.getVersionNo());
430 
431         // Possibly configure default LANG mappings for the factory.
432         String ctagsLang = factory.getAnalyzer().getCtagsLang();
433         if (ctagsLang != null) {
434             List<String> prefixes = factory.getPrefixes();
435             if (prefixes != null) {
436                 for (String prefix : prefixes) {
437                     defaultLangMap.add(prefix, ctagsLang);
438                 }
439             }
440 
441             List<String> suffixes = factory.getSuffixes();
442             if (suffixes != null) {
443                 for (String suffix : suffixes) {
444                     // LangMap needs a "." to signify a file extension.
445                     defaultLangMap.add("." + suffix, ctagsLang);
446                 }
447             }
448         }
449     }
450 
451     /**
452      * Instruct the AnalyzerGuru to use a given analyzer for a given file
453      * prefix.
454      *
455      * @param prefix the file prefix to add
456      * @param factory a factory which creates the analyzer to use for the given
457      * extension (if you pass null as the analyzer, you will disable the
458      * analyzer used for that extension)
459      */
addPrefix(String prefix, AnalyzerFactory factory)460     public static void addPrefix(String prefix, AnalyzerFactory factory) {
461         AnalyzerFactory oldFactory;
462         if (factory == null) {
463             oldFactory = pre.remove(prefix);
464             langMap.exclude(prefix);
465         } else {
466             oldFactory = pre.put(prefix, factory);
467             langMap.add(prefix, factory.getAnalyzer().getCtagsLang());
468         }
469 
470         if (factoriesDifferent(factory, oldFactory)) {
471             addCustomizationKey("p:" + prefix);
472         }
473     }
474 
475     /**
476      * Instruct the AnalyzerGuru to use a given analyzer for a given file
477      * extension.
478      *
479      * @param extension the file-extension to add
480      * @param factory a factory which creates the analyzer to use for the given
481      * extension (if you pass null as the analyzer, you will disable the
482      * analyzer used for that extension)
483      * @throws IllegalArgumentException if {@code extension} contains a period
484      */
addExtension(String extension, AnalyzerFactory factory)485     public static void addExtension(String extension, AnalyzerFactory factory) {
486         if (extension.contains(".")) {
487             throw new IllegalArgumentException("extension contains a '.'");
488         }
489 
490         // LangMap fileSpec requires a leading period to indicate an extension.
491         String langMapExtension = "." + extension;
492 
493         AnalyzerFactory oldFactory;
494         if (factory == null) {
495             oldFactory = ext.remove(extension);
496             langMap.exclude(langMapExtension);
497         } else {
498             oldFactory = ext.put(extension, factory);
499             langMap.add(langMapExtension, factory.getAnalyzer().getCtagsLang());
500         }
501 
502         if (factoriesDifferent(factory, oldFactory)) {
503             addCustomizationKey("e:" + extension);
504         }
505     }
506 
507     /**
508      * Gets an unmodifiable view of the language mappings resulting from
509      * {@link #addExtension(String, AnalyzerFactory)} and
510      * {@link #addPrefix(String, AnalyzerFactory)} merged with default language
511      * mappings of OpenGrok's analyzers.
512      */
getLangMap()513     public static LangMap getLangMap() {
514         return langMap.mergeSecondary(defaultLangMap).unmodifiable();
515     }
516 
517     /**
518      * Get the default Analyzer.
519      *
520      * @return default FileAnalyzer
521      */
getAnalyzer()522     public static AbstractAnalyzer getAnalyzer() {
523         return DEFAULT_ANALYZER_FACTORY.getAnalyzer();
524     }
525 
526     /**
527      * Gets an analyzer for the specified {@code fileTypeName} if it accords
528      * with a known {@link FileAnalyzer#getFileTypeName()}.
529      * @param fileTypeName a defined name
530      * @return a defined instance if known or otherwise {@code null}
531      */
getAnalyzer(String fileTypeName)532     public static AbstractAnalyzer getAnalyzer(String fileTypeName) {
533         AnalyzerFactory factory = FILETYPE_FACTORIES.get(fileTypeName);
534         return factory == null ? null : factory.getAnalyzer();
535     }
536 
537     /**
538      * Get an analyzer suited to analyze a file. This function will reuse
539      * analyzers since they are costly.
540      *
541      * @param in Input stream containing data to be analyzed
542      * @param file Name of the file to be analyzed
543      * @return An analyzer suited for that file content
544      * @throws java.io.IOException If an error occurs while accessing the data
545      * in the input stream.
546      */
getAnalyzer(InputStream in, String file)547     public static AbstractAnalyzer getAnalyzer(InputStream in, String file) throws IOException {
548         AnalyzerFactory factory = find(in, file);
549         if (factory == null) {
550             AbstractAnalyzer defaultAnalyzer = getAnalyzer();
551             if (LOGGER.isLoggable(Level.FINEST)) {
552                 LOGGER.log(Level.FINEST, "{0}: fallback {1}",
553                     new Object[]{file,
554                     defaultAnalyzer.getClass().getSimpleName() });
555             }
556             return defaultAnalyzer;
557         }
558         return factory.getAnalyzer();
559     }
560 
561     /**
562      * Free resources associated with all registered analyzers.
563      */
returnAnalyzers()564     public static void returnAnalyzers() {
565         for (AnalyzerFactory analyzer : factories) {
566             analyzer.returnAnalyzer();
567         }
568     }
569 
570     /**
571      * Populate a Lucene document with the required fields.
572      *
573      * @param doc The document to populate
574      * @param file The file to index
575      * @param path Where the file is located (from source root)
576      * @param fa The analyzer to use on the file
577      * @param xrefOut Where to write the xref (possibly {@code null})
578      * @throws IOException If an exception occurs while collecting the data
579      * @throws InterruptedException if a timeout occurs
580      */
populateDocument(Document doc, File file, String path, AbstractAnalyzer fa, Writer xrefOut)581     public void populateDocument(Document doc, File file, String path, AbstractAnalyzer fa, Writer xrefOut)
582             throws IOException, InterruptedException {
583 
584         String date = DateTools.timeToString(file.lastModified(),
585                 DateTools.Resolution.MILLISECOND);
586         path = Util.fixPathIfWindows(path);
587         doc.add(new Field(QueryBuilder.U, Util.path2uid(path, date),
588                 string_ft_stored_nanalyzed_norms));
589         doc.add(new Field(QueryBuilder.FULLPATH, file.getAbsolutePath(),
590                 string_ft_nstored_nanalyzed_norms));
591         doc.add(new SortedDocValuesField(QueryBuilder.FULLPATH,
592                 new BytesRef(file.getAbsolutePath())));
593 
594         if (RuntimeEnvironment.getInstance().isHistoryEnabled()) {
595             try {
596                 HistoryGuru histGuru = HistoryGuru.getInstance();
597                 HistoryReader hr = histGuru.getHistoryReader(file);
598                 if (hr != null) {
599                     doc.add(new TextField(QueryBuilder.HIST, hr));
600                     History history;
601                     if ((history = histGuru.getHistory(file)) != null) {
602                         List<HistoryEntry> historyEntries = history.getHistoryEntries(1, 0);
603                         if (!historyEntries.isEmpty()) {
604                             HistoryEntry histEntry = historyEntries.get(0);
605                             doc.add(new TextField(QueryBuilder.LASTREV, histEntry.getRevision(), Store.YES));
606                         }
607                     }
608                 }
609             } catch (HistoryException e) {
610                 LOGGER.log(Level.WARNING, "An error occurred while reading history: ", e);
611             }
612         }
613         doc.add(new Field(QueryBuilder.DATE, date, string_ft_stored_nanalyzed_norms));
614         doc.add(new SortedDocValuesField(QueryBuilder.DATE, new BytesRef(date)));
615 
616         // `path' is not null, as it was passed to Util.path2uid() above.
617         doc.add(new TextField(QueryBuilder.PATH, path, Store.YES));
618         Project project = Project.getProject(path);
619         if (project != null) {
620             doc.add(new TextField(QueryBuilder.PROJECT, project.getPath(), Store.YES));
621         }
622 
623         /*
624          * Use the parent of the path -- not the absolute file as is done for
625          * FULLPATH -- so that DIRPATH is the same convention as for PATH
626          * above. A StringField, however, is used instead of a TextField.
627          */
628         File fpath = new File(path);
629         String fileParent = fpath.getParent();
630         if (fileParent != null && fileParent.length() > 0) {
631             String normalizedPath = QueryBuilder.normalizeDirPath(fileParent);
632             StringField npstring = new StringField(QueryBuilder.DIRPATH,
633                 normalizedPath, Store.NO);
634             doc.add(npstring);
635         }
636 
637         if (fa != null) {
638             AbstractAnalyzer.Genre g = fa.getGenre();
639             if (g == AbstractAnalyzer.Genre.PLAIN || g == AbstractAnalyzer.Genre.XREFABLE || g == AbstractAnalyzer.Genre.HTML) {
640                 doc.add(new Field(QueryBuilder.T, g.typeName(), string_ft_stored_nanalyzed_norms));
641             }
642             fa.analyze(doc, StreamSource.fromFile(file), xrefOut);
643 
644             String type = fa.getFileTypeName();
645             doc.add(new StringField(QueryBuilder.TYPE, type, Store.YES));
646         }
647     }
648 
649     /**
650      * Write a browse-able version of the file.
651      *
652      * @param factory The analyzer factory for this file type
653      * @param in The input stream containing the data
654      * @param out Where to write the result
655      * @param defs definitions for the source file, if available
656      * @param annotation Annotation information for the file
657      * @param project Project the file belongs to
658      * @throws java.io.IOException If an error occurs while creating the output
659      */
writeXref(AnalyzerFactory factory, Reader in, Writer out, Definitions defs, Annotation annotation, Project project)660     public static void writeXref(AnalyzerFactory factory, Reader in,
661             Writer out, Definitions defs,
662             Annotation annotation, Project project)
663             throws IOException {
664         Reader input = in;
665         if (factory.getGenre() == AbstractAnalyzer.Genre.PLAIN) {
666             // This is some kind of text file, so we need to expand tabs to
667             // spaces to match the project's tab settings.
668             input = ExpandTabsReader.wrap(in, project);
669         }
670 
671         WriteXrefArgs args = new WriteXrefArgs(input, out);
672         args.setDefs(defs);
673         args.setAnnotation(annotation);
674         args.setProject(project);
675 
676         AbstractAnalyzer analyzer = factory.getAnalyzer();
677         RuntimeEnvironment env = RuntimeEnvironment.getInstance();
678         analyzer.setScopesEnabled(env.isScopesEnabled());
679         analyzer.setFoldingEnabled(env.isFoldingEnabled());
680         analyzer.writeXref(args);
681     }
682 
683     /**
684      * Writes a browse-able version of the file transformed for immediate
685      * serving to a web client.
686      * @param contextPath the web context path for
687      * {@link Util#dumpXref(java.io.Writer, java.io.Reader, java.lang.String)}
688      * @param factory the analyzer factory for this file type
689      * @param in the input stream containing the data
690      * @param out a defined instance to write
691      * @param defs definitions for the source file, if available
692      * @param annotation annotation information for the file
693      * @param project project the file belongs to
694      * @throws java.io.IOException if an error occurs while creating the output
695      */
writeDumpedXref(String contextPath, AnalyzerFactory factory, Reader in, Writer out, Definitions defs, Annotation annotation, Project project)696     public static void writeDumpedXref(String contextPath,
697             AnalyzerFactory factory, Reader in, Writer out,
698             Definitions defs, Annotation annotation, Project project)
699             throws IOException {
700 
701         File xrefTemp = File.createTempFile("ogxref", ".html");
702         try {
703             try (FileWriter tmpout = new FileWriter(xrefTemp)) {
704                 writeXref(factory, in, tmpout, defs, annotation, project);
705             }
706             Util.dumpXref(out, xrefTemp, false, contextPath);
707         } finally {
708             Files.delete(xrefTemp.toPath());
709         }
710     }
711 
712     /**
713      * Get the genre of a file.
714      *
715      * @param file The file to inspect
716      * @return The genre suitable to decide how to display the file
717      */
getGenre(String file)718     public static AbstractAnalyzer.Genre getGenre(String file) {
719         return getGenre(find(file));
720     }
721 
722     /**
723      * Get the genre of a bulk of data.
724      *
725      * @param in A stream containing the data
726      * @return The genre suitable to decide how to display the file
727      * @throws java.io.IOException If an error occurs while getting the content
728      */
getGenre(InputStream in)729     public static AbstractAnalyzer.Genre getGenre(InputStream in) throws IOException {
730         return getGenre(find(in));
731     }
732 
733     /**
734      * Get the genre for a named class (this is most likely an analyzer).
735      *
736      * @param factory the analyzer factory to get the genre for
737      * @return The genre of this class (null if not found)
738      */
getGenre(AnalyzerFactory factory)739     public static AbstractAnalyzer.Genre getGenre(AnalyzerFactory factory) {
740         if (factory != null) {
741             return factory.getGenre();
742         }
743         return null;
744     }
745 
746     /**
747      * Finds a {@code FileAnalyzerFactory} for the specified
748      * {@link FileAnalyzer#getFileTypeName()}.
749      * @param fileTypeName a defined instance
750      * @return a defined instance or {@code null}
751      */
findByFileTypeName(String fileTypeName)752     public static AnalyzerFactory findByFileTypeName(String fileTypeName) {
753         return FILETYPE_FACTORIES.get(fileTypeName);
754     }
755 
756     /**
757      * Find a {@code FileAnalyzerFactory} with the specified class name. If one
758      * doesn't exist, create one and register it. Allow specification of either
759      * the complete class name (which includes the package name) or the simple
760      * name of the class.
761      *
762      * @param factoryClassName name of the factory class
763      * @return a file analyzer factory
764      *
765      * @throws ClassNotFoundException if there is no class with that name
766      * @throws ClassCastException if the class is not a subclass of {@code
767      * FileAnalyzerFactory}
768      * @throws IllegalAccessException if the constructor cannot be accessed
769      * @throws InstantiationException if the class cannot be instantiated
770      * @throws NoSuchMethodException if no-argument constructor could not be found
771      * @throws InvocationTargetException if the underlying constructor throws an exception
772      */
findFactory(String factoryClassName)773     public static AnalyzerFactory findFactory(String factoryClassName)
774             throws ClassNotFoundException, IllegalAccessException, InstantiationException, NoSuchMethodException,
775             InvocationTargetException {
776         Class<?> fcn;
777         try {
778             fcn = Class.forName(factoryClassName);
779 
780         } catch (ClassNotFoundException e) {
781             fcn = getFactoryClass(factoryClassName);
782 
783             if (fcn == null) {
784                 throw new ClassNotFoundException("Unable to locate class " + factoryClassName);
785             }
786         }
787 
788         return findFactory(fcn);
789     }
790 
791     /**
792      * Get Analyzer factory class using class simple name.
793      *
794      * @param simpleName which may be either the factory class
795      * simple name (eg. CAnalyzerFactory), the analyzer name
796      * (eg. CAnalyzer), or the language name (eg. C)
797      *
798      * @return the analyzer factory class, or null when not found.
799      */
getFactoryClass(String simpleName)800     public static Class<?> getFactoryClass(String simpleName) {
801         Class<?> factoryClass = null;
802 
803         // Build analysis package name list first time only
804         if (analysisPkgNames.isEmpty()) {
805             Package[] p = Package.getPackages();
806             for (Package pp : p) {
807                 String pname = pp.getName();
808                 if (pname.contains(".analysis.")) {
809                     analysisPkgNames.add(pname);
810                 }
811             }
812         }
813 
814         // This allows user to enter the language or analyzer name
815         // (eg. C or CAnalyzer vs. CAnalyzerFactory)
816         // Note that this assumes a regular naming scheme of
817         // all language parsers:
818         //      <language>Analyzer, <language>AnalyzerFactory
819 
820         if (!simpleName.contains("Analyzer")) {
821             simpleName += "Analyzer";
822         }
823 
824         if (!simpleName.contains("Factory")) {
825             simpleName += "Factory";
826         }
827 
828         for (String aPackage : analysisPkgNames) {
829             try {
830                 String fqn = aPackage + "." + simpleName;
831                 factoryClass = Class.forName(fqn);
832                 break;
833             } catch (ClassNotFoundException e) {
834                 // Ignore
835             }
836         }
837 
838         return factoryClass;
839     }
840 
841     /**
842      * Find a {@code FileAnalyzerFactory} which is an instance of the specified
843      * class. If one doesn't exist, create one and register it.
844      *
845      * @param factoryClass the factory class
846      * @return a file analyzer factory
847      *
848      * @throws ClassCastException if the class is not a subclass of {@code
849      * FileAnalyzerFactory}
850      * @throws IllegalAccessException if the constructor cannot be accessed
851      * @throws InstantiationException if the class cannot be instantiated
852      * @throws NoSuchMethodException if no-argument constructor could not be found
853      * @throws InvocationTargetException if the underlying constructor throws an exception
854      */
findFactory(Class<?> factoryClass)855     private static AnalyzerFactory findFactory(Class<?> factoryClass)
856             throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
857         for (AnalyzerFactory f : factories) {
858             if (f.getClass() == factoryClass) {
859                 return f;
860             }
861         }
862         AnalyzerFactory f = (AnalyzerFactory) factoryClass.getDeclaredConstructor().newInstance();
863         registerAnalyzer(f);
864         return f;
865     }
866 
867     /**
868      * Finds a suitable analyser class for file name. If the analyzer cannot be
869      * determined by the file extension, try to look at the data in the
870      * InputStream to find a suitable analyzer.
871      *
872      * Use if you just want to find file type.
873      *
874      * @param in The input stream containing the data
875      * @param file The file name to get the analyzer for
876      * @return the analyzer factory to use
877      * @throws java.io.IOException If a problem occurred while reading the data
878      */
find(InputStream in, String file)879     public static AnalyzerFactory find(InputStream in, String file) throws IOException {
880         AnalyzerFactory factory = find(file);
881 
882         if (factory != null) {
883             return factory;
884         }
885         return findForStream(in, file);
886     }
887 
888     /**
889      * Finds a suitable analyser class for file name.
890      *
891      * @param file The file name to get the analyzer for
892      * @return the analyzer factory to use
893      */
find(String file)894     public static AnalyzerFactory find(String file) {
895         String path = file;
896         int i;
897 
898         // Get basename of the file first.
899         if (((i = path.lastIndexOf(File.separatorChar)) > 0)
900                 && (i + 1 < path.length())) {
901             path = path.substring(i + 1);
902         }
903 
904         int dotpos = path.lastIndexOf('.');
905         if (dotpos >= 0) {
906             AnalyzerFactory factory;
907 
908             // Try matching the prefix.
909             if (dotpos > 0) {
910                 factory = pre.get(path.substring(0, dotpos).toUpperCase(Locale.ROOT));
911                 if (factory != null) {
912                     if (LOGGER.isLoggable(Level.FINEST)) {
913                         LOGGER.log(Level.FINEST, "{0}: chosen by prefix: {1}",
914                             new Object[]{file,
915                             factory.getClass().getSimpleName() });
916                     }
917                     return factory;
918                 }
919             }
920 
921             // Now try matching the suffix. We kind of consider this order (first
922             // prefix then suffix) to be workable although for sure there can be
923             // cases when this does not work.
924             factory = ext.get(path.substring(dotpos + 1).toUpperCase(Locale.ROOT));
925             if (factory != null) {
926                 if (LOGGER.isLoggable(Level.FINEST)) {
927                     LOGGER.log(Level.FINEST, "{0}: chosen by suffix: {1}",
928                         new Object[]{file,
929                         factory.getClass().getSimpleName() });
930                 }
931                 return factory;
932             }
933         }
934 
935         // file doesn't have any of the prefix or extensions we know, try full match
936         return FILE_NAMES.get(path.toUpperCase(Locale.ROOT));
937     }
938 
939     /**
940      * Finds a suitable analyzer class for the data in this stream.
941      *
942      * @param in The stream containing the data to analyze
943      * @return the analyzer factory to use
944      * @throws java.io.IOException if an error occurs while reading data from
945      * the stream
946      */
find(InputStream in)947     public static AnalyzerFactory find(InputStream in) throws IOException {
948         return findForStream(in, "<anonymous>");
949     }
950 
951     /**
952      * Finds a suitable analyzer class for the data in this stream
953      * corresponding to a file of the specified name.
954      *
955      * @param in The stream containing the data to analyze
956      * @param file The file name to get the analyzer for
957      * @return the analyzer factory to use
958      * @throws java.io.IOException if an error occurs while reading data from
959      * the stream
960      */
findForStream(InputStream in, String file)961     private static AnalyzerFactory findForStream(InputStream in,
962         String file) throws IOException {
963 
964         in.mark(MAGIC_BYTES_NUM);
965         byte[] content = new byte[MAGIC_BYTES_NUM];
966         int len = in.read(content);
967         in.reset();
968 
969         if (len < MAGIC_BYTES_NUM) {
970             /*
971              * Need at least 4 bytes to perform magic string matching.
972              */
973             if (len < 4) {
974                 return null;
975             }
976             content = Arrays.copyOf(content, len);
977         }
978 
979         AnalyzerFactory fac;
980 
981         // First, do precise-magic Matcher matching
982         for (FileAnalyzerFactory.Matcher matcher : matchers) {
983             if (matcher.isPreciseMagic()) {
984                 fac = matcher.isMagic(content, in);
985                 if (fac != null) {
986                     if (LOGGER.isLoggable(Level.FINEST)) {
987                         LOGGER.log(Level.FINEST,
988                             "{0}: chosen by precise magic: {1}", new Object[]{
989                             file, fac.getClass().getSimpleName() });
990                     }
991                     return fac;
992                 }
993             }
994         }
995 
996         // Next, look for magic strings
997         String opening = readOpening(in, content);
998         fac = findMagicString(opening, file);
999         if (fac != null) {
1000             return fac;
1001         }
1002 
1003         // Last, do imprecise-magic Matcher matching
1004         for (FileAnalyzerFactory.Matcher matcher : matchers) {
1005             if (!matcher.isPreciseMagic()) {
1006                 fac = matcher.isMagic(content, in);
1007                 if (fac != null) {
1008                     if (LOGGER.isLoggable(Level.FINEST)) {
1009                         LOGGER.log(Level.FINEST,
1010                             "{0}: chosen by imprecise magic: {1}",
1011                             new Object[]{file,
1012                             fac.getClass().getSimpleName() });
1013                     }
1014                     return fac;
1015                 }
1016             }
1017         }
1018 
1019         return null;
1020     }
1021 
findMagicString(String opening, String file)1022     private static AnalyzerFactory findMagicString(String opening, String file) {
1023 
1024         // first, try to look up two words in magics
1025         String fragment = getWords(opening, 2);
1026         AnalyzerFactory fac = magics.get(fragment);
1027         if (fac != null) {
1028             if (LOGGER.isLoggable(Level.FINEST)) {
1029                 LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}",
1030                     new Object[]{file, fac.getClass().getSimpleName(),
1031                     fragment});
1032             }
1033             return fac;
1034         }
1035 
1036         // second, try to look up one word in magics
1037         fragment = getWords(opening, 1);
1038         fac = magics.get(fragment);
1039         if (fac != null) {
1040             if (LOGGER.isLoggable(Level.FINEST)) {
1041                 LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}",
1042                     new Object[]{file, fac.getClass().getSimpleName(),
1043                     fragment});
1044             }
1045             return fac;
1046         }
1047 
1048         // try to match initial substrings (DESC strlen)
1049         for (Map.Entry<String, AnalyzerFactory> entry :
1050             magics.entrySet()) {
1051             String magic = entry.getKey();
1052             if (opening.startsWith(magic)) {
1053                 fac = entry.getValue();
1054                 if (LOGGER.isLoggable(Level.FINEST)) {
1055                     LOGGER.log(Level.FINEST,
1056                         "{0}: chosen by magic(substr) {2}: {1}", new Object[]{
1057                         file, fac.getClass().getSimpleName(), magic});
1058                 }
1059                 return fac;
1060             }
1061         }
1062 
1063         return null;
1064     }
1065 
1066     /**
1067      * Extract initial words from a String, or take the entire
1068      * <code>value</code> if not enough words can be identified. (If
1069      * <code>n</code> is not 1 or more, returns an empty String.) (A "word"
1070      * ends at each and every space character.)
1071      *
1072      * @param value The source from which words are cut
1073      * @param n The number of words to try to extract
1074      * @return The extracted words or <code>""</code>
1075      */
getWords(String value, int n)1076     private static String getWords(String value, int n) {
1077         if (n < 1) {
1078             return "";
1079         }
1080         int l = 0;
1081         while (n-- > 0) {
1082             int o = l > 0 ? l + 1 : l;
1083             int i = value.indexOf(' ', o);
1084             if (i == -1) {
1085                 return value;
1086             }
1087             l = i;
1088         }
1089         return value.substring(0, l);
1090     }
1091 
1092     /**
1093      * Extract an opening string from the input stream, past any BOM, and past
1094      * any initial whitespace, but only up to <code>OPENING_MAX_CHARS</code> or
1095      * to the first <code>\n</code> after any non-whitespace. (Hashbang, #!,
1096      * openings will have superfluous space removed.)
1097      *
1098      * @param in The input stream containing the data
1099      * @param sig The initial sequence of bytes in the input stream
1100      * @return The extracted string or <code>""</code>
1101      * @throws java.io.IOException in case of any read error
1102      */
readOpening(InputStream in, byte[] sig)1103     private static String readOpening(InputStream in, byte[] sig)
1104         throws IOException {
1105 
1106         in.mark(MARK_READ_LIMIT);
1107 
1108         String encoding = IOUtils.findBOMEncoding(sig);
1109         if (encoding == null) {
1110             // SRCROOT is read with UTF-8 as a default.
1111             encoding = StandardCharsets.UTF_8.name();
1112         } else {
1113             int skipForBOM = IOUtils.skipForBOM(sig);
1114             if (in.skip(skipForBOM) < skipForBOM) {
1115                 in.reset();
1116                 return "";
1117             }
1118         }
1119 
1120         int nRead = 0;
1121         boolean sawNonWhitespace = false;
1122         boolean lastWhitespace = false;
1123         boolean postHashbang = false;
1124         int r;
1125 
1126         StringBuilder opening = new StringBuilder();
1127         BufferedReader readr = new BufferedReader(new InputStreamReader(in, encoding), OPENING_MAX_CHARS);
1128         while ((r = readr.read()) != -1) {
1129             if (++nRead > OPENING_MAX_CHARS) {
1130                 break;
1131             }
1132             char c = (char) r;
1133             boolean isWhitespace = Character.isWhitespace(c);
1134             if (!sawNonWhitespace) {
1135                 if (isWhitespace) {
1136                     continue;
1137                 }
1138                 sawNonWhitespace = true;
1139             }
1140             if (c == '\n') {
1141                 break;
1142             }
1143 
1144             if (isWhitespace) {
1145                 // Track `lastWhitespace' to condense stretches of whitespace,
1146                 // and use ' ' regardless of actual whitespace character to
1147                 // accord with magic string definitions.
1148                 if (!lastWhitespace && !postHashbang) {
1149                     opening.append(' ');
1150                 }
1151             } else {
1152                 opening.append(c);
1153                 postHashbang = false;
1154             }
1155             lastWhitespace = isWhitespace;
1156 
1157             // If the opening starts with "#!", then track so that any
1158             // trailing whitespace after the hashbang is ignored.
1159             if (opening.length() == 2 && opening.charAt(0) == '#' && opening.charAt(1) == '!') {
1160                 postHashbang = true;
1161             }
1162         }
1163 
1164         in.reset();
1165         return opening.toString();
1166     }
1167 
addCustomizationKey(String k)1168     private static void addCustomizationKey(String k) {
1169         CUSTOMIZATION_KEYS.add(k);
1170         Object[] keys = CUSTOMIZATION_KEYS.toArray();
1171         customizationHashCode = Objects.hash(keys);
1172     }
1173 
factoriesDifferent(AnalyzerFactory a, AnalyzerFactory b)1174     private static boolean factoriesDifferent(AnalyzerFactory a, AnalyzerFactory b) {
1175         String aName = null;
1176         if (a != null) {
1177             aName = a.getName();
1178             if (aName == null) {
1179                 aName = a.getClass().getSimpleName();
1180             }
1181         }
1182         String bName = null;
1183         if (b != null) {
1184             bName = b.getName();
1185             if (bName == null) {
1186                 bName = b.getClass().getSimpleName();
1187             }
1188         }
1189         if (aName == null && bName == null) {
1190             return false;
1191         }
1192         return aName == null || !aName.equals(bName);
1193     }
1194 }
1195