xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java (revision 750b3115a5b8976536ee4dccce497eb97b7a4c9b)
1b5840353SAdam Hornáček /*
2b5840353SAdam Hornáček  * CDDL HEADER START
3b5840353SAdam Hornáček  *
4b5840353SAdam Hornáček  * The contents of this file are subject to the terms of the
5b5840353SAdam Hornáček  * Common Development and Distribution License (the "License").
6b5840353SAdam Hornáček  * You may not use this file except in compliance with the License.
7b5840353SAdam Hornáček  *
8b5840353SAdam Hornáček  * See LICENSE.txt included in this distribution for the specific
9b5840353SAdam Hornáček  * language governing permissions and limitations under the License.
10b5840353SAdam Hornáček  *
11b5840353SAdam Hornáček  * When distributing Covered Code, include this CDDL HEADER in each
12b5840353SAdam Hornáček  * file and include the License file at LICENSE.txt.
13b5840353SAdam Hornáček  * If applicable, add the following below this CDDL HEADER, with the
14b5840353SAdam Hornáček  * fields enclosed by brackets "[]" replaced with your own identifying
15b5840353SAdam Hornáček  * information: Portions Copyright [yyyy] [name of copyright owner]
16b5840353SAdam Hornáček  *
17b5840353SAdam Hornáček  * CDDL HEADER END
18b5840353SAdam Hornáček  */
19b5840353SAdam Hornáček 
20b5840353SAdam Hornáček /*
2136b6a6f8SVladimir Kotal  * Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.
22*750b3115SChris Fraire  * Portions Copyright (c) 2017, 2021, Chris Fraire <cfraire@me.com>.
23b5840353SAdam Hornáček  */
249805b761SAdam Hornáček package org.opengrok.indexer.analysis;
25b5840353SAdam Hornáček 
26b5840353SAdam Hornáček import java.io.BufferedReader;
27b5840353SAdam Hornáček import java.io.File;
28b5840353SAdam Hornáček import java.io.FileWriter;
29b5840353SAdam Hornáček import java.io.IOException;
30b5840353SAdam Hornáček import java.io.InputStream;
31b5840353SAdam Hornáček import java.io.InputStreamReader;
32b5840353SAdam Hornáček import java.io.Reader;
33b5840353SAdam Hornáček import java.io.Writer;
34b5840353SAdam Hornáček import java.lang.reflect.InvocationTargetException;
35b5840353SAdam Hornáček import java.nio.charset.StandardCharsets;
3653e96f58SVladimir Kotal import java.nio.file.Files;
37b5840353SAdam Hornáček import java.util.ArrayList;
38b5840353SAdam Hornáček import java.util.Arrays;
39b5840353SAdam Hornáček import java.util.Collections;
40b5840353SAdam Hornáček import java.util.Comparator;
41b5840353SAdam Hornáček import java.util.HashMap;
42b5840353SAdam Hornáček import java.util.List;
43b5840353SAdam Hornáček import java.util.Locale;
44b5840353SAdam Hornáček import java.util.Map;
45b5840353SAdam Hornáček import java.util.Objects;
46b5840353SAdam Hornáček import java.util.SortedMap;
47b5840353SAdam Hornáček import java.util.TreeMap;
48b5840353SAdam Hornáček import java.util.TreeSet;
49b5840353SAdam Hornáček import java.util.logging.Level;
50b5840353SAdam Hornáček import java.util.logging.Logger;
51b5840353SAdam Hornáček import org.apache.lucene.document.DateTools;
52b5840353SAdam Hornáček import org.apache.lucene.document.Document;
53b5840353SAdam Hornáček import org.apache.lucene.document.Field;
54b5840353SAdam Hornáček import org.apache.lucene.document.Field.Store;
55b5840353SAdam Hornáček import org.apache.lucene.document.FieldType;
56b5840353SAdam Hornáček import org.apache.lucene.document.SortedDocValuesField;
57b5840353SAdam Hornáček import org.apache.lucene.document.StringField;
58b5840353SAdam Hornáček import org.apache.lucene.document.TextField;
59b5840353SAdam Hornáček import org.apache.lucene.util.BytesRef;
609805b761SAdam Hornáček import org.opengrok.indexer.analysis.FileAnalyzerFactory.Matcher;
619805b761SAdam Hornáček import org.opengrok.indexer.analysis.ada.AdaAnalyzerFactory;
629805b761SAdam Hornáček import org.opengrok.indexer.analysis.archive.BZip2AnalyzerFactory;
639805b761SAdam Hornáček import org.opengrok.indexer.analysis.archive.GZIPAnalyzerFactory;
649805b761SAdam Hornáček import org.opengrok.indexer.analysis.archive.TarAnalyzerFactory;
659805b761SAdam Hornáček import org.opengrok.indexer.analysis.archive.ZipAnalyzerFactory;
6650065c95SChris Fraire import org.opengrok.indexer.analysis.asm.AsmAnalyzerFactory;
679805b761SAdam Hornáček import org.opengrok.indexer.analysis.c.CAnalyzerFactory;
689805b761SAdam Hornáček import org.opengrok.indexer.analysis.c.CxxAnalyzerFactory;
6957eefa47SKryštof Tulinger import org.opengrok.indexer.analysis.clojure.ClojureAnalyzerFactory;
709805b761SAdam Hornáček import org.opengrok.indexer.analysis.csharp.CSharpAnalyzerFactory;
719805b761SAdam Hornáček import org.opengrok.indexer.analysis.data.IgnorantAnalyzerFactory;
729805b761SAdam Hornáček import org.opengrok.indexer.analysis.data.ImageAnalyzerFactory;
739805b761SAdam Hornáček import org.opengrok.indexer.analysis.document.MandocAnalyzerFactory;
749805b761SAdam Hornáček import org.opengrok.indexer.analysis.document.TroffAnalyzerFactory;
759805b761SAdam Hornáček import org.opengrok.indexer.analysis.eiffel.EiffelAnalyzerFactory;
769805b761SAdam Hornáček import org.opengrok.indexer.analysis.erlang.ErlangAnalyzerFactory;
779805b761SAdam Hornáček import org.opengrok.indexer.analysis.executables.ELFAnalyzerFactory;
789805b761SAdam Hornáček import org.opengrok.indexer.analysis.executables.JarAnalyzerFactory;
799805b761SAdam Hornáček import org.opengrok.indexer.analysis.executables.JavaClassAnalyzerFactory;
809805b761SAdam Hornáček import org.opengrok.indexer.analysis.fortran.FortranAnalyzerFactory;
819805b761SAdam Hornáček import org.opengrok.indexer.analysis.golang.GolangAnalyzerFactory;
829805b761SAdam Hornáček import org.opengrok.indexer.analysis.haskell.HaskellAnalyzerFactory;
83c5ef7ff6SChris Fraire import org.opengrok.indexer.analysis.hcl.HCLAnalyzerFactory;
849805b761SAdam Hornáček import org.opengrok.indexer.analysis.java.JavaAnalyzerFactory;
859805b761SAdam Hornáček import org.opengrok.indexer.analysis.javascript.JavaScriptAnalyzerFactory;
8657eefa47SKryštof Tulinger import org.opengrok.indexer.analysis.json.JsonAnalyzerFactory;
8757eefa47SKryštof Tulinger import org.opengrok.indexer.analysis.kotlin.KotlinAnalyzerFactory;
889805b761SAdam Hornáček import org.opengrok.indexer.analysis.lisp.LispAnalyzerFactory;
8957eefa47SKryštof Tulinger import org.opengrok.indexer.analysis.lua.LuaAnalyzerFactory;
909805b761SAdam Hornáček import org.opengrok.indexer.analysis.pascal.PascalAnalyzerFactory;
919805b761SAdam Hornáček import org.opengrok.indexer.analysis.perl.PerlAnalyzerFactory;
929805b761SAdam Hornáček import org.opengrok.indexer.analysis.php.PhpAnalyzerFactory;
939805b761SAdam Hornáček import org.opengrok.indexer.analysis.plain.PlainAnalyzerFactory;
949805b761SAdam Hornáček import org.opengrok.indexer.analysis.plain.XMLAnalyzerFactory;
9557eefa47SKryštof Tulinger import org.opengrok.indexer.analysis.powershell.PowershellAnalyzerFactory;
969805b761SAdam Hornáček import org.opengrok.indexer.analysis.python.PythonAnalyzerFactory;
97ca0eafb1SChris Fraire import org.opengrok.indexer.analysis.r.RAnalyzerFactory;
9857eefa47SKryštof Tulinger import org.opengrok.indexer.analysis.ruby.RubyAnalyzerFactory;
999805b761SAdam Hornáček import org.opengrok.indexer.analysis.rust.RustAnalyzerFactory;
1009805b761SAdam Hornáček import org.opengrok.indexer.analysis.scala.ScalaAnalyzerFactory;
1019805b761SAdam Hornáček import org.opengrok.indexer.analysis.sh.ShAnalyzerFactory;
1029805b761SAdam Hornáček import org.opengrok.indexer.analysis.sql.PLSQLAnalyzerFactory;
1039805b761SAdam Hornáček import org.opengrok.indexer.analysis.sql.SQLAnalyzerFactory;
1049805b761SAdam Hornáček import org.opengrok.indexer.analysis.swift.SwiftAnalyzerFactory;
1059805b761SAdam Hornáček import org.opengrok.indexer.analysis.tcl.TclAnalyzerFactory;
106c5ef7ff6SChris Fraire import org.opengrok.indexer.analysis.terraform.TerraformAnalyzerFactory;
1070ed261b2SChris Fraire import org.opengrok.indexer.analysis.typescript.TypeScriptAnalyzerFactory;
1089805b761SAdam Hornáček import org.opengrok.indexer.analysis.uue.UuencodeAnalyzerFactory;
1099805b761SAdam Hornáček import org.opengrok.indexer.analysis.vb.VBAnalyzerFactory;
1104f9cbaecSChris Fraire import org.opengrok.indexer.analysis.verilog.VerilogAnalyzerFactory;
1119805b761SAdam Hornáček import org.opengrok.indexer.configuration.Project;
1129805b761SAdam Hornáček import org.opengrok.indexer.configuration.RuntimeEnvironment;
1139805b761SAdam Hornáček import org.opengrok.indexer.history.Annotation;
11436b6a6f8SVladimir Kotal import org.opengrok.indexer.history.History;
11536b6a6f8SVladimir Kotal import org.opengrok.indexer.history.HistoryEntry;
1169805b761SAdam Hornáček import org.opengrok.indexer.history.HistoryException;
1179805b761SAdam Hornáček import org.opengrok.indexer.history.HistoryGuru;
1189805b761SAdam Hornáček import org.opengrok.indexer.history.HistoryReader;
1199805b761SAdam Hornáček import org.opengrok.indexer.logger.LoggerFactory;
1209805b761SAdam Hornáček import org.opengrok.indexer.search.QueryBuilder;
1219805b761SAdam Hornáček import org.opengrok.indexer.util.IOUtils;
1229805b761SAdam Hornáček import org.opengrok.indexer.web.Util;
123b5840353SAdam Hornáček 
124b5840353SAdam Hornáček /**
125b5840353SAdam Hornáček  * Manages and provides Analyzers as needed. Please see
126a840d4e0SAdam Hornacek  * <a href="https://github.com/oracle/opengrok/wiki/Internals">
127b5840353SAdam Hornáček  * this</a> page for a great description of the purpose of the AnalyzerGuru.
128b5840353SAdam Hornáček  *
129b5840353SAdam Hornáček  * Created on September 22, 2005
130b5840353SAdam Hornáček  *
131b5840353SAdam Hornáček  * @author Chandan
132b5840353SAdam Hornáček  */
133b5840353SAdam Hornáček public class AnalyzerGuru {
134b5840353SAdam Hornáček 
135b5840353SAdam Hornáček     /**
136b5840353SAdam Hornáček      * The maximum number of characters (multi-byte if a BOM is identified) to
137ff44f24aSAdam Hornáček      * read from the input stream to be used for magic string matching.
138b5840353SAdam Hornáček      */
139b5840353SAdam Hornáček     private static final int OPENING_MAX_CHARS = 100;
140b5840353SAdam Hornáček 
141b5840353SAdam Hornáček     /**
142b5840353SAdam Hornáček      * Set to 16K -- though debugging shows it would do with only 8K+3
143ff44f24aSAdam Hornáček      * (standard buffer for Java BufferedInputStream plus 3 bytes for largest UTF BOM).
144b5840353SAdam Hornáček      */
145b5840353SAdam Hornáček     private static final int MARK_READ_LIMIT = 1024 * 16;
146b5840353SAdam Hornáček 
147b5840353SAdam Hornáček     /**
148b5840353SAdam Hornáček      * The number of bytes read from the start of the file for magic number or
149b5840353SAdam Hornáček      * string analysis. Some {@link FileAnalyzerFactory.Matcher}
150b5840353SAdam Hornáček      * implementations may read more data subsequently, but this field defines
151b5840353SAdam Hornáček      * the number of bytes initially read for general matching.
152b5840353SAdam Hornáček      */
153b5840353SAdam Hornáček     private static final int MAGIC_BYTES_NUM = 8;
154b5840353SAdam Hornáček 
155b5840353SAdam Hornáček     private static final Logger LOGGER = LoggerFactory.getLogger(AnalyzerGuru.class);
156b5840353SAdam Hornáček 
157b5840353SAdam Hornáček     /**
158b5840353SAdam Hornáček      * The default {@code FileAnalyzerFactory} instance.
159b5840353SAdam Hornáček      */
16057eefa47SKryštof Tulinger     private static final AnalyzerFactory DEFAULT_ANALYZER_FACTORY = new FileAnalyzerFactory();
161b5840353SAdam Hornáček 
162b5840353SAdam Hornáček     /**
163b5840353SAdam Hornáček      * Map from file names to analyzer factories.
164b5840353SAdam Hornáček      */
16557eefa47SKryštof Tulinger     private static final Map<String, AnalyzerFactory> FILE_NAMES = new HashMap<>();
166b5840353SAdam Hornáček 
167b5840353SAdam Hornáček     /**
168b5840353SAdam Hornáček      * Map from file extensions to analyzer factories.
169b5840353SAdam Hornáček      */
17057eefa47SKryštof Tulinger     private static final Map<String, AnalyzerFactory> ext = new HashMap<>();
171b5840353SAdam Hornáček 
172b5840353SAdam Hornáček     /**
173b5840353SAdam Hornáček      * Map from file prefixes to analyzer factories.
174b5840353SAdam Hornáček      */
17557eefa47SKryštof Tulinger     private static final Map<String, AnalyzerFactory> pre = new HashMap<>();
176b5840353SAdam Hornáček 
177b5840353SAdam Hornáček     /**
178b5840353SAdam Hornáček      * Appended when
17957eefa47SKryštof Tulinger      * {@link #addExtension(java.lang.String, AnalyzerFactory)}
180b5840353SAdam Hornáček      * or
18157eefa47SKryštof Tulinger      * {@link #addPrefix(java.lang.String, AnalyzerFactory)}
182b5840353SAdam Hornáček      * are called to augment the value in {@link #getVersionNo()}.
183b5840353SAdam Hornáček      */
184b5840353SAdam Hornáček     private static final TreeSet<String> CUSTOMIZATION_KEYS = new TreeSet<>();
185b5840353SAdam Hornáček 
186b5840353SAdam Hornáček     private static int customizationHashCode;
187b5840353SAdam Hornáček 
188b5840353SAdam Hornáček     /**
189ff44f24aSAdam Hornáček      * Descending string length comparator for magics.
190b5840353SAdam Hornáček      */
191c6f0939bSAdam Hornacek     private static final Comparator<String> descStrlenComparator = (s1, s2) -> {
192b5840353SAdam Hornáček         // DESC: s2 length <=> s1 length
193b5840353SAdam Hornáček         int cmp = Integer.compare(s2.length(), s1.length());
194a72324b1SAdam Hornáček         if (cmp != 0) {
195a72324b1SAdam Hornáček             return cmp;
196a72324b1SAdam Hornáček         }
197b5840353SAdam Hornáček 
198b5840353SAdam Hornáček         // the Comparator must also be "consistent with equals", so check
199b5840353SAdam Hornáček         // string contents too when (length)cmp == 0. (ASC: s1 <=> s2.)
200b5840353SAdam Hornáček         cmp = s1.compareTo(s2);
201b5840353SAdam Hornáček         return cmp;
202b5840353SAdam Hornáček     };
203b5840353SAdam Hornáček 
204b5840353SAdam Hornáček     /**
205b5840353SAdam Hornáček      * Map from magic strings to analyzer factories.
206b5840353SAdam Hornáček      */
20757eefa47SKryštof Tulinger     private static final SortedMap<String, AnalyzerFactory> magics =
208b5840353SAdam Hornáček         new TreeMap<>(descStrlenComparator);
209b5840353SAdam Hornáček 
210b5840353SAdam Hornáček     /**
211b5840353SAdam Hornáček      * List of matcher objects which can be used to determine which analyzer
212b5840353SAdam Hornáček      * factory to use.
213b5840353SAdam Hornáček      */
214b5840353SAdam Hornáček     private static final List<FileAnalyzerFactory.Matcher> matchers = new ArrayList<>();
215b5840353SAdam Hornáček 
216b5840353SAdam Hornáček     /**
217b5840353SAdam Hornáček      * List of all registered {@code FileAnalyzerFactory} instances.
218b5840353SAdam Hornáček      */
21957eefa47SKryštof Tulinger     private static final List<AnalyzerFactory> factories = new ArrayList<>();
220b5840353SAdam Hornáček 
221b5840353SAdam Hornáček     /**
222b5840353SAdam Hornáček      * Names of all analysis packages.
223b5840353SAdam Hornáček      */
224b5840353SAdam Hornáček     private static final List<String> analysisPkgNames = new ArrayList<>();
225b5840353SAdam Hornáček 
226b5840353SAdam Hornáček     public static final FieldType string_ft_stored_nanalyzed_norms = new FieldType(StringField.TYPE_STORED);
227b5840353SAdam Hornáček     public static final FieldType string_ft_nstored_nanalyzed_norms = new FieldType(StringField.TYPE_NOT_STORED);
228b5840353SAdam Hornáček 
229b5840353SAdam Hornáček     private static final Map<String, String> fileTypeDescriptions = new TreeMap<>();
230b5840353SAdam Hornáček 
231b5840353SAdam Hornáček     /**
232b5840353SAdam Hornáček      * Maps from {@link FileAnalyzer#getFileTypeName()} to
233ff44f24aSAdam Hornáček      * {@link FileAnalyzerFactory}.
234b5840353SAdam Hornáček      */
23557eefa47SKryštof Tulinger     private static final Map<String, AnalyzerFactory> FILETYPE_FACTORIES =
236b5840353SAdam Hornáček             new HashMap<>();
237b5840353SAdam Hornáček 
238b5840353SAdam Hornáček     /**
239b5840353SAdam Hornáček      * Maps from {@link FileAnalyzer#getFileTypeName()} to
240ff44f24aSAdam Hornáček      * {@link FileAnalyzer#getVersionNo()}.
241b5840353SAdam Hornáček      */
242b5840353SAdam Hornáček     private static final Map<String, Long> ANALYZER_VERSIONS = new HashMap<>();
243b5840353SAdam Hornáček 
2446de4f5aaSChris Fraire     private static final LangTreeMap langMap = new LangTreeMap();
245adc0ce11SChris Fraire     private static final LangTreeMap defaultLangMap = new LangTreeMap();
2466de4f5aaSChris Fraire 
247b5840353SAdam Hornáček     /*
248b5840353SAdam Hornáček      * If you write your own analyzer please register it here. The order is
249b5840353SAdam Hornáček      * important for any factory that uses a FileAnalyzerFactory.Matcher
250b5840353SAdam Hornáček      * implementation, as those are run in the same order as defined below --
251b5840353SAdam Hornáček      * though precise Matchers are run before imprecise ones.
252b5840353SAdam Hornáček      */
253b5840353SAdam Hornáček     static {
254b5840353SAdam Hornáček         try {
25557eefa47SKryštof Tulinger             AnalyzerFactory[] analyzers = {
256b5840353SAdam Hornáček                 DEFAULT_ANALYZER_FACTORY,
257b5840353SAdam Hornáček                 new IgnorantAnalyzerFactory(),
258b5840353SAdam Hornáček                 new BZip2AnalyzerFactory(),
259b5840353SAdam Hornáček                 new XMLAnalyzerFactory(),
260b5840353SAdam Hornáček                 MandocAnalyzerFactory.DEFAULT_INSTANCE,
261b5840353SAdam Hornáček                 TroffAnalyzerFactory.DEFAULT_INSTANCE,
262b5840353SAdam Hornáček                 new ELFAnalyzerFactory(),
263b5840353SAdam Hornáček                 JavaClassAnalyzerFactory.DEFAULT_INSTANCE,
264b5840353SAdam Hornáček                 new ImageAnalyzerFactory(),
265b5840353SAdam Hornáček                 JarAnalyzerFactory.DEFAULT_INSTANCE,
266b5840353SAdam Hornáček                 ZipAnalyzerFactory.DEFAULT_INSTANCE,
267b5840353SAdam Hornáček                 new TarAnalyzerFactory(),
268b5840353SAdam Hornáček                 new CAnalyzerFactory(),
269b5840353SAdam Hornáček                 new CSharpAnalyzerFactory(),
270b5840353SAdam Hornáček                 new VBAnalyzerFactory(),
271b5840353SAdam Hornáček                 new CxxAnalyzerFactory(),
272b5840353SAdam Hornáček                 new ErlangAnalyzerFactory(),
273b5840353SAdam Hornáček                 new ShAnalyzerFactory(),
274b5840353SAdam Hornáček                 new PowershellAnalyzerFactory(),
275b5840353SAdam Hornáček                 new UuencodeAnalyzerFactory(),
276b5840353SAdam Hornáček                 new GZIPAnalyzerFactory(),
277b5840353SAdam Hornáček                 new JavaAnalyzerFactory(),
278b5840353SAdam Hornáček                 new JavaScriptAnalyzerFactory(),
279b5840353SAdam Hornáček                 new KotlinAnalyzerFactory(),
280b5840353SAdam Hornáček                 new SwiftAnalyzerFactory(),
281b5840353SAdam Hornáček                 new JsonAnalyzerFactory(),
282b5840353SAdam Hornáček                 new PythonAnalyzerFactory(),
283b5840353SAdam Hornáček                 new RustAnalyzerFactory(),
284b5840353SAdam Hornáček                 new PerlAnalyzerFactory(),
285b5840353SAdam Hornáček                 new PhpAnalyzerFactory(),
286b5840353SAdam Hornáček                 new LispAnalyzerFactory(),
287b5840353SAdam Hornáček                 new TclAnalyzerFactory(),
288b5840353SAdam Hornáček                 new ScalaAnalyzerFactory(),
289b5840353SAdam Hornáček                 new ClojureAnalyzerFactory(),
290b5840353SAdam Hornáček                 new SQLAnalyzerFactory(),
291b5840353SAdam Hornáček                 new PLSQLAnalyzerFactory(),
292b5840353SAdam Hornáček                 new FortranAnalyzerFactory(),
293b5840353SAdam Hornáček                 new HaskellAnalyzerFactory(),
294b5840353SAdam Hornáček                 new GolangAnalyzerFactory(),
295b5840353SAdam Hornáček                 new LuaAnalyzerFactory(),
296b5840353SAdam Hornáček                 new PascalAnalyzerFactory(),
297b5840353SAdam Hornáček                 new AdaAnalyzerFactory(),
298b5840353SAdam Hornáček                 new RubyAnalyzerFactory(),
2994f9cbaecSChris Fraire                 new EiffelAnalyzerFactory(),
3000ed261b2SChris Fraire                 new VerilogAnalyzerFactory(),
30150065c95SChris Fraire                 new TypeScriptAnalyzerFactory(),
302c5ef7ff6SChris Fraire                 new AsmAnalyzerFactory(),
303c5ef7ff6SChris Fraire                 new HCLAnalyzerFactory(),
304ca0eafb1SChris Fraire                 new TerraformAnalyzerFactory(),
305*750b3115SChris Fraire                 new RAnalyzerFactory(),
306*750b3115SChris Fraire                 // Keep PlainAnalyzer last, with its lone, quite fuzzy matcher.
307*750b3115SChris Fraire                 PlainAnalyzerFactory.DEFAULT_INSTANCE
308b5840353SAdam Hornáček             };
309b5840353SAdam Hornáček 
31057eefa47SKryštof Tulinger             for (AnalyzerFactory analyzer : analyzers) {
311b5840353SAdam Hornáček                 registerAnalyzer(analyzer);
312b5840353SAdam Hornáček             }
313b5840353SAdam Hornáček 
31457eefa47SKryštof Tulinger             for (AnalyzerFactory analyzer : analyzers) {
315b5840353SAdam Hornáček                 if (analyzer.getName() != null && !analyzer.getName().isEmpty()) {
analyzer.getName()316b5840353SAdam Hornáček                     fileTypeDescriptions.put(analyzer.getAnalyzer().getFileTypeName(), analyzer.getName());
317b5840353SAdam Hornáček                 }
318b5840353SAdam Hornáček             }
319b5840353SAdam Hornáček 
320b5840353SAdam Hornáček             string_ft_stored_nanalyzed_norms.setOmitNorms(false);
321b5840353SAdam Hornáček             string_ft_nstored_nanalyzed_norms.setOmitNorms(false);
322b5840353SAdam Hornáček         } catch (Throwable t) {
323b5840353SAdam Hornáček             LOGGER.log(Level.SEVERE,
324b5840353SAdam Hornáček                     "exception hit when constructing AnalyzerGuru static", t);
325b5840353SAdam Hornáček             throw t;
326b5840353SAdam Hornáček         }
327b5840353SAdam Hornáček     }
328b5840353SAdam Hornáček 
329b5840353SAdam Hornáček     /**
330b5840353SAdam Hornáček      * Gets a version number to be used to tag documents examined by the guru so
331670ee787SChris Fraire      * that {@link AbstractAnalyzer} selection can be re-done later if a stored
332670ee787SChris Fraire      * version number is different from the current implementation or if guru
333670ee787SChris Fraire      * factory registrations are modified by the user to change the guru
334670ee787SChris Fraire      * operation.
335b5840353SAdam Hornáček      * <p>
336b5840353SAdam Hornáček      * The static part of the version is bumped in a release when e.g. new
337b5840353SAdam Hornáček      * {@link FileAnalyzerFactory} subclasses are registered or when existing
338b5840353SAdam Hornáček      * {@link FileAnalyzerFactory} subclasses are revised to target more or
339b5840353SAdam Hornáček      * different files.
340b5840353SAdam Hornáček      * @return a value whose lower 32-bits are a static value
341ca0eafb1SChris Fraire      * 20201003_00
342b5840353SAdam Hornáček      * for the current implementation and whose higher-32 bits are non-zero if
34357eefa47SKryštof Tulinger      * {@link #addExtension(java.lang.String, AnalyzerFactory)}
344b5840353SAdam Hornáček      * or
34557eefa47SKryštof Tulinger      * {@link #addPrefix(java.lang.String, AnalyzerFactory)}
346b5840353SAdam Hornáček      * has been called.
347b5840353SAdam Hornáček      */
getVersionNo()348b5840353SAdam Hornáček     public static long getVersionNo() {
349ca0eafb1SChris Fraire         final int ver32 = 20201003_00; // Edit comment above too!
350b5840353SAdam Hornáček         long ver = ver32;
351b5840353SAdam Hornáček         if (customizationHashCode != 0) {
352b5840353SAdam Hornáček             ver |= (long) customizationHashCode << 32;
353b5840353SAdam Hornáček         }
354b5840353SAdam Hornáček         return ver;
355b5840353SAdam Hornáček     }
356b5840353SAdam Hornáček 
357b5840353SAdam Hornáček     /**
358b5840353SAdam Hornáček      * Gets a version number according to a registered
359b5840353SAdam Hornáček      * {@link FileAnalyzer#getVersionNo()} for a {@code fileTypeName} according
360b5840353SAdam Hornáček      * to {@link FileAnalyzer#getFileTypeName()}.
361b5840353SAdam Hornáček      * @param fileTypeName a defined instance
362b5840353SAdam Hornáček      * @return a registered value or {@link Long#MIN_VALUE} if
363b5840353SAdam Hornáček      * {@code fileTypeName} is unknown
364b5840353SAdam Hornáček      */
getAnalyzerVersionNo(String fileTypeName)365b5840353SAdam Hornáček     public static long getAnalyzerVersionNo(String fileTypeName) {
366b5840353SAdam Hornáček         return ANALYZER_VERSIONS.getOrDefault(fileTypeName, Long.MIN_VALUE);
367b5840353SAdam Hornáček     }
368b5840353SAdam Hornáček 
getAnalyzersVersionNos()369b5840353SAdam Hornáček     public static Map<String, Long> getAnalyzersVersionNos() {
370b5840353SAdam Hornáček         return Collections.unmodifiableMap(ANALYZER_VERSIONS);
371b5840353SAdam Hornáček     }
372b5840353SAdam Hornáček 
getExtensionsMap()37357eefa47SKryštof Tulinger     public static Map<String, AnalyzerFactory> getExtensionsMap() {
374b5840353SAdam Hornáček         return Collections.unmodifiableMap(ext);
375b5840353SAdam Hornáček     }
376b5840353SAdam Hornáček 
getPrefixesMap()37757eefa47SKryštof Tulinger     public static Map<String, AnalyzerFactory> getPrefixesMap() {
378b5840353SAdam Hornáček         return Collections.unmodifiableMap(pre);
379b5840353SAdam Hornáček     }
380b5840353SAdam Hornáček 
getMagicsMap()38157eefa47SKryštof Tulinger     public static Map<String, AnalyzerFactory> getMagicsMap() {
382b5840353SAdam Hornáček         return Collections.unmodifiableMap(magics);
383b5840353SAdam Hornáček     }
384b5840353SAdam Hornáček 
getAnalyzerFactoryMatchers()385b5840353SAdam Hornáček     public static List<Matcher> getAnalyzerFactoryMatchers() {
386b5840353SAdam Hornáček         return Collections.unmodifiableList(matchers);
387b5840353SAdam Hornáček     }
388b5840353SAdam Hornáček 
getfileTypeDescriptions()389b5840353SAdam Hornáček     public static Map<String, String> getfileTypeDescriptions() {
390b5840353SAdam Hornáček         return Collections.unmodifiableMap(fileTypeDescriptions);
391b5840353SAdam Hornáček     }
392b5840353SAdam Hornáček 
getAnalyzerFactories()393cd560f86SVladimir Kotal     public static List<AnalyzerFactory> getAnalyzerFactories() {
394b5840353SAdam Hornáček         return Collections.unmodifiableList(factories);
395b5840353SAdam Hornáček     }
396b5840353SAdam Hornáček 
39753e96f58SVladimir Kotal     private static final String USED_IN_MULTIPLE_MSG = "' used in multiple analyzers";
39853e96f58SVladimir Kotal 
399b5840353SAdam Hornáček     /**
400b5840353SAdam Hornáček      * Register a {@code FileAnalyzerFactory} instance.
401b5840353SAdam Hornáček      */
registerAnalyzer(AnalyzerFactory factory)40257eefa47SKryštof Tulinger     private static void registerAnalyzer(AnalyzerFactory factory) {
403b5840353SAdam Hornáček         for (String name : factory.getFileNames()) {
40457eefa47SKryštof Tulinger             AnalyzerFactory old = FILE_NAMES.put(name, factory);
405b5840353SAdam Hornáček             assert old == null :
40653e96f58SVladimir Kotal                     "name '" + name + USED_IN_MULTIPLE_MSG;
407b5840353SAdam Hornáček         }
408b5840353SAdam Hornáček         for (String prefix : factory.getPrefixes()) {
40957eefa47SKryštof Tulinger             AnalyzerFactory old = pre.put(prefix, factory);
410b5840353SAdam Hornáček             assert old == null :
41153e96f58SVladimir Kotal                     "prefix '" + prefix + USED_IN_MULTIPLE_MSG;
412b5840353SAdam Hornáček         }
413b5840353SAdam Hornáček         for (String suffix : factory.getSuffixes()) {
41457eefa47SKryštof Tulinger             AnalyzerFactory old = ext.put(suffix, factory);
415b5840353SAdam Hornáček             assert old == null :
41653e96f58SVladimir Kotal                     "suffix '" + suffix + USED_IN_MULTIPLE_MSG;
417b5840353SAdam Hornáček         }
418b5840353SAdam Hornáček         for (String magic : factory.getMagicStrings()) {
41957eefa47SKryštof Tulinger             AnalyzerFactory old = magics.put(magic, factory);
420b5840353SAdam Hornáček             assert old == null :
42153e96f58SVladimir Kotal                     "magic '" + magic + USED_IN_MULTIPLE_MSG;
422b5840353SAdam Hornáček         }
423b5840353SAdam Hornáček         matchers.addAll(factory.getMatchers());
424b5840353SAdam Hornáček         factories.add(factory);
425b5840353SAdam Hornáček 
42657eefa47SKryštof Tulinger         AbstractAnalyzer fa = factory.getAnalyzer();
427b5840353SAdam Hornáček         String fileTypeName = fa.getFileTypeName();
428b5840353SAdam Hornáček         FILETYPE_FACTORIES.put(fileTypeName, factory);
429b5840353SAdam Hornáček         ANALYZER_VERSIONS.put(fileTypeName, fa.getVersionNo());
430adc0ce11SChris Fraire 
431adc0ce11SChris Fraire         // Possibly configure default LANG mappings for the factory.
432adc0ce11SChris Fraire         String ctagsLang = factory.getAnalyzer().getCtagsLang();
433adc0ce11SChris Fraire         if (ctagsLang != null) {
434adc0ce11SChris Fraire             List<String> prefixes = factory.getPrefixes();
435adc0ce11SChris Fraire             if (prefixes != null) {
436adc0ce11SChris Fraire                 for (String prefix : prefixes) {
437adc0ce11SChris Fraire                     defaultLangMap.add(prefix, ctagsLang);
438adc0ce11SChris Fraire                 }
439adc0ce11SChris Fraire             }
440adc0ce11SChris Fraire 
441adc0ce11SChris Fraire             List<String> suffixes = factory.getSuffixes();
442adc0ce11SChris Fraire             if (suffixes != null) {
443adc0ce11SChris Fraire                 for (String suffix : suffixes) {
444adc0ce11SChris Fraire                     // LangMap needs a "." to signify a file extension.
445adc0ce11SChris Fraire                     defaultLangMap.add("." + suffix, ctagsLang);
446adc0ce11SChris Fraire                 }
447adc0ce11SChris Fraire             }
448adc0ce11SChris Fraire         }
449b5840353SAdam Hornáček     }
450b5840353SAdam Hornáček 
451b5840353SAdam Hornáček     /**
452b5840353SAdam Hornáček      * Instruct the AnalyzerGuru to use a given analyzer for a given file
453b5840353SAdam Hornáček      * prefix.
454b5840353SAdam Hornáček      *
455b5840353SAdam Hornáček      * @param prefix the file prefix to add
456b5840353SAdam Hornáček      * @param factory a factory which creates the analyzer to use for the given
457b5840353SAdam Hornáček      * extension (if you pass null as the analyzer, you will disable the
458b5840353SAdam Hornáček      * analyzer used for that extension)
459b5840353SAdam Hornáček      */
addPrefix(String prefix, AnalyzerFactory factory)46057eefa47SKryštof Tulinger     public static void addPrefix(String prefix, AnalyzerFactory factory) {
46157eefa47SKryštof Tulinger         AnalyzerFactory oldFactory;
462b5840353SAdam Hornáček         if (factory == null) {
463b5840353SAdam Hornáček             oldFactory = pre.remove(prefix);
4646de4f5aaSChris Fraire             langMap.exclude(prefix);
465b5840353SAdam Hornáček         } else {
466b5840353SAdam Hornáček             oldFactory = pre.put(prefix, factory);
4676de4f5aaSChris Fraire             langMap.add(prefix, factory.getAnalyzer().getCtagsLang());
468b5840353SAdam Hornáček         }
469b5840353SAdam Hornáček 
470b5840353SAdam Hornáček         if (factoriesDifferent(factory, oldFactory)) {
471b5840353SAdam Hornáček             addCustomizationKey("p:" + prefix);
472b5840353SAdam Hornáček         }
473b5840353SAdam Hornáček     }
474b5840353SAdam Hornáček 
475b5840353SAdam Hornáček     /**
476b5840353SAdam Hornáček      * Instruct the AnalyzerGuru to use a given analyzer for a given file
477b5840353SAdam Hornáček      * extension.
478b5840353SAdam Hornáček      *
479b5840353SAdam Hornáček      * @param extension the file-extension to add
480b5840353SAdam Hornáček      * @param factory a factory which creates the analyzer to use for the given
481b5840353SAdam Hornáček      * extension (if you pass null as the analyzer, you will disable the
482b5840353SAdam Hornáček      * analyzer used for that extension)
4836de4f5aaSChris Fraire      * @throws IllegalArgumentException if {@code extension} contains a period
484b5840353SAdam Hornáček      */
addExtension(String extension, AnalyzerFactory factory)4856de4f5aaSChris Fraire     public static void addExtension(String extension, AnalyzerFactory factory) {
4866de4f5aaSChris Fraire         if (extension.contains(".")) {
4876de4f5aaSChris Fraire             throw new IllegalArgumentException("extension contains a '.'");
4886de4f5aaSChris Fraire         }
4896de4f5aaSChris Fraire 
4906de4f5aaSChris Fraire         // LangMap fileSpec requires a leading period to indicate an extension.
4916de4f5aaSChris Fraire         String langMapExtension = "." + extension;
4926de4f5aaSChris Fraire 
49357eefa47SKryštof Tulinger         AnalyzerFactory oldFactory;
494b5840353SAdam Hornáček         if (factory == null) {
495b5840353SAdam Hornáček             oldFactory = ext.remove(extension);
4966de4f5aaSChris Fraire             langMap.exclude(langMapExtension);
497b5840353SAdam Hornáček         } else {
498b5840353SAdam Hornáček             oldFactory = ext.put(extension, factory);
4996de4f5aaSChris Fraire             langMap.add(langMapExtension, factory.getAnalyzer().getCtagsLang());
500b5840353SAdam Hornáček         }
501b5840353SAdam Hornáček 
502b5840353SAdam Hornáček         if (factoriesDifferent(factory, oldFactory)) {
503b5840353SAdam Hornáček             addCustomizationKey("e:" + extension);
504b5840353SAdam Hornáček         }
505b5840353SAdam Hornáček     }
506b5840353SAdam Hornáček 
507b5840353SAdam Hornáček     /**
5086de4f5aaSChris Fraire      * Gets an unmodifiable view of the language mappings resulting from
5096de4f5aaSChris Fraire      * {@link #addExtension(String, AnalyzerFactory)} and
510adc0ce11SChris Fraire      * {@link #addPrefix(String, AnalyzerFactory)} merged with default language
511adc0ce11SChris Fraire      * mappings of OpenGrok's analyzers.
5126de4f5aaSChris Fraire      */
getLangMap()5136de4f5aaSChris Fraire     public static LangMap getLangMap() {
514adc0ce11SChris Fraire         return langMap.mergeSecondary(defaultLangMap).unmodifiable();
5156de4f5aaSChris Fraire     }
5166de4f5aaSChris Fraire 
5176de4f5aaSChris Fraire     /**
518b5840353SAdam Hornáček      * Get the default Analyzer.
519b5840353SAdam Hornáček      *
520b5840353SAdam Hornáček      * @return default FileAnalyzer
521b5840353SAdam Hornáček      */
getAnalyzer()52257eefa47SKryštof Tulinger     public static AbstractAnalyzer getAnalyzer() {
523b5840353SAdam Hornáček         return DEFAULT_ANALYZER_FACTORY.getAnalyzer();
524b5840353SAdam Hornáček     }
525b5840353SAdam Hornáček 
526b5840353SAdam Hornáček     /**
527b5840353SAdam Hornáček      * Gets an analyzer for the specified {@code fileTypeName} if it accords
528b5840353SAdam Hornáček      * with a known {@link FileAnalyzer#getFileTypeName()}.
529b5840353SAdam Hornáček      * @param fileTypeName a defined name
530b5840353SAdam Hornáček      * @return a defined instance if known or otherwise {@code null}
531b5840353SAdam Hornáček      */
getAnalyzer(String fileTypeName)53257eefa47SKryštof Tulinger     public static AbstractAnalyzer getAnalyzer(String fileTypeName) {
53357eefa47SKryštof Tulinger         AnalyzerFactory factory = FILETYPE_FACTORIES.get(fileTypeName);
534b5840353SAdam Hornáček         return factory == null ? null : factory.getAnalyzer();
535b5840353SAdam Hornáček     }
536b5840353SAdam Hornáček 
537b5840353SAdam Hornáček     /**
538b5840353SAdam Hornáček      * Get an analyzer suited to analyze a file. This function will reuse
539b5840353SAdam Hornáček      * analyzers since they are costly.
540b5840353SAdam Hornáček      *
541b5840353SAdam Hornáček      * @param in Input stream containing data to be analyzed
542b5840353SAdam Hornáček      * @param file Name of the file to be analyzed
543b5840353SAdam Hornáček      * @return An analyzer suited for that file content
544b5840353SAdam Hornáček      * @throws java.io.IOException If an error occurs while accessing the data
545b5840353SAdam Hornáček      * in the input stream.
546b5840353SAdam Hornáček      */
getAnalyzer(InputStream in, String file)54757eefa47SKryštof Tulinger     public static AbstractAnalyzer getAnalyzer(InputStream in, String file) throws IOException {
54857eefa47SKryštof Tulinger         AnalyzerFactory factory = find(in, file);
549b5840353SAdam Hornáček         if (factory == null) {
55057eefa47SKryštof Tulinger             AbstractAnalyzer defaultAnalyzer = getAnalyzer();
55130bba29fSChris Fraire             if (LOGGER.isLoggable(Level.FINEST)) {
55230bba29fSChris Fraire                 LOGGER.log(Level.FINEST, "{0}: fallback {1}",
553b5840353SAdam Hornáček                     new Object[]{file,
554b5840353SAdam Hornáček                     defaultAnalyzer.getClass().getSimpleName() });
555b5840353SAdam Hornáček             }
556b5840353SAdam Hornáček             return defaultAnalyzer;
557b5840353SAdam Hornáček         }
558b5840353SAdam Hornáček         return factory.getAnalyzer();
559b5840353SAdam Hornáček     }
560b5840353SAdam Hornáček 
561b5840353SAdam Hornáček     /**
5627a857b12SVladimir Kotal      * Free resources associated with all registered analyzers.
5637a857b12SVladimir Kotal      */
returnAnalyzers()5647a857b12SVladimir Kotal     public static void returnAnalyzers() {
56557eefa47SKryštof Tulinger         for (AnalyzerFactory analyzer : factories) {
5667a857b12SVladimir Kotal             analyzer.returnAnalyzer();
5677a857b12SVladimir Kotal         }
5687a857b12SVladimir Kotal     }
5697a857b12SVladimir Kotal 
5707a857b12SVladimir Kotal     /**
571b5840353SAdam Hornáček      * Populate a Lucene document with the required fields.
572b5840353SAdam Hornáček      *
573b5840353SAdam Hornáček      * @param doc The document to populate
574b5840353SAdam Hornáček      * @param file The file to index
575b5840353SAdam Hornáček      * @param path Where the file is located (from source root)
576b5840353SAdam Hornáček      * @param fa The analyzer to use on the file
577b5840353SAdam Hornáček      * @param xrefOut Where to write the xref (possibly {@code null})
578b5840353SAdam Hornáček      * @throws IOException If an exception occurs while collecting the data
579b5840353SAdam Hornáček      * @throws InterruptedException if a timeout occurs
580b5840353SAdam Hornáček      */
populateDocument(Document doc, File file, String path, AbstractAnalyzer fa, Writer xrefOut)58127c0cfcdSVladimir Kotal     public void populateDocument(Document doc, File file, String path, AbstractAnalyzer fa, Writer xrefOut)
58227c0cfcdSVladimir Kotal             throws IOException, InterruptedException {
583b5840353SAdam Hornáček 
584b5840353SAdam Hornáček         String date = DateTools.timeToString(file.lastModified(),
585b5840353SAdam Hornáček                 DateTools.Resolution.MILLISECOND);
586807ead8fSLubos Kosco         path = Util.fixPathIfWindows(path);
587b5840353SAdam Hornáček         doc.add(new Field(QueryBuilder.U, Util.path2uid(path, date),
588b5840353SAdam Hornáček                 string_ft_stored_nanalyzed_norms));
589b5840353SAdam Hornáček         doc.add(new Field(QueryBuilder.FULLPATH, file.getAbsolutePath(),
590b5840353SAdam Hornáček                 string_ft_nstored_nanalyzed_norms));
591b5840353SAdam Hornáček         doc.add(new SortedDocValuesField(QueryBuilder.FULLPATH,
592b5840353SAdam Hornáček                 new BytesRef(file.getAbsolutePath())));
593b5840353SAdam Hornáček 
594b5840353SAdam Hornáček         if (RuntimeEnvironment.getInstance().isHistoryEnabled()) {
595b5840353SAdam Hornáček             try {
59636b6a6f8SVladimir Kotal                 HistoryGuru histGuru = HistoryGuru.getInstance();
59736b6a6f8SVladimir Kotal                 HistoryReader hr = histGuru.getHistoryReader(file);
598b5840353SAdam Hornáček                 if (hr != null) {
599b5840353SAdam Hornáček                     doc.add(new TextField(QueryBuilder.HIST, hr));
60036b6a6f8SVladimir Kotal                     History history;
60136b6a6f8SVladimir Kotal                     if ((history = histGuru.getHistory(file)) != null) {
60236b6a6f8SVladimir Kotal                         List<HistoryEntry> historyEntries = history.getHistoryEntries(1, 0);
60353e96f58SVladimir Kotal                         if (!historyEntries.isEmpty()) {
60436b6a6f8SVladimir Kotal                             HistoryEntry histEntry = historyEntries.get(0);
60536b6a6f8SVladimir Kotal                             doc.add(new TextField(QueryBuilder.LASTREV, histEntry.getRevision(), Store.YES));
60636b6a6f8SVladimir Kotal                         }
60736b6a6f8SVladimir Kotal                     }
608b5840353SAdam Hornáček                 }
609b5840353SAdam Hornáček             } catch (HistoryException e) {
610b5840353SAdam Hornáček                 LOGGER.log(Level.WARNING, "An error occurred while reading history: ", e);
611b5840353SAdam Hornáček             }
612b5840353SAdam Hornáček         }
613b5840353SAdam Hornáček         doc.add(new Field(QueryBuilder.DATE, date, string_ft_stored_nanalyzed_norms));
614b5840353SAdam Hornáček         doc.add(new SortedDocValuesField(QueryBuilder.DATE, new BytesRef(date)));
615b5840353SAdam Hornáček 
616b5840353SAdam Hornáček         // `path' is not null, as it was passed to Util.path2uid() above.
617b5840353SAdam Hornáček         doc.add(new TextField(QueryBuilder.PATH, path, Store.YES));
618b5840353SAdam Hornáček         Project project = Project.getProject(path);
619b5840353SAdam Hornáček         if (project != null) {
620b5840353SAdam Hornáček             doc.add(new TextField(QueryBuilder.PROJECT, project.getPath(), Store.YES));
621b5840353SAdam Hornáček         }
622b5840353SAdam Hornáček 
623b5840353SAdam Hornáček         /*
624b5840353SAdam Hornáček          * Use the parent of the path -- not the absolute file as is done for
625b5840353SAdam Hornáček          * FULLPATH -- so that DIRPATH is the same convention as for PATH
626b5840353SAdam Hornáček          * above. A StringField, however, is used instead of a TextField.
627b5840353SAdam Hornáček          */
628b5840353SAdam Hornáček         File fpath = new File(path);
629b5840353SAdam Hornáček         String fileParent = fpath.getParent();
630b5840353SAdam Hornáček         if (fileParent != null && fileParent.length() > 0) {
631b5840353SAdam Hornáček             String normalizedPath = QueryBuilder.normalizeDirPath(fileParent);
632b5840353SAdam Hornáček             StringField npstring = new StringField(QueryBuilder.DIRPATH,
633b5840353SAdam Hornáček                 normalizedPath, Store.NO);
634b5840353SAdam Hornáček             doc.add(npstring);
635b5840353SAdam Hornáček         }
636b5840353SAdam Hornáček 
637b5840353SAdam Hornáček         if (fa != null) {
63857eefa47SKryštof Tulinger             AbstractAnalyzer.Genre g = fa.getGenre();
63957eefa47SKryštof Tulinger             if (g == AbstractAnalyzer.Genre.PLAIN || g == AbstractAnalyzer.Genre.XREFABLE || g == AbstractAnalyzer.Genre.HTML) {
640a72324b1SAdam Hornáček                 doc.add(new Field(QueryBuilder.T, g.typeName(), string_ft_stored_nanalyzed_norms));
641b5840353SAdam Hornáček             }
642b5840353SAdam Hornáček             fa.analyze(doc, StreamSource.fromFile(file), xrefOut);
643b5840353SAdam Hornáček 
644b5840353SAdam Hornáček             String type = fa.getFileTypeName();
645b5840353SAdam Hornáček             doc.add(new StringField(QueryBuilder.TYPE, type, Store.YES));
646b5840353SAdam Hornáček         }
647b5840353SAdam Hornáček     }
648b5840353SAdam Hornáček 
649b5840353SAdam Hornáček     /**
650ff44f24aSAdam Hornáček      * Write a browse-able version of the file.
651b5840353SAdam Hornáček      *
652b5840353SAdam Hornáček      * @param factory The analyzer factory for this file type
653b5840353SAdam Hornáček      * @param in The input stream containing the data
654b5840353SAdam Hornáček      * @param out Where to write the result
655b5840353SAdam Hornáček      * @param defs definitions for the source file, if available
656b5840353SAdam Hornáček      * @param annotation Annotation information for the file
657b5840353SAdam Hornáček      * @param project Project the file belongs to
658b5840353SAdam Hornáček      * @throws java.io.IOException If an error occurs while creating the output
659b5840353SAdam Hornáček      */
writeXref(AnalyzerFactory factory, Reader in, Writer out, Definitions defs, Annotation annotation, Project project)66057eefa47SKryštof Tulinger     public static void writeXref(AnalyzerFactory factory, Reader in,
661b5840353SAdam Hornáček             Writer out, Definitions defs,
662b5840353SAdam Hornáček             Annotation annotation, Project project)
663b5840353SAdam Hornáček             throws IOException {
664b5840353SAdam Hornáček         Reader input = in;
66557eefa47SKryštof Tulinger         if (factory.getGenre() == AbstractAnalyzer.Genre.PLAIN) {
666b5840353SAdam Hornáček             // This is some kind of text file, so we need to expand tabs to
667b5840353SAdam Hornáček             // spaces to match the project's tab settings.
668b5840353SAdam Hornáček             input = ExpandTabsReader.wrap(in, project);
669b5840353SAdam Hornáček         }
670b5840353SAdam Hornáček 
671b5840353SAdam Hornáček         WriteXrefArgs args = new WriteXrefArgs(input, out);
672b5840353SAdam Hornáček         args.setDefs(defs);
673b5840353SAdam Hornáček         args.setAnnotation(annotation);
674b5840353SAdam Hornáček         args.setProject(project);
675b5840353SAdam Hornáček 
67657eefa47SKryštof Tulinger         AbstractAnalyzer analyzer = factory.getAnalyzer();
677b5840353SAdam Hornáček         RuntimeEnvironment env = RuntimeEnvironment.getInstance();
678b5840353SAdam Hornáček         analyzer.setScopesEnabled(env.isScopesEnabled());
679b5840353SAdam Hornáček         analyzer.setFoldingEnabled(env.isFoldingEnabled());
680b5840353SAdam Hornáček         analyzer.writeXref(args);
681b5840353SAdam Hornáček     }
682b5840353SAdam Hornáček 
683b5840353SAdam Hornáček     /**
684b5840353SAdam Hornáček      * Writes a browse-able version of the file transformed for immediate
685b5840353SAdam Hornáček      * serving to a web client.
686b5840353SAdam Hornáček      * @param contextPath the web context path for
687b5840353SAdam Hornáček      * {@link Util#dumpXref(java.io.Writer, java.io.Reader, java.lang.String)}
688b5840353SAdam Hornáček      * @param factory the analyzer factory for this file type
689b5840353SAdam Hornáček      * @param in the input stream containing the data
690b5840353SAdam Hornáček      * @param out a defined instance to write
691b5840353SAdam Hornáček      * @param defs definitions for the source file, if available
692b5840353SAdam Hornáček      * @param annotation annotation information for the file
693b5840353SAdam Hornáček      * @param project project the file belongs to
694b5840353SAdam Hornáček      * @throws java.io.IOException if an error occurs while creating the output
695b5840353SAdam Hornáček      */
writeDumpedXref(String contextPath, AnalyzerFactory factory, Reader in, Writer out, Definitions defs, Annotation annotation, Project project)696b5840353SAdam Hornáček     public static void writeDumpedXref(String contextPath,
69757eefa47SKryštof Tulinger             AnalyzerFactory factory, Reader in, Writer out,
698b5840353SAdam Hornáček             Definitions defs, Annotation annotation, Project project)
699b5840353SAdam Hornáček             throws IOException {
700b5840353SAdam Hornáček 
701b5840353SAdam Hornáček         File xrefTemp = File.createTempFile("ogxref", ".html");
702b5840353SAdam Hornáček         try {
703b5840353SAdam Hornáček             try (FileWriter tmpout = new FileWriter(xrefTemp)) {
704b5840353SAdam Hornáček                 writeXref(factory, in, tmpout, defs, annotation, project);
705b5840353SAdam Hornáček             }
706b5840353SAdam Hornáček             Util.dumpXref(out, xrefTemp, false, contextPath);
707b5840353SAdam Hornáček         } finally {
70853e96f58SVladimir Kotal             Files.delete(xrefTemp.toPath());
709b5840353SAdam Hornáček         }
710b5840353SAdam Hornáček     }
711b5840353SAdam Hornáček 
712b5840353SAdam Hornáček     /**
713ff44f24aSAdam Hornáček      * Get the genre of a file.
714b5840353SAdam Hornáček      *
715b5840353SAdam Hornáček      * @param file The file to inspect
716b5840353SAdam Hornáček      * @return The genre suitable to decide how to display the file
717b5840353SAdam Hornáček      */
getGenre(String file)71857eefa47SKryštof Tulinger     public static AbstractAnalyzer.Genre getGenre(String file) {
719b5840353SAdam Hornáček         return getGenre(find(file));
720b5840353SAdam Hornáček     }
721b5840353SAdam Hornáček 
722b5840353SAdam Hornáček     /**
723ff44f24aSAdam Hornáček      * Get the genre of a bulk of data.
724b5840353SAdam Hornáček      *
725b5840353SAdam Hornáček      * @param in A stream containing the data
726b5840353SAdam Hornáček      * @return The genre suitable to decide how to display the file
727b5840353SAdam Hornáček      * @throws java.io.IOException If an error occurs while getting the content
728b5840353SAdam Hornáček      */
getGenre(InputStream in)72957eefa47SKryštof Tulinger     public static AbstractAnalyzer.Genre getGenre(InputStream in) throws IOException {
730b5840353SAdam Hornáček         return getGenre(find(in));
731b5840353SAdam Hornáček     }
732b5840353SAdam Hornáček 
733b5840353SAdam Hornáček     /**
734ff44f24aSAdam Hornáček      * Get the genre for a named class (this is most likely an analyzer).
735b5840353SAdam Hornáček      *
736b5840353SAdam Hornáček      * @param factory the analyzer factory to get the genre for
737b5840353SAdam Hornáček      * @return The genre of this class (null if not found)
738b5840353SAdam Hornáček      */
getGenre(AnalyzerFactory factory)73957eefa47SKryštof Tulinger     public static AbstractAnalyzer.Genre getGenre(AnalyzerFactory factory) {
740b5840353SAdam Hornáček         if (factory != null) {
741b5840353SAdam Hornáček             return factory.getGenre();
742b5840353SAdam Hornáček         }
743b5840353SAdam Hornáček         return null;
744b5840353SAdam Hornáček     }
745b5840353SAdam Hornáček 
746b5840353SAdam Hornáček     /**
747ee13dbaeSChris Fraire      * Finds a {@code FileAnalyzerFactory} for the specified
748ee13dbaeSChris Fraire      * {@link FileAnalyzer#getFileTypeName()}.
749ee13dbaeSChris Fraire      * @param fileTypeName a defined instance
750ee13dbaeSChris Fraire      * @return a defined instance or {@code null}
751ee13dbaeSChris Fraire      */
findByFileTypeName(String fileTypeName)75257eefa47SKryštof Tulinger     public static AnalyzerFactory findByFileTypeName(String fileTypeName) {
753ee13dbaeSChris Fraire         return FILETYPE_FACTORIES.get(fileTypeName);
754ee13dbaeSChris Fraire     }
755ee13dbaeSChris Fraire 
756ee13dbaeSChris Fraire     /**
757b5840353SAdam Hornáček      * Find a {@code FileAnalyzerFactory} with the specified class name. If one
758b5840353SAdam Hornáček      * doesn't exist, create one and register it. Allow specification of either
759b5840353SAdam Hornáček      * the complete class name (which includes the package name) or the simple
760b5840353SAdam Hornáček      * name of the class.
761b5840353SAdam Hornáček      *
762b5840353SAdam Hornáček      * @param factoryClassName name of the factory class
763b5840353SAdam Hornáček      * @return a file analyzer factory
764b5840353SAdam Hornáček      *
765b5840353SAdam Hornáček      * @throws ClassNotFoundException if there is no class with that name
766b5840353SAdam Hornáček      * @throws ClassCastException if the class is not a subclass of {@code
767b5840353SAdam Hornáček      * FileAnalyzerFactory}
768b5840353SAdam Hornáček      * @throws IllegalAccessException if the constructor cannot be accessed
769b5840353SAdam Hornáček      * @throws InstantiationException if the class cannot be instantiated
770b5840353SAdam Hornáček      * @throws NoSuchMethodException if no-argument constructor could not be found
771b5840353SAdam Hornáček      * @throws InvocationTargetException if the underlying constructor throws an exception
772b5840353SAdam Hornáček      */
findFactory(String factoryClassName)77357eefa47SKryštof Tulinger     public static AnalyzerFactory findFactory(String factoryClassName)
774b5840353SAdam Hornáček             throws ClassNotFoundException, IllegalAccessException, InstantiationException, NoSuchMethodException,
775b5840353SAdam Hornáček             InvocationTargetException {
776c5ef7ff6SChris Fraire         Class<?> fcn;
777b5840353SAdam Hornáček         try {
778b5840353SAdam Hornáček             fcn = Class.forName(factoryClassName);
779b5840353SAdam Hornáček 
780b5840353SAdam Hornáček         } catch (ClassNotFoundException e) {
781b5840353SAdam Hornáček             fcn = getFactoryClass(factoryClassName);
782b5840353SAdam Hornáček 
783b5840353SAdam Hornáček             if (fcn == null) {
784b5840353SAdam Hornáček                 throw new ClassNotFoundException("Unable to locate class " + factoryClassName);
785b5840353SAdam Hornáček             }
786b5840353SAdam Hornáček         }
787b5840353SAdam Hornáček 
788b5840353SAdam Hornáček         return findFactory(fcn);
789b5840353SAdam Hornáček     }
790b5840353SAdam Hornáček 
791b5840353SAdam Hornáček     /**
792b5840353SAdam Hornáček      * Get Analyzer factory class using class simple name.
793b5840353SAdam Hornáček      *
794b5840353SAdam Hornáček      * @param simpleName which may be either the factory class
795b5840353SAdam Hornáček      * simple name (eg. CAnalyzerFactory), the analyzer name
796b5840353SAdam Hornáček      * (eg. CAnalyzer), or the language name (eg. C)
797b5840353SAdam Hornáček      *
798b5840353SAdam Hornáček      * @return the analyzer factory class, or null when not found.
799b5840353SAdam Hornáček      */
getFactoryClass(String simpleName)800b5840353SAdam Hornáček     public static Class<?> getFactoryClass(String simpleName) {
801b5840353SAdam Hornáček         Class<?> factoryClass = null;
802b5840353SAdam Hornáček 
803b5840353SAdam Hornáček         // Build analysis package name list first time only
804b5840353SAdam Hornáček         if (analysisPkgNames.isEmpty()) {
805b5840353SAdam Hornáček             Package[] p = Package.getPackages();
806b5840353SAdam Hornáček             for (Package pp : p) {
807b5840353SAdam Hornáček                 String pname = pp.getName();
808ff44f24aSAdam Hornáček                 if (pname.contains(".analysis.")) {
809b5840353SAdam Hornáček                     analysisPkgNames.add(pname);
810b5840353SAdam Hornáček                 }
811b5840353SAdam Hornáček             }
812b5840353SAdam Hornáček         }
813b5840353SAdam Hornáček 
814b5840353SAdam Hornáček         // This allows user to enter the language or analyzer name
815b5840353SAdam Hornáček         // (eg. C or CAnalyzer vs. CAnalyzerFactory)
816b5840353SAdam Hornáček         // Note that this assumes a regular naming scheme of
817b5840353SAdam Hornáček         // all language parsers:
818b5840353SAdam Hornáček         //      <language>Analyzer, <language>AnalyzerFactory
819b5840353SAdam Hornáček 
820ff44f24aSAdam Hornáček         if (!simpleName.contains("Analyzer")) {
821b5840353SAdam Hornáček             simpleName += "Analyzer";
822b5840353SAdam Hornáček         }
823b5840353SAdam Hornáček 
824ff44f24aSAdam Hornáček         if (!simpleName.contains("Factory")) {
825b5840353SAdam Hornáček             simpleName += "Factory";
826b5840353SAdam Hornáček         }
827b5840353SAdam Hornáček 
828b5840353SAdam Hornáček         for (String aPackage : analysisPkgNames) {
829b5840353SAdam Hornáček             try {
830b5840353SAdam Hornáček                 String fqn = aPackage + "." + simpleName;
831b5840353SAdam Hornáček                 factoryClass = Class.forName(fqn);
832b5840353SAdam Hornáček                 break;
833b5840353SAdam Hornáček             } catch (ClassNotFoundException e) {
834b5840353SAdam Hornáček                 // Ignore
835b5840353SAdam Hornáček             }
836b5840353SAdam Hornáček         }
837b5840353SAdam Hornáček 
838b5840353SAdam Hornáček         return factoryClass;
839b5840353SAdam Hornáček     }
840b5840353SAdam Hornáček 
841b5840353SAdam Hornáček     /**
842b5840353SAdam Hornáček      * Find a {@code FileAnalyzerFactory} which is an instance of the specified
843b5840353SAdam Hornáček      * class. If one doesn't exist, create one and register it.
844b5840353SAdam Hornáček      *
845b5840353SAdam Hornáček      * @param factoryClass the factory class
846b5840353SAdam Hornáček      * @return a file analyzer factory
847b5840353SAdam Hornáček      *
848b5840353SAdam Hornáček      * @throws ClassCastException if the class is not a subclass of {@code
849b5840353SAdam Hornáček      * FileAnalyzerFactory}
850b5840353SAdam Hornáček      * @throws IllegalAccessException if the constructor cannot be accessed
851b5840353SAdam Hornáček      * @throws InstantiationException if the class cannot be instantiated
852b5840353SAdam Hornáček      * @throws NoSuchMethodException if no-argument constructor could not be found
853b5840353SAdam Hornáček      * @throws InvocationTargetException if the underlying constructor throws an exception
854b5840353SAdam Hornáček      */
findFactory(Class<?> factoryClass)85557eefa47SKryštof Tulinger     private static AnalyzerFactory findFactory(Class<?> factoryClass)
856b5840353SAdam Hornáček             throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
85757eefa47SKryštof Tulinger         for (AnalyzerFactory f : factories) {
858b5840353SAdam Hornáček             if (f.getClass() == factoryClass) {
859b5840353SAdam Hornáček                 return f;
860b5840353SAdam Hornáček             }
861b5840353SAdam Hornáček         }
86257eefa47SKryštof Tulinger         AnalyzerFactory f = (AnalyzerFactory) factoryClass.getDeclaredConstructor().newInstance();
863b5840353SAdam Hornáček         registerAnalyzer(f);
864b5840353SAdam Hornáček         return f;
865b5840353SAdam Hornáček     }
866b5840353SAdam Hornáček 
867b5840353SAdam Hornáček     /**
868b5840353SAdam Hornáček      * Finds a suitable analyser class for file name. If the analyzer cannot be
869b5840353SAdam Hornáček      * determined by the file extension, try to look at the data in the
870b5840353SAdam Hornáček      * InputStream to find a suitable analyzer.
871b5840353SAdam Hornáček      *
872b5840353SAdam Hornáček      * Use if you just want to find file type.
873b5840353SAdam Hornáček      *
874b5840353SAdam Hornáček      * @param in The input stream containing the data
875b5840353SAdam Hornáček      * @param file The file name to get the analyzer for
876b5840353SAdam Hornáček      * @return the analyzer factory to use
87753e96f58SVladimir Kotal      * @throws java.io.IOException If a problem occurred while reading the data
878b5840353SAdam Hornáček      */
find(InputStream in, String file)87953e96f58SVladimir Kotal     public static AnalyzerFactory find(InputStream in, String file) throws IOException {
88057eefa47SKryštof Tulinger         AnalyzerFactory factory = find(file);
88153e96f58SVladimir Kotal 
882b5840353SAdam Hornáček         if (factory != null) {
883b5840353SAdam Hornáček             return factory;
884b5840353SAdam Hornáček         }
885b5840353SAdam Hornáček         return findForStream(in, file);
886b5840353SAdam Hornáček     }
887b5840353SAdam Hornáček 
888b5840353SAdam Hornáček     /**
889b5840353SAdam Hornáček      * Finds a suitable analyser class for file name.
890b5840353SAdam Hornáček      *
891b5840353SAdam Hornáček      * @param file The file name to get the analyzer for
892b5840353SAdam Hornáček      * @return the analyzer factory to use
893b5840353SAdam Hornáček      */
find(String file)89457eefa47SKryštof Tulinger     public static AnalyzerFactory find(String file) {
895b5840353SAdam Hornáček         String path = file;
896b5840353SAdam Hornáček         int i;
897b5840353SAdam Hornáček 
898b5840353SAdam Hornáček         // Get basename of the file first.
899b5840353SAdam Hornáček         if (((i = path.lastIndexOf(File.separatorChar)) > 0)
900b5840353SAdam Hornáček                 && (i + 1 < path.length())) {
901b5840353SAdam Hornáček             path = path.substring(i + 1);
902b5840353SAdam Hornáček         }
903b5840353SAdam Hornáček 
904b5840353SAdam Hornáček         int dotpos = path.lastIndexOf('.');
905b5840353SAdam Hornáček         if (dotpos >= 0) {
90657eefa47SKryštof Tulinger             AnalyzerFactory factory;
907b5840353SAdam Hornáček 
908b5840353SAdam Hornáček             // Try matching the prefix.
909b5840353SAdam Hornáček             if (dotpos > 0) {
91052dccac1SChris Fraire                 factory = pre.get(path.substring(0, dotpos).toUpperCase(Locale.ROOT));
911b5840353SAdam Hornáček                 if (factory != null) {
91230bba29fSChris Fraire                     if (LOGGER.isLoggable(Level.FINEST)) {
91330bba29fSChris Fraire                         LOGGER.log(Level.FINEST, "{0}: chosen by prefix: {1}",
914b5840353SAdam Hornáček                             new Object[]{file,
915b5840353SAdam Hornáček                             factory.getClass().getSimpleName() });
916b5840353SAdam Hornáček                     }
917b5840353SAdam Hornáček                     return factory;
918b5840353SAdam Hornáček                 }
919b5840353SAdam Hornáček             }
920b5840353SAdam Hornáček 
921b5840353SAdam Hornáček             // Now try matching the suffix. We kind of consider this order (first
922b5840353SAdam Hornáček             // prefix then suffix) to be workable although for sure there can be
923b5840353SAdam Hornáček             // cases when this does not work.
92452dccac1SChris Fraire             factory = ext.get(path.substring(dotpos + 1).toUpperCase(Locale.ROOT));
925b5840353SAdam Hornáček             if (factory != null) {
92630bba29fSChris Fraire                 if (LOGGER.isLoggable(Level.FINEST)) {
92730bba29fSChris Fraire                     LOGGER.log(Level.FINEST, "{0}: chosen by suffix: {1}",
928b5840353SAdam Hornáček                         new Object[]{file,
929b5840353SAdam Hornáček                         factory.getClass().getSimpleName() });
930b5840353SAdam Hornáček                 }
931b5840353SAdam Hornáček                 return factory;
932b5840353SAdam Hornáček             }
933b5840353SAdam Hornáček         }
934b5840353SAdam Hornáček 
935b5840353SAdam Hornáček         // file doesn't have any of the prefix or extensions we know, try full match
93652dccac1SChris Fraire         return FILE_NAMES.get(path.toUpperCase(Locale.ROOT));
937b5840353SAdam Hornáček     }
938b5840353SAdam Hornáček 
939b5840353SAdam Hornáček     /**
940ff44f24aSAdam Hornáček      * Finds a suitable analyzer class for the data in this stream.
941b5840353SAdam Hornáček      *
942b5840353SAdam Hornáček      * @param in The stream containing the data to analyze
943b5840353SAdam Hornáček      * @return the analyzer factory to use
944b5840353SAdam Hornáček      * @throws java.io.IOException if an error occurs while reading data from
945b5840353SAdam Hornáček      * the stream
946b5840353SAdam Hornáček      */
find(InputStream in)94757eefa47SKryštof Tulinger     public static AnalyzerFactory find(InputStream in) throws IOException {
948b5840353SAdam Hornáček         return findForStream(in, "<anonymous>");
949b5840353SAdam Hornáček     }
950b5840353SAdam Hornáček 
951b5840353SAdam Hornáček     /**
952b5840353SAdam Hornáček      * Finds a suitable analyzer class for the data in this stream
953ff44f24aSAdam Hornáček      * corresponding to a file of the specified name.
954b5840353SAdam Hornáček      *
955b5840353SAdam Hornáček      * @param in The stream containing the data to analyze
956b5840353SAdam Hornáček      * @param file The file name to get the analyzer for
957b5840353SAdam Hornáček      * @return the analyzer factory to use
958b5840353SAdam Hornáček      * @throws java.io.IOException if an error occurs while reading data from
959b5840353SAdam Hornáček      * the stream
960b5840353SAdam Hornáček      */
findForStream(InputStream in, String file)96157eefa47SKryštof Tulinger     private static AnalyzerFactory findForStream(InputStream in,
962b5840353SAdam Hornáček         String file) throws IOException {
963b5840353SAdam Hornáček 
964b5840353SAdam Hornáček         in.mark(MAGIC_BYTES_NUM);
965b5840353SAdam Hornáček         byte[] content = new byte[MAGIC_BYTES_NUM];
966b5840353SAdam Hornáček         int len = in.read(content);
967b5840353SAdam Hornáček         in.reset();
968b5840353SAdam Hornáček 
969b5840353SAdam Hornáček         if (len < MAGIC_BYTES_NUM) {
970b5840353SAdam Hornáček             /*
971b5840353SAdam Hornáček              * Need at least 4 bytes to perform magic string matching.
972b5840353SAdam Hornáček              */
973b5840353SAdam Hornáček             if (len < 4) {
974b5840353SAdam Hornáček                 return null;
975b5840353SAdam Hornáček             }
976b5840353SAdam Hornáček             content = Arrays.copyOf(content, len);
977b5840353SAdam Hornáček         }
978b5840353SAdam Hornáček 
97957eefa47SKryštof Tulinger         AnalyzerFactory fac;
980b5840353SAdam Hornáček 
981b5840353SAdam Hornáček         // First, do precise-magic Matcher matching
982b5840353SAdam Hornáček         for (FileAnalyzerFactory.Matcher matcher : matchers) {
983*750b3115SChris Fraire             if (matcher.isPreciseMagic()) {
984b5840353SAdam Hornáček                 fac = matcher.isMagic(content, in);
985b5840353SAdam Hornáček                 if (fac != null) {
98630bba29fSChris Fraire                     if (LOGGER.isLoggable(Level.FINEST)) {
98730bba29fSChris Fraire                         LOGGER.log(Level.FINEST,
988b5840353SAdam Hornáček                             "{0}: chosen by precise magic: {1}", new Object[]{
989b5840353SAdam Hornáček                             file, fac.getClass().getSimpleName() });
990b5840353SAdam Hornáček                     }
991b5840353SAdam Hornáček                     return fac;
992b5840353SAdam Hornáček                 }
993b5840353SAdam Hornáček             }
994b5840353SAdam Hornáček         }
995b5840353SAdam Hornáček 
996b5840353SAdam Hornáček         // Next, look for magic strings
997b5840353SAdam Hornáček         String opening = readOpening(in, content);
998b5840353SAdam Hornáček         fac = findMagicString(opening, file);
999b5840353SAdam Hornáček         if (fac != null) {
1000b5840353SAdam Hornáček             return fac;
1001b5840353SAdam Hornáček         }
1002b5840353SAdam Hornáček 
1003b5840353SAdam Hornáček         // Last, do imprecise-magic Matcher matching
1004b5840353SAdam Hornáček         for (FileAnalyzerFactory.Matcher matcher : matchers) {
1005*750b3115SChris Fraire             if (!matcher.isPreciseMagic()) {
1006b5840353SAdam Hornáček                 fac = matcher.isMagic(content, in);
1007b5840353SAdam Hornáček                 if (fac != null) {
100830bba29fSChris Fraire                     if (LOGGER.isLoggable(Level.FINEST)) {
100930bba29fSChris Fraire                         LOGGER.log(Level.FINEST,
1010b5840353SAdam Hornáček                             "{0}: chosen by imprecise magic: {1}",
1011b5840353SAdam Hornáček                             new Object[]{file,
1012b5840353SAdam Hornáček                             fac.getClass().getSimpleName() });
1013b5840353SAdam Hornáček                     }
1014b5840353SAdam Hornáček                     return fac;
1015b5840353SAdam Hornáček                 }
1016b5840353SAdam Hornáček             }
1017b5840353SAdam Hornáček         }
1018b5840353SAdam Hornáček 
1019b5840353SAdam Hornáček         return null;
1020b5840353SAdam Hornáček     }
1021b5840353SAdam Hornáček 
findMagicString(String opening, String file)1022c5ef7ff6SChris Fraire     private static AnalyzerFactory findMagicString(String opening, String file) {
1023b5840353SAdam Hornáček 
1024b5840353SAdam Hornáček         // first, try to look up two words in magics
1025b5840353SAdam Hornáček         String fragment = getWords(opening, 2);
102657eefa47SKryštof Tulinger         AnalyzerFactory fac = magics.get(fragment);
1027b5840353SAdam Hornáček         if (fac != null) {
102830bba29fSChris Fraire             if (LOGGER.isLoggable(Level.FINEST)) {
102930bba29fSChris Fraire                 LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}",
1030b5840353SAdam Hornáček                     new Object[]{file, fac.getClass().getSimpleName(),
1031b5840353SAdam Hornáček                     fragment});
1032b5840353SAdam Hornáček             }
1033b5840353SAdam Hornáček             return fac;
1034b5840353SAdam Hornáček         }
1035b5840353SAdam Hornáček 
1036b5840353SAdam Hornáček         // second, try to look up one word in magics
1037b5840353SAdam Hornáček         fragment = getWords(opening, 1);
1038b5840353SAdam Hornáček         fac = magics.get(fragment);
1039b5840353SAdam Hornáček         if (fac != null) {
104030bba29fSChris Fraire             if (LOGGER.isLoggable(Level.FINEST)) {
104130bba29fSChris Fraire                 LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}",
1042b5840353SAdam Hornáček                     new Object[]{file, fac.getClass().getSimpleName(),
1043b5840353SAdam Hornáček                     fragment});
1044b5840353SAdam Hornáček             }
1045b5840353SAdam Hornáček             return fac;
1046b5840353SAdam Hornáček         }
1047b5840353SAdam Hornáček 
1048b5840353SAdam Hornáček         // try to match initial substrings (DESC strlen)
104957eefa47SKryštof Tulinger         for (Map.Entry<String, AnalyzerFactory> entry :
1050b5840353SAdam Hornáček             magics.entrySet()) {
1051b5840353SAdam Hornáček             String magic = entry.getKey();
1052b5840353SAdam Hornáček             if (opening.startsWith(magic)) {
1053b5840353SAdam Hornáček                 fac = entry.getValue();
105430bba29fSChris Fraire                 if (LOGGER.isLoggable(Level.FINEST)) {
105530bba29fSChris Fraire                     LOGGER.log(Level.FINEST,
1056b5840353SAdam Hornáček                         "{0}: chosen by magic(substr) {2}: {1}", new Object[]{
1057b5840353SAdam Hornáček                         file, fac.getClass().getSimpleName(), magic});
1058b5840353SAdam Hornáček                 }
1059b5840353SAdam Hornáček                 return fac;
1060b5840353SAdam Hornáček             }
1061b5840353SAdam Hornáček         }
1062b5840353SAdam Hornáček 
1063b5840353SAdam Hornáček         return null;
1064b5840353SAdam Hornáček     }
1065b5840353SAdam Hornáček 
1066b5840353SAdam Hornáček     /**
1067b5840353SAdam Hornáček      * Extract initial words from a String, or take the entire
1068b5840353SAdam Hornáček      * <code>value</code> if not enough words can be identified. (If
1069b5840353SAdam Hornáček      * <code>n</code> is not 1 or more, returns an empty String.) (A "word"
1070b5840353SAdam Hornáček      * ends at each and every space character.)
1071b5840353SAdam Hornáček      *
1072b5840353SAdam Hornáček      * @param value The source from which words are cut
1073b5840353SAdam Hornáček      * @param n The number of words to try to extract
1074b5840353SAdam Hornáček      * @return The extracted words or <code>""</code>
1075b5840353SAdam Hornáček      */
getWords(String value, int n)1076b5840353SAdam Hornáček     private static String getWords(String value, int n) {
1077a72324b1SAdam Hornáček         if (n < 1) {
1078a72324b1SAdam Hornáček             return "";
1079a72324b1SAdam Hornáček         }
1080b5840353SAdam Hornáček         int l = 0;
1081b5840353SAdam Hornáček         while (n-- > 0) {
1082b5840353SAdam Hornáček             int o = l > 0 ? l + 1 : l;
1083b5840353SAdam Hornáček             int i = value.indexOf(' ', o);
1084a72324b1SAdam Hornáček             if (i == -1) {
1085a72324b1SAdam Hornáček                 return value;
1086a72324b1SAdam Hornáček             }
1087b5840353SAdam Hornáček             l = i;
1088b5840353SAdam Hornáček         }
1089b5840353SAdam Hornáček         return value.substring(0, l);
1090b5840353SAdam Hornáček     }
1091b5840353SAdam Hornáček 
1092b5840353SAdam Hornáček     /**
1093b5840353SAdam Hornáček      * Extract an opening string from the input stream, past any BOM, and past
1094b5840353SAdam Hornáček      * any initial whitespace, but only up to <code>OPENING_MAX_CHARS</code> or
1095b5840353SAdam Hornáček      * to the first <code>\n</code> after any non-whitespace. (Hashbang, #!,
1096b5840353SAdam Hornáček      * openings will have superfluous space removed.)
1097b5840353SAdam Hornáček      *
1098b5840353SAdam Hornáček      * @param in The input stream containing the data
1099b5840353SAdam Hornáček      * @param sig The initial sequence of bytes in the input stream
1100b5840353SAdam Hornáček      * @return The extracted string or <code>""</code>
1101b5840353SAdam Hornáček      * @throws java.io.IOException in case of any read error
1102b5840353SAdam Hornáček      */
readOpening(InputStream in, byte[] sig)1103b5840353SAdam Hornáček     private static String readOpening(InputStream in, byte[] sig)
1104b5840353SAdam Hornáček         throws IOException {
1105b5840353SAdam Hornáček 
1106b5840353SAdam Hornáček         in.mark(MARK_READ_LIMIT);
1107b5840353SAdam Hornáček 
1108b5840353SAdam Hornáček         String encoding = IOUtils.findBOMEncoding(sig);
1109b5840353SAdam Hornáček         if (encoding == null) {
1110b5840353SAdam Hornáček             // SRCROOT is read with UTF-8 as a default.
1111b5840353SAdam Hornáček             encoding = StandardCharsets.UTF_8.name();
1112b5840353SAdam Hornáček         } else {
1113b5840353SAdam Hornáček             int skipForBOM = IOUtils.skipForBOM(sig);
1114b5840353SAdam Hornáček             if (in.skip(skipForBOM) < skipForBOM) {
1115b5840353SAdam Hornáček                 in.reset();
1116b5840353SAdam Hornáček                 return "";
1117b5840353SAdam Hornáček             }
1118b5840353SAdam Hornáček         }
1119b5840353SAdam Hornáček 
1120b5840353SAdam Hornáček         int nRead = 0;
1121b5840353SAdam Hornáček         boolean sawNonWhitespace = false;
1122b5840353SAdam Hornáček         boolean lastWhitespace = false;
1123b5840353SAdam Hornáček         boolean postHashbang = false;
1124b5840353SAdam Hornáček         int r;
1125b5840353SAdam Hornáček 
1126b5840353SAdam Hornáček         StringBuilder opening = new StringBuilder();
112753e96f58SVladimir Kotal         BufferedReader readr = new BufferedReader(new InputStreamReader(in, encoding), OPENING_MAX_CHARS);
1128b5840353SAdam Hornáček         while ((r = readr.read()) != -1) {
1129a72324b1SAdam Hornáček             if (++nRead > OPENING_MAX_CHARS) {
1130a72324b1SAdam Hornáček                 break;
1131a72324b1SAdam Hornáček             }
1132b5840353SAdam Hornáček             char c = (char) r;
1133b5840353SAdam Hornáček             boolean isWhitespace = Character.isWhitespace(c);
1134b5840353SAdam Hornáček             if (!sawNonWhitespace) {
1135a72324b1SAdam Hornáček                 if (isWhitespace) {
1136a72324b1SAdam Hornáček                     continue;
1137a72324b1SAdam Hornáček                 }
1138b5840353SAdam Hornáček                 sawNonWhitespace = true;
1139b5840353SAdam Hornáček             }
1140a72324b1SAdam Hornáček             if (c == '\n') {
1141a72324b1SAdam Hornáček                 break;
1142a72324b1SAdam Hornáček             }
1143b5840353SAdam Hornáček 
1144b5840353SAdam Hornáček             if (isWhitespace) {
1145b5840353SAdam Hornáček                 // Track `lastWhitespace' to condense stretches of whitespace,
1146b5840353SAdam Hornáček                 // and use ' ' regardless of actual whitespace character to
1147b5840353SAdam Hornáček                 // accord with magic string definitions.
1148a72324b1SAdam Hornáček                 if (!lastWhitespace && !postHashbang) {
1149a72324b1SAdam Hornáček                     opening.append(' ');
1150a72324b1SAdam Hornáček                 }
1151b5840353SAdam Hornáček             } else {
1152b5840353SAdam Hornáček                 opening.append(c);
1153b5840353SAdam Hornáček                 postHashbang = false;
1154b5840353SAdam Hornáček             }
1155b5840353SAdam Hornáček             lastWhitespace = isWhitespace;
1156b5840353SAdam Hornáček 
1157b5840353SAdam Hornáček             // If the opening starts with "#!", then track so that any
1158b5840353SAdam Hornáček             // trailing whitespace after the hashbang is ignored.
115953e96f58SVladimir Kotal             if (opening.length() == 2 && opening.charAt(0) == '#' && opening.charAt(1) == '!') {
1160b5840353SAdam Hornáček                 postHashbang = true;
1161b5840353SAdam Hornáček             }
1162b5840353SAdam Hornáček         }
1163b5840353SAdam Hornáček 
1164b5840353SAdam Hornáček         in.reset();
1165b5840353SAdam Hornáček         return opening.toString();
1166b5840353SAdam Hornáček     }
1167b5840353SAdam Hornáček 
addCustomizationKey(String k)1168b5840353SAdam Hornáček     private static void addCustomizationKey(String k) {
1169b5840353SAdam Hornáček         CUSTOMIZATION_KEYS.add(k);
1170b5840353SAdam Hornáček         Object[] keys = CUSTOMIZATION_KEYS.toArray();
1171b5840353SAdam Hornáček         customizationHashCode = Objects.hash(keys);
1172b5840353SAdam Hornáček     }
1173b5840353SAdam Hornáček 
factoriesDifferent(AnalyzerFactory a, AnalyzerFactory b)117453e96f58SVladimir Kotal     private static boolean factoriesDifferent(AnalyzerFactory a, AnalyzerFactory b) {
117553e96f58SVladimir Kotal         String aName = null;
1176b5840353SAdam Hornáček         if (a != null) {
117753e96f58SVladimir Kotal             aName = a.getName();
117853e96f58SVladimir Kotal             if (aName == null) {
117953e96f58SVladimir Kotal                 aName = a.getClass().getSimpleName();
1180b5840353SAdam Hornáček             }
1181b5840353SAdam Hornáček         }
118253e96f58SVladimir Kotal         String bName = null;
1183b5840353SAdam Hornáček         if (b != null) {
118453e96f58SVladimir Kotal             bName = b.getName();
118553e96f58SVladimir Kotal             if (bName == null) {
118653e96f58SVladimir Kotal                 bName = b.getClass().getSimpleName();
1187b5840353SAdam Hornáček             }
1188b5840353SAdam Hornáček         }
118953e96f58SVladimir Kotal         if (aName == null && bName == null) {
1190b5840353SAdam Hornáček             return false;
1191b5840353SAdam Hornáček         }
119253e96f58SVladimir Kotal         return aName == null || !aName.equals(bName);
1193b5840353SAdam Hornáček     }
1194b5840353SAdam Hornáček }
1195