/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2017, 2021, Chris Fraire <cfraire@me.com>.
23b5840353SAdam Hornáček */ 249805b761SAdam Hornáček package org.opengrok.indexer.analysis; 25b5840353SAdam Hornáček 26b5840353SAdam Hornáček import java.io.BufferedReader; 27b5840353SAdam Hornáček import java.io.File; 28b5840353SAdam Hornáček import java.io.FileWriter; 29b5840353SAdam Hornáček import java.io.IOException; 30b5840353SAdam Hornáček import java.io.InputStream; 31b5840353SAdam Hornáček import java.io.InputStreamReader; 32b5840353SAdam Hornáček import java.io.Reader; 33b5840353SAdam Hornáček import java.io.Writer; 34b5840353SAdam Hornáček import java.lang.reflect.InvocationTargetException; 35b5840353SAdam Hornáček import java.nio.charset.StandardCharsets; 3653e96f58SVladimir Kotal import java.nio.file.Files; 37b5840353SAdam Hornáček import java.util.ArrayList; 38b5840353SAdam Hornáček import java.util.Arrays; 39b5840353SAdam Hornáček import java.util.Collections; 40b5840353SAdam Hornáček import java.util.Comparator; 41b5840353SAdam Hornáček import java.util.HashMap; 42b5840353SAdam Hornáček import java.util.List; 43b5840353SAdam Hornáček import java.util.Locale; 44b5840353SAdam Hornáček import java.util.Map; 45b5840353SAdam Hornáček import java.util.Objects; 46b5840353SAdam Hornáček import java.util.SortedMap; 47b5840353SAdam Hornáček import java.util.TreeMap; 48b5840353SAdam Hornáček import java.util.TreeSet; 49b5840353SAdam Hornáček import java.util.logging.Level; 50b5840353SAdam Hornáček import java.util.logging.Logger; 51b5840353SAdam Hornáček import org.apache.lucene.document.DateTools; 52b5840353SAdam Hornáček import org.apache.lucene.document.Document; 53b5840353SAdam Hornáček import org.apache.lucene.document.Field; 54b5840353SAdam Hornáček import org.apache.lucene.document.Field.Store; 55b5840353SAdam Hornáček import org.apache.lucene.document.FieldType; 56b5840353SAdam Hornáček import org.apache.lucene.document.SortedDocValuesField; 57b5840353SAdam Hornáček import org.apache.lucene.document.StringField; 58b5840353SAdam Hornáček import 
org.apache.lucene.document.TextField; 59b5840353SAdam Hornáček import org.apache.lucene.util.BytesRef; 609805b761SAdam Hornáček import org.opengrok.indexer.analysis.FileAnalyzerFactory.Matcher; 619805b761SAdam Hornáček import org.opengrok.indexer.analysis.ada.AdaAnalyzerFactory; 629805b761SAdam Hornáček import org.opengrok.indexer.analysis.archive.BZip2AnalyzerFactory; 639805b761SAdam Hornáček import org.opengrok.indexer.analysis.archive.GZIPAnalyzerFactory; 649805b761SAdam Hornáček import org.opengrok.indexer.analysis.archive.TarAnalyzerFactory; 659805b761SAdam Hornáček import org.opengrok.indexer.analysis.archive.ZipAnalyzerFactory; 6650065c95SChris Fraire import org.opengrok.indexer.analysis.asm.AsmAnalyzerFactory; 679805b761SAdam Hornáček import org.opengrok.indexer.analysis.c.CAnalyzerFactory; 689805b761SAdam Hornáček import org.opengrok.indexer.analysis.c.CxxAnalyzerFactory; 6957eefa47SKryštof Tulinger import org.opengrok.indexer.analysis.clojure.ClojureAnalyzerFactory; 709805b761SAdam Hornáček import org.opengrok.indexer.analysis.csharp.CSharpAnalyzerFactory; 719805b761SAdam Hornáček import org.opengrok.indexer.analysis.data.IgnorantAnalyzerFactory; 729805b761SAdam Hornáček import org.opengrok.indexer.analysis.data.ImageAnalyzerFactory; 739805b761SAdam Hornáček import org.opengrok.indexer.analysis.document.MandocAnalyzerFactory; 749805b761SAdam Hornáček import org.opengrok.indexer.analysis.document.TroffAnalyzerFactory; 759805b761SAdam Hornáček import org.opengrok.indexer.analysis.eiffel.EiffelAnalyzerFactory; 769805b761SAdam Hornáček import org.opengrok.indexer.analysis.erlang.ErlangAnalyzerFactory; 779805b761SAdam Hornáček import org.opengrok.indexer.analysis.executables.ELFAnalyzerFactory; 789805b761SAdam Hornáček import org.opengrok.indexer.analysis.executables.JarAnalyzerFactory; 799805b761SAdam Hornáček import org.opengrok.indexer.analysis.executables.JavaClassAnalyzerFactory; 809805b761SAdam Hornáček import 
org.opengrok.indexer.analysis.fortran.FortranAnalyzerFactory; 819805b761SAdam Hornáček import org.opengrok.indexer.analysis.golang.GolangAnalyzerFactory; 829805b761SAdam Hornáček import org.opengrok.indexer.analysis.haskell.HaskellAnalyzerFactory; 83c5ef7ff6SChris Fraire import org.opengrok.indexer.analysis.hcl.HCLAnalyzerFactory; 849805b761SAdam Hornáček import org.opengrok.indexer.analysis.java.JavaAnalyzerFactory; 859805b761SAdam Hornáček import org.opengrok.indexer.analysis.javascript.JavaScriptAnalyzerFactory; 8657eefa47SKryštof Tulinger import org.opengrok.indexer.analysis.json.JsonAnalyzerFactory; 8757eefa47SKryštof Tulinger import org.opengrok.indexer.analysis.kotlin.KotlinAnalyzerFactory; 889805b761SAdam Hornáček import org.opengrok.indexer.analysis.lisp.LispAnalyzerFactory; 8957eefa47SKryštof Tulinger import org.opengrok.indexer.analysis.lua.LuaAnalyzerFactory; 909805b761SAdam Hornáček import org.opengrok.indexer.analysis.pascal.PascalAnalyzerFactory; 919805b761SAdam Hornáček import org.opengrok.indexer.analysis.perl.PerlAnalyzerFactory; 929805b761SAdam Hornáček import org.opengrok.indexer.analysis.php.PhpAnalyzerFactory; 939805b761SAdam Hornáček import org.opengrok.indexer.analysis.plain.PlainAnalyzerFactory; 949805b761SAdam Hornáček import org.opengrok.indexer.analysis.plain.XMLAnalyzerFactory; 9557eefa47SKryštof Tulinger import org.opengrok.indexer.analysis.powershell.PowershellAnalyzerFactory; 969805b761SAdam Hornáček import org.opengrok.indexer.analysis.python.PythonAnalyzerFactory; 97ca0eafb1SChris Fraire import org.opengrok.indexer.analysis.r.RAnalyzerFactory; 9857eefa47SKryštof Tulinger import org.opengrok.indexer.analysis.ruby.RubyAnalyzerFactory; 999805b761SAdam Hornáček import org.opengrok.indexer.analysis.rust.RustAnalyzerFactory; 1009805b761SAdam Hornáček import org.opengrok.indexer.analysis.scala.ScalaAnalyzerFactory; 1019805b761SAdam Hornáček import org.opengrok.indexer.analysis.sh.ShAnalyzerFactory; 1029805b761SAdam Hornáček import 
org.opengrok.indexer.analysis.sql.PLSQLAnalyzerFactory; 1039805b761SAdam Hornáček import org.opengrok.indexer.analysis.sql.SQLAnalyzerFactory; 1049805b761SAdam Hornáček import org.opengrok.indexer.analysis.swift.SwiftAnalyzerFactory; 1059805b761SAdam Hornáček import org.opengrok.indexer.analysis.tcl.TclAnalyzerFactory; 106c5ef7ff6SChris Fraire import org.opengrok.indexer.analysis.terraform.TerraformAnalyzerFactory; 1070ed261b2SChris Fraire import org.opengrok.indexer.analysis.typescript.TypeScriptAnalyzerFactory; 1089805b761SAdam Hornáček import org.opengrok.indexer.analysis.uue.UuencodeAnalyzerFactory; 1099805b761SAdam Hornáček import org.opengrok.indexer.analysis.vb.VBAnalyzerFactory; 1104f9cbaecSChris Fraire import org.opengrok.indexer.analysis.verilog.VerilogAnalyzerFactory; 1119805b761SAdam Hornáček import org.opengrok.indexer.configuration.Project; 1129805b761SAdam Hornáček import org.opengrok.indexer.configuration.RuntimeEnvironment; 1139805b761SAdam Hornáček import org.opengrok.indexer.history.Annotation; 11436b6a6f8SVladimir Kotal import org.opengrok.indexer.history.History; 11536b6a6f8SVladimir Kotal import org.opengrok.indexer.history.HistoryEntry; 1169805b761SAdam Hornáček import org.opengrok.indexer.history.HistoryException; 1179805b761SAdam Hornáček import org.opengrok.indexer.history.HistoryGuru; 1189805b761SAdam Hornáček import org.opengrok.indexer.history.HistoryReader; 1199805b761SAdam Hornáček import org.opengrok.indexer.logger.LoggerFactory; 1209805b761SAdam Hornáček import org.opengrok.indexer.search.QueryBuilder; 1219805b761SAdam Hornáček import org.opengrok.indexer.util.IOUtils; 1229805b761SAdam Hornáček import org.opengrok.indexer.web.Util; 123b5840353SAdam Hornáček 124b5840353SAdam Hornáček /** 125b5840353SAdam Hornáček * Manages and provides Analyzers as needed. 
Please see 126a840d4e0SAdam Hornacek * <a href="https://github.com/oracle/opengrok/wiki/Internals"> 127b5840353SAdam Hornáček * this</a> page for a great description of the purpose of the AnalyzerGuru. 128b5840353SAdam Hornáček * 129b5840353SAdam Hornáček * Created on September 22, 2005 130b5840353SAdam Hornáček * 131b5840353SAdam Hornáček * @author Chandan 132b5840353SAdam Hornáček */ 133b5840353SAdam Hornáček public class AnalyzerGuru { 134b5840353SAdam Hornáček 135b5840353SAdam Hornáček /** 136b5840353SAdam Hornáček * The maximum number of characters (multi-byte if a BOM is identified) to 137ff44f24aSAdam Hornáček * read from the input stream to be used for magic string matching. 138b5840353SAdam Hornáček */ 139b5840353SAdam Hornáček private static final int OPENING_MAX_CHARS = 100; 140b5840353SAdam Hornáček 141b5840353SAdam Hornáček /** 142b5840353SAdam Hornáček * Set to 16K -- though debugging shows it would do with only 8K+3 143ff44f24aSAdam Hornáček * (standard buffer for Java BufferedInputStream plus 3 bytes for largest UTF BOM). 144b5840353SAdam Hornáček */ 145b5840353SAdam Hornáček private static final int MARK_READ_LIMIT = 1024 * 16; 146b5840353SAdam Hornáček 147b5840353SAdam Hornáček /** 148b5840353SAdam Hornáček * The number of bytes read from the start of the file for magic number or 149b5840353SAdam Hornáček * string analysis. Some {@link FileAnalyzerFactory.Matcher} 150b5840353SAdam Hornáček * implementations may read more data subsequently, but this field defines 151b5840353SAdam Hornáček * the number of bytes initially read for general matching. 152b5840353SAdam Hornáček */ 153b5840353SAdam Hornáček private static final int MAGIC_BYTES_NUM = 8; 154b5840353SAdam Hornáček 155b5840353SAdam Hornáček private static final Logger LOGGER = LoggerFactory.getLogger(AnalyzerGuru.class); 156b5840353SAdam Hornáček 157b5840353SAdam Hornáček /** 158b5840353SAdam Hornáček * The default {@code FileAnalyzerFactory} instance. 
159b5840353SAdam Hornáček */ 16057eefa47SKryštof Tulinger private static final AnalyzerFactory DEFAULT_ANALYZER_FACTORY = new FileAnalyzerFactory(); 161b5840353SAdam Hornáček 162b5840353SAdam Hornáček /** 163b5840353SAdam Hornáček * Map from file names to analyzer factories. 164b5840353SAdam Hornáček */ 16557eefa47SKryštof Tulinger private static final Map<String, AnalyzerFactory> FILE_NAMES = new HashMap<>(); 166b5840353SAdam Hornáček 167b5840353SAdam Hornáček /** 168b5840353SAdam Hornáček * Map from file extensions to analyzer factories. 169b5840353SAdam Hornáček */ 17057eefa47SKryštof Tulinger private static final Map<String, AnalyzerFactory> ext = new HashMap<>(); 171b5840353SAdam Hornáček 172b5840353SAdam Hornáček /** 173b5840353SAdam Hornáček * Map from file prefixes to analyzer factories. 174b5840353SAdam Hornáček */ 17557eefa47SKryštof Tulinger private static final Map<String, AnalyzerFactory> pre = new HashMap<>(); 176b5840353SAdam Hornáček 177b5840353SAdam Hornáček /** 178b5840353SAdam Hornáček * Appended when 17957eefa47SKryštof Tulinger * {@link #addExtension(java.lang.String, AnalyzerFactory)} 180b5840353SAdam Hornáček * or 18157eefa47SKryštof Tulinger * {@link #addPrefix(java.lang.String, AnalyzerFactory)} 182b5840353SAdam Hornáček * are called to augment the value in {@link #getVersionNo()}. 183b5840353SAdam Hornáček */ 184b5840353SAdam Hornáček private static final TreeSet<String> CUSTOMIZATION_KEYS = new TreeSet<>(); 185b5840353SAdam Hornáček 186b5840353SAdam Hornáček private static int customizationHashCode; 187b5840353SAdam Hornáček 188b5840353SAdam Hornáček /** 189ff44f24aSAdam Hornáček * Descending string length comparator for magics. 
190b5840353SAdam Hornáček */ 191c6f0939bSAdam Hornacek private static final Comparator<String> descStrlenComparator = (s1, s2) -> { 192b5840353SAdam Hornáček // DESC: s2 length <=> s1 length 193b5840353SAdam Hornáček int cmp = Integer.compare(s2.length(), s1.length()); 194a72324b1SAdam Hornáček if (cmp != 0) { 195a72324b1SAdam Hornáček return cmp; 196a72324b1SAdam Hornáček } 197b5840353SAdam Hornáček 198b5840353SAdam Hornáček // the Comparator must also be "consistent with equals", so check 199b5840353SAdam Hornáček // string contents too when (length)cmp == 0. (ASC: s1 <=> s2.) 200b5840353SAdam Hornáček cmp = s1.compareTo(s2); 201b5840353SAdam Hornáček return cmp; 202b5840353SAdam Hornáček }; 203b5840353SAdam Hornáček 204b5840353SAdam Hornáček /** 205b5840353SAdam Hornáček * Map from magic strings to analyzer factories. 206b5840353SAdam Hornáček */ 20757eefa47SKryštof Tulinger private static final SortedMap<String, AnalyzerFactory> magics = 208b5840353SAdam Hornáček new TreeMap<>(descStrlenComparator); 209b5840353SAdam Hornáček 210b5840353SAdam Hornáček /** 211b5840353SAdam Hornáček * List of matcher objects which can be used to determine which analyzer 212b5840353SAdam Hornáček * factory to use. 213b5840353SAdam Hornáček */ 214b5840353SAdam Hornáček private static final List<FileAnalyzerFactory.Matcher> matchers = new ArrayList<>(); 215b5840353SAdam Hornáček 216b5840353SAdam Hornáček /** 217b5840353SAdam Hornáček * List of all registered {@code FileAnalyzerFactory} instances. 218b5840353SAdam Hornáček */ 21957eefa47SKryštof Tulinger private static final List<AnalyzerFactory> factories = new ArrayList<>(); 220b5840353SAdam Hornáček 221b5840353SAdam Hornáček /** 222b5840353SAdam Hornáček * Names of all analysis packages. 
223b5840353SAdam Hornáček */ 224b5840353SAdam Hornáček private static final List<String> analysisPkgNames = new ArrayList<>(); 225b5840353SAdam Hornáček 226b5840353SAdam Hornáček public static final FieldType string_ft_stored_nanalyzed_norms = new FieldType(StringField.TYPE_STORED); 227b5840353SAdam Hornáček public static final FieldType string_ft_nstored_nanalyzed_norms = new FieldType(StringField.TYPE_NOT_STORED); 228b5840353SAdam Hornáček 229b5840353SAdam Hornáček private static final Map<String, String> fileTypeDescriptions = new TreeMap<>(); 230b5840353SAdam Hornáček 231b5840353SAdam Hornáček /** 232b5840353SAdam Hornáček * Maps from {@link FileAnalyzer#getFileTypeName()} to 233ff44f24aSAdam Hornáček * {@link FileAnalyzerFactory}. 234b5840353SAdam Hornáček */ 23557eefa47SKryštof Tulinger private static final Map<String, AnalyzerFactory> FILETYPE_FACTORIES = 236b5840353SAdam Hornáček new HashMap<>(); 237b5840353SAdam Hornáček 238b5840353SAdam Hornáček /** 239b5840353SAdam Hornáček * Maps from {@link FileAnalyzer#getFileTypeName()} to 240ff44f24aSAdam Hornáček * {@link FileAnalyzer#getVersionNo()}. 241b5840353SAdam Hornáček */ 242b5840353SAdam Hornáček private static final Map<String, Long> ANALYZER_VERSIONS = new HashMap<>(); 243b5840353SAdam Hornáček 2446de4f5aaSChris Fraire private static final LangTreeMap langMap = new LangTreeMap(); 245adc0ce11SChris Fraire private static final LangTreeMap defaultLangMap = new LangTreeMap(); 2466de4f5aaSChris Fraire 247b5840353SAdam Hornáček /* 248b5840353SAdam Hornáček * If you write your own analyzer please register it here. The order is 249b5840353SAdam Hornáček * important for any factory that uses a FileAnalyzerFactory.Matcher 250b5840353SAdam Hornáček * implementation, as those are run in the same order as defined below -- 251b5840353SAdam Hornáček * though precise Matchers are run before imprecise ones. 
252b5840353SAdam Hornáček */ 253b5840353SAdam Hornáček static { 254b5840353SAdam Hornáček try { 25557eefa47SKryštof Tulinger AnalyzerFactory[] analyzers = { 256b5840353SAdam Hornáček DEFAULT_ANALYZER_FACTORY, 257b5840353SAdam Hornáček new IgnorantAnalyzerFactory(), 258b5840353SAdam Hornáček new BZip2AnalyzerFactory(), 259b5840353SAdam Hornáček new XMLAnalyzerFactory(), 260b5840353SAdam Hornáček MandocAnalyzerFactory.DEFAULT_INSTANCE, 261b5840353SAdam Hornáček TroffAnalyzerFactory.DEFAULT_INSTANCE, 262b5840353SAdam Hornáček new ELFAnalyzerFactory(), 263b5840353SAdam Hornáček JavaClassAnalyzerFactory.DEFAULT_INSTANCE, 264b5840353SAdam Hornáček new ImageAnalyzerFactory(), 265b5840353SAdam Hornáček JarAnalyzerFactory.DEFAULT_INSTANCE, 266b5840353SAdam Hornáček ZipAnalyzerFactory.DEFAULT_INSTANCE, 267b5840353SAdam Hornáček new TarAnalyzerFactory(), 268b5840353SAdam Hornáček new CAnalyzerFactory(), 269b5840353SAdam Hornáček new CSharpAnalyzerFactory(), 270b5840353SAdam Hornáček new VBAnalyzerFactory(), 271b5840353SAdam Hornáček new CxxAnalyzerFactory(), 272b5840353SAdam Hornáček new ErlangAnalyzerFactory(), 273b5840353SAdam Hornáček new ShAnalyzerFactory(), 274b5840353SAdam Hornáček new PowershellAnalyzerFactory(), 275b5840353SAdam Hornáček new UuencodeAnalyzerFactory(), 276b5840353SAdam Hornáček new GZIPAnalyzerFactory(), 277b5840353SAdam Hornáček new JavaAnalyzerFactory(), 278b5840353SAdam Hornáček new JavaScriptAnalyzerFactory(), 279b5840353SAdam Hornáček new KotlinAnalyzerFactory(), 280b5840353SAdam Hornáček new SwiftAnalyzerFactory(), 281b5840353SAdam Hornáček new JsonAnalyzerFactory(), 282b5840353SAdam Hornáček new PythonAnalyzerFactory(), 283b5840353SAdam Hornáček new RustAnalyzerFactory(), 284b5840353SAdam Hornáček new PerlAnalyzerFactory(), 285b5840353SAdam Hornáček new PhpAnalyzerFactory(), 286b5840353SAdam Hornáček new LispAnalyzerFactory(), 287b5840353SAdam Hornáček new TclAnalyzerFactory(), 288b5840353SAdam Hornáček new ScalaAnalyzerFactory(), 
289b5840353SAdam Hornáček new ClojureAnalyzerFactory(), 290b5840353SAdam Hornáček new SQLAnalyzerFactory(), 291b5840353SAdam Hornáček new PLSQLAnalyzerFactory(), 292b5840353SAdam Hornáček new FortranAnalyzerFactory(), 293b5840353SAdam Hornáček new HaskellAnalyzerFactory(), 294b5840353SAdam Hornáček new GolangAnalyzerFactory(), 295b5840353SAdam Hornáček new LuaAnalyzerFactory(), 296b5840353SAdam Hornáček new PascalAnalyzerFactory(), 297b5840353SAdam Hornáček new AdaAnalyzerFactory(), 298b5840353SAdam Hornáček new RubyAnalyzerFactory(), 2994f9cbaecSChris Fraire new EiffelAnalyzerFactory(), 3000ed261b2SChris Fraire new VerilogAnalyzerFactory(), 30150065c95SChris Fraire new TypeScriptAnalyzerFactory(), 302c5ef7ff6SChris Fraire new AsmAnalyzerFactory(), 303c5ef7ff6SChris Fraire new HCLAnalyzerFactory(), 304ca0eafb1SChris Fraire new TerraformAnalyzerFactory(), 305*750b3115SChris Fraire new RAnalyzerFactory(), 306*750b3115SChris Fraire // Keep PlainAnalyzer last, with its lone, quite fuzzy matcher. 
307*750b3115SChris Fraire PlainAnalyzerFactory.DEFAULT_INSTANCE 308b5840353SAdam Hornáček }; 309b5840353SAdam Hornáček 31057eefa47SKryštof Tulinger for (AnalyzerFactory analyzer : analyzers) { 311b5840353SAdam Hornáček registerAnalyzer(analyzer); 312b5840353SAdam Hornáček } 313b5840353SAdam Hornáček 31457eefa47SKryštof Tulinger for (AnalyzerFactory analyzer : analyzers) { 315b5840353SAdam Hornáček if (analyzer.getName() != null && !analyzer.getName().isEmpty()) { analyzer.getName()316b5840353SAdam Hornáček fileTypeDescriptions.put(analyzer.getAnalyzer().getFileTypeName(), analyzer.getName()); 317b5840353SAdam Hornáček } 318b5840353SAdam Hornáček } 319b5840353SAdam Hornáček 320b5840353SAdam Hornáček string_ft_stored_nanalyzed_norms.setOmitNorms(false); 321b5840353SAdam Hornáček string_ft_nstored_nanalyzed_norms.setOmitNorms(false); 322b5840353SAdam Hornáček } catch (Throwable t) { 323b5840353SAdam Hornáček LOGGER.log(Level.SEVERE, 324b5840353SAdam Hornáček "exception hit when constructing AnalyzerGuru static", t); 325b5840353SAdam Hornáček throw t; 326b5840353SAdam Hornáček } 327b5840353SAdam Hornáček } 328b5840353SAdam Hornáček 329b5840353SAdam Hornáček /** 330b5840353SAdam Hornáček * Gets a version number to be used to tag documents examined by the guru so 331670ee787SChris Fraire * that {@link AbstractAnalyzer} selection can be re-done later if a stored 332670ee787SChris Fraire * version number is different from the current implementation or if guru 333670ee787SChris Fraire * factory registrations are modified by the user to change the guru 334670ee787SChris Fraire * operation. 335b5840353SAdam Hornáček * <p> 336b5840353SAdam Hornáček * The static part of the version is bumped in a release when e.g. new 337b5840353SAdam Hornáček * {@link FileAnalyzerFactory} subclasses are registered or when existing 338b5840353SAdam Hornáček * {@link FileAnalyzerFactory} subclasses are revised to target more or 339b5840353SAdam Hornáček * different files. 
340b5840353SAdam Hornáček * @return a value whose lower 32-bits are a static value 341ca0eafb1SChris Fraire * 20201003_00 342b5840353SAdam Hornáček * for the current implementation and whose higher-32 bits are non-zero if 34357eefa47SKryštof Tulinger * {@link #addExtension(java.lang.String, AnalyzerFactory)} 344b5840353SAdam Hornáček * or 34557eefa47SKryštof Tulinger * {@link #addPrefix(java.lang.String, AnalyzerFactory)} 346b5840353SAdam Hornáček * has been called. 347b5840353SAdam Hornáček */ getVersionNo()348b5840353SAdam Hornáček public static long getVersionNo() { 349ca0eafb1SChris Fraire final int ver32 = 20201003_00; // Edit comment above too! 350b5840353SAdam Hornáček long ver = ver32; 351b5840353SAdam Hornáček if (customizationHashCode != 0) { 352b5840353SAdam Hornáček ver |= (long) customizationHashCode << 32; 353b5840353SAdam Hornáček } 354b5840353SAdam Hornáček return ver; 355b5840353SAdam Hornáček } 356b5840353SAdam Hornáček 357b5840353SAdam Hornáček /** 358b5840353SAdam Hornáček * Gets a version number according to a registered 359b5840353SAdam Hornáček * {@link FileAnalyzer#getVersionNo()} for a {@code fileTypeName} according 360b5840353SAdam Hornáček * to {@link FileAnalyzer#getFileTypeName()}. 
361b5840353SAdam Hornáček * @param fileTypeName a defined instance 362b5840353SAdam Hornáček * @return a registered value or {@link Long#MIN_VALUE} if 363b5840353SAdam Hornáček * {@code fileTypeName} is unknown 364b5840353SAdam Hornáček */ getAnalyzerVersionNo(String fileTypeName)365b5840353SAdam Hornáček public static long getAnalyzerVersionNo(String fileTypeName) { 366b5840353SAdam Hornáček return ANALYZER_VERSIONS.getOrDefault(fileTypeName, Long.MIN_VALUE); 367b5840353SAdam Hornáček } 368b5840353SAdam Hornáček getAnalyzersVersionNos()369b5840353SAdam Hornáček public static Map<String, Long> getAnalyzersVersionNos() { 370b5840353SAdam Hornáček return Collections.unmodifiableMap(ANALYZER_VERSIONS); 371b5840353SAdam Hornáček } 372b5840353SAdam Hornáček getExtensionsMap()37357eefa47SKryštof Tulinger public static Map<String, AnalyzerFactory> getExtensionsMap() { 374b5840353SAdam Hornáček return Collections.unmodifiableMap(ext); 375b5840353SAdam Hornáček } 376b5840353SAdam Hornáček getPrefixesMap()37757eefa47SKryštof Tulinger public static Map<String, AnalyzerFactory> getPrefixesMap() { 378b5840353SAdam Hornáček return Collections.unmodifiableMap(pre); 379b5840353SAdam Hornáček } 380b5840353SAdam Hornáček getMagicsMap()38157eefa47SKryštof Tulinger public static Map<String, AnalyzerFactory> getMagicsMap() { 382b5840353SAdam Hornáček return Collections.unmodifiableMap(magics); 383b5840353SAdam Hornáček } 384b5840353SAdam Hornáček getAnalyzerFactoryMatchers()385b5840353SAdam Hornáček public static List<Matcher> getAnalyzerFactoryMatchers() { 386b5840353SAdam Hornáček return Collections.unmodifiableList(matchers); 387b5840353SAdam Hornáček } 388b5840353SAdam Hornáček getfileTypeDescriptions()389b5840353SAdam Hornáček public static Map<String, String> getfileTypeDescriptions() { 390b5840353SAdam Hornáček return Collections.unmodifiableMap(fileTypeDescriptions); 391b5840353SAdam Hornáček } 392b5840353SAdam Hornáček getAnalyzerFactories()393cd560f86SVladimir Kotal public 
static List<AnalyzerFactory> getAnalyzerFactories() { 394b5840353SAdam Hornáček return Collections.unmodifiableList(factories); 395b5840353SAdam Hornáček } 396b5840353SAdam Hornáček 39753e96f58SVladimir Kotal private static final String USED_IN_MULTIPLE_MSG = "' used in multiple analyzers"; 39853e96f58SVladimir Kotal 399b5840353SAdam Hornáček /** 400b5840353SAdam Hornáček * Register a {@code FileAnalyzerFactory} instance. 401b5840353SAdam Hornáček */ registerAnalyzer(AnalyzerFactory factory)40257eefa47SKryštof Tulinger private static void registerAnalyzer(AnalyzerFactory factory) { 403b5840353SAdam Hornáček for (String name : factory.getFileNames()) { 40457eefa47SKryštof Tulinger AnalyzerFactory old = FILE_NAMES.put(name, factory); 405b5840353SAdam Hornáček assert old == null : 40653e96f58SVladimir Kotal "name '" + name + USED_IN_MULTIPLE_MSG; 407b5840353SAdam Hornáček } 408b5840353SAdam Hornáček for (String prefix : factory.getPrefixes()) { 40957eefa47SKryštof Tulinger AnalyzerFactory old = pre.put(prefix, factory); 410b5840353SAdam Hornáček assert old == null : 41153e96f58SVladimir Kotal "prefix '" + prefix + USED_IN_MULTIPLE_MSG; 412b5840353SAdam Hornáček } 413b5840353SAdam Hornáček for (String suffix : factory.getSuffixes()) { 41457eefa47SKryštof Tulinger AnalyzerFactory old = ext.put(suffix, factory); 415b5840353SAdam Hornáček assert old == null : 41653e96f58SVladimir Kotal "suffix '" + suffix + USED_IN_MULTIPLE_MSG; 417b5840353SAdam Hornáček } 418b5840353SAdam Hornáček for (String magic : factory.getMagicStrings()) { 41957eefa47SKryštof Tulinger AnalyzerFactory old = magics.put(magic, factory); 420b5840353SAdam Hornáček assert old == null : 42153e96f58SVladimir Kotal "magic '" + magic + USED_IN_MULTIPLE_MSG; 422b5840353SAdam Hornáček } 423b5840353SAdam Hornáček matchers.addAll(factory.getMatchers()); 424b5840353SAdam Hornáček factories.add(factory); 425b5840353SAdam Hornáček 42657eefa47SKryštof Tulinger AbstractAnalyzer fa = factory.getAnalyzer(); 
427b5840353SAdam Hornáček String fileTypeName = fa.getFileTypeName(); 428b5840353SAdam Hornáček FILETYPE_FACTORIES.put(fileTypeName, factory); 429b5840353SAdam Hornáček ANALYZER_VERSIONS.put(fileTypeName, fa.getVersionNo()); 430adc0ce11SChris Fraire 431adc0ce11SChris Fraire // Possibly configure default LANG mappings for the factory. 432adc0ce11SChris Fraire String ctagsLang = factory.getAnalyzer().getCtagsLang(); 433adc0ce11SChris Fraire if (ctagsLang != null) { 434adc0ce11SChris Fraire List<String> prefixes = factory.getPrefixes(); 435adc0ce11SChris Fraire if (prefixes != null) { 436adc0ce11SChris Fraire for (String prefix : prefixes) { 437adc0ce11SChris Fraire defaultLangMap.add(prefix, ctagsLang); 438adc0ce11SChris Fraire } 439adc0ce11SChris Fraire } 440adc0ce11SChris Fraire 441adc0ce11SChris Fraire List<String> suffixes = factory.getSuffixes(); 442adc0ce11SChris Fraire if (suffixes != null) { 443adc0ce11SChris Fraire for (String suffix : suffixes) { 444adc0ce11SChris Fraire // LangMap needs a "." to signify a file extension. 445adc0ce11SChris Fraire defaultLangMap.add("." + suffix, ctagsLang); 446adc0ce11SChris Fraire } 447adc0ce11SChris Fraire } 448adc0ce11SChris Fraire } 449b5840353SAdam Hornáček } 450b5840353SAdam Hornáček 451b5840353SAdam Hornáček /** 452b5840353SAdam Hornáček * Instruct the AnalyzerGuru to use a given analyzer for a given file 453b5840353SAdam Hornáček * prefix. 
454b5840353SAdam Hornáček * 455b5840353SAdam Hornáček * @param prefix the file prefix to add 456b5840353SAdam Hornáček * @param factory a factory which creates the analyzer to use for the given 457b5840353SAdam Hornáček * extension (if you pass null as the analyzer, you will disable the 458b5840353SAdam Hornáček * analyzer used for that extension) 459b5840353SAdam Hornáček */ addPrefix(String prefix, AnalyzerFactory factory)46057eefa47SKryštof Tulinger public static void addPrefix(String prefix, AnalyzerFactory factory) { 46157eefa47SKryštof Tulinger AnalyzerFactory oldFactory; 462b5840353SAdam Hornáček if (factory == null) { 463b5840353SAdam Hornáček oldFactory = pre.remove(prefix); 4646de4f5aaSChris Fraire langMap.exclude(prefix); 465b5840353SAdam Hornáček } else { 466b5840353SAdam Hornáček oldFactory = pre.put(prefix, factory); 4676de4f5aaSChris Fraire langMap.add(prefix, factory.getAnalyzer().getCtagsLang()); 468b5840353SAdam Hornáček } 469b5840353SAdam Hornáček 470b5840353SAdam Hornáček if (factoriesDifferent(factory, oldFactory)) { 471b5840353SAdam Hornáček addCustomizationKey("p:" + prefix); 472b5840353SAdam Hornáček } 473b5840353SAdam Hornáček } 474b5840353SAdam Hornáček 475b5840353SAdam Hornáček /** 476b5840353SAdam Hornáček * Instruct the AnalyzerGuru to use a given analyzer for a given file 477b5840353SAdam Hornáček * extension. 
478b5840353SAdam Hornáček * 479b5840353SAdam Hornáček * @param extension the file-extension to add 480b5840353SAdam Hornáček * @param factory a factory which creates the analyzer to use for the given 481b5840353SAdam Hornáček * extension (if you pass null as the analyzer, you will disable the 482b5840353SAdam Hornáček * analyzer used for that extension) 4836de4f5aaSChris Fraire * @throws IllegalArgumentException if {@code extension} contains a period 484b5840353SAdam Hornáček */ addExtension(String extension, AnalyzerFactory factory)4856de4f5aaSChris Fraire public static void addExtension(String extension, AnalyzerFactory factory) { 4866de4f5aaSChris Fraire if (extension.contains(".")) { 4876de4f5aaSChris Fraire throw new IllegalArgumentException("extension contains a '.'"); 4886de4f5aaSChris Fraire } 4896de4f5aaSChris Fraire 4906de4f5aaSChris Fraire // LangMap fileSpec requires a leading period to indicate an extension. 4916de4f5aaSChris Fraire String langMapExtension = "." + extension; 4926de4f5aaSChris Fraire 49357eefa47SKryštof Tulinger AnalyzerFactory oldFactory; 494b5840353SAdam Hornáček if (factory == null) { 495b5840353SAdam Hornáček oldFactory = ext.remove(extension); 4966de4f5aaSChris Fraire langMap.exclude(langMapExtension); 497b5840353SAdam Hornáček } else { 498b5840353SAdam Hornáček oldFactory = ext.put(extension, factory); 4996de4f5aaSChris Fraire langMap.add(langMapExtension, factory.getAnalyzer().getCtagsLang()); 500b5840353SAdam Hornáček } 501b5840353SAdam Hornáček 502b5840353SAdam Hornáček if (factoriesDifferent(factory, oldFactory)) { 503b5840353SAdam Hornáček addCustomizationKey("e:" + extension); 504b5840353SAdam Hornáček } 505b5840353SAdam Hornáček } 506b5840353SAdam Hornáček 507b5840353SAdam Hornáček /** 5086de4f5aaSChris Fraire * Gets an unmodifiable view of the language mappings resulting from 5096de4f5aaSChris Fraire * {@link #addExtension(String, AnalyzerFactory)} and 510adc0ce11SChris Fraire * {@link #addPrefix(String, AnalyzerFactory)} 
merged with default language 511adc0ce11SChris Fraire * mappings of OpenGrok's analyzers. 5126de4f5aaSChris Fraire */ getLangMap()5136de4f5aaSChris Fraire public static LangMap getLangMap() { 514adc0ce11SChris Fraire return langMap.mergeSecondary(defaultLangMap).unmodifiable(); 5156de4f5aaSChris Fraire } 5166de4f5aaSChris Fraire 5176de4f5aaSChris Fraire /** 518b5840353SAdam Hornáček * Get the default Analyzer. 519b5840353SAdam Hornáček * 520b5840353SAdam Hornáček * @return default FileAnalyzer 521b5840353SAdam Hornáček */ getAnalyzer()52257eefa47SKryštof Tulinger public static AbstractAnalyzer getAnalyzer() { 523b5840353SAdam Hornáček return DEFAULT_ANALYZER_FACTORY.getAnalyzer(); 524b5840353SAdam Hornáček } 525b5840353SAdam Hornáček 526b5840353SAdam Hornáček /** 527b5840353SAdam Hornáček * Gets an analyzer for the specified {@code fileTypeName} if it accords 528b5840353SAdam Hornáček * with a known {@link FileAnalyzer#getFileTypeName()}. 529b5840353SAdam Hornáček * @param fileTypeName a defined name 530b5840353SAdam Hornáček * @return a defined instance if known or otherwise {@code null} 531b5840353SAdam Hornáček */ getAnalyzer(String fileTypeName)53257eefa47SKryštof Tulinger public static AbstractAnalyzer getAnalyzer(String fileTypeName) { 53357eefa47SKryštof Tulinger AnalyzerFactory factory = FILETYPE_FACTORIES.get(fileTypeName); 534b5840353SAdam Hornáček return factory == null ? null : factory.getAnalyzer(); 535b5840353SAdam Hornáček } 536b5840353SAdam Hornáček 537b5840353SAdam Hornáček /** 538b5840353SAdam Hornáček * Get an analyzer suited to analyze a file. This function will reuse 539b5840353SAdam Hornáček * analyzers since they are costly. 
540b5840353SAdam Hornáček * 541b5840353SAdam Hornáček * @param in Input stream containing data to be analyzed 542b5840353SAdam Hornáček * @param file Name of the file to be analyzed 543b5840353SAdam Hornáček * @return An analyzer suited for that file content 544b5840353SAdam Hornáček * @throws java.io.IOException If an error occurs while accessing the data 545b5840353SAdam Hornáček * in the input stream. 546b5840353SAdam Hornáček */ getAnalyzer(InputStream in, String file)54757eefa47SKryštof Tulinger public static AbstractAnalyzer getAnalyzer(InputStream in, String file) throws IOException { 54857eefa47SKryštof Tulinger AnalyzerFactory factory = find(in, file); 549b5840353SAdam Hornáček if (factory == null) { 55057eefa47SKryštof Tulinger AbstractAnalyzer defaultAnalyzer = getAnalyzer(); 55130bba29fSChris Fraire if (LOGGER.isLoggable(Level.FINEST)) { 55230bba29fSChris Fraire LOGGER.log(Level.FINEST, "{0}: fallback {1}", 553b5840353SAdam Hornáček new Object[]{file, 554b5840353SAdam Hornáček defaultAnalyzer.getClass().getSimpleName() }); 555b5840353SAdam Hornáček } 556b5840353SAdam Hornáček return defaultAnalyzer; 557b5840353SAdam Hornáček } 558b5840353SAdam Hornáček return factory.getAnalyzer(); 559b5840353SAdam Hornáček } 560b5840353SAdam Hornáček 561b5840353SAdam Hornáček /** 5627a857b12SVladimir Kotal * Free resources associated with all registered analyzers. 5637a857b12SVladimir Kotal */ returnAnalyzers()5647a857b12SVladimir Kotal public static void returnAnalyzers() { 56557eefa47SKryštof Tulinger for (AnalyzerFactory analyzer : factories) { 5667a857b12SVladimir Kotal analyzer.returnAnalyzer(); 5677a857b12SVladimir Kotal } 5687a857b12SVladimir Kotal } 5697a857b12SVladimir Kotal 5707a857b12SVladimir Kotal /** 571b5840353SAdam Hornáček * Populate a Lucene document with the required fields. 
572b5840353SAdam Hornáček * 573b5840353SAdam Hornáček * @param doc The document to populate 574b5840353SAdam Hornáček * @param file The file to index 575b5840353SAdam Hornáček * @param path Where the file is located (from source root) 576b5840353SAdam Hornáček * @param fa The analyzer to use on the file 577b5840353SAdam Hornáček * @param xrefOut Where to write the xref (possibly {@code null}) 578b5840353SAdam Hornáček * @throws IOException If an exception occurs while collecting the data 579b5840353SAdam Hornáček * @throws InterruptedException if a timeout occurs 580b5840353SAdam Hornáček */ populateDocument(Document doc, File file, String path, AbstractAnalyzer fa, Writer xrefOut)58127c0cfcdSVladimir Kotal public void populateDocument(Document doc, File file, String path, AbstractAnalyzer fa, Writer xrefOut) 58227c0cfcdSVladimir Kotal throws IOException, InterruptedException { 583b5840353SAdam Hornáček 584b5840353SAdam Hornáček String date = DateTools.timeToString(file.lastModified(), 585b5840353SAdam Hornáček DateTools.Resolution.MILLISECOND); 586807ead8fSLubos Kosco path = Util.fixPathIfWindows(path); 587b5840353SAdam Hornáček doc.add(new Field(QueryBuilder.U, Util.path2uid(path, date), 588b5840353SAdam Hornáček string_ft_stored_nanalyzed_norms)); 589b5840353SAdam Hornáček doc.add(new Field(QueryBuilder.FULLPATH, file.getAbsolutePath(), 590b5840353SAdam Hornáček string_ft_nstored_nanalyzed_norms)); 591b5840353SAdam Hornáček doc.add(new SortedDocValuesField(QueryBuilder.FULLPATH, 592b5840353SAdam Hornáček new BytesRef(file.getAbsolutePath()))); 593b5840353SAdam Hornáček 594b5840353SAdam Hornáček if (RuntimeEnvironment.getInstance().isHistoryEnabled()) { 595b5840353SAdam Hornáček try { 59636b6a6f8SVladimir Kotal HistoryGuru histGuru = HistoryGuru.getInstance(); 59736b6a6f8SVladimir Kotal HistoryReader hr = histGuru.getHistoryReader(file); 598b5840353SAdam Hornáček if (hr != null) { 599b5840353SAdam Hornáček doc.add(new TextField(QueryBuilder.HIST, hr)); 
60036b6a6f8SVladimir Kotal History history; 60136b6a6f8SVladimir Kotal if ((history = histGuru.getHistory(file)) != null) { 60236b6a6f8SVladimir Kotal List<HistoryEntry> historyEntries = history.getHistoryEntries(1, 0); 60353e96f58SVladimir Kotal if (!historyEntries.isEmpty()) { 60436b6a6f8SVladimir Kotal HistoryEntry histEntry = historyEntries.get(0); 60536b6a6f8SVladimir Kotal doc.add(new TextField(QueryBuilder.LASTREV, histEntry.getRevision(), Store.YES)); 60636b6a6f8SVladimir Kotal } 60736b6a6f8SVladimir Kotal } 608b5840353SAdam Hornáček } 609b5840353SAdam Hornáček } catch (HistoryException e) { 610b5840353SAdam Hornáček LOGGER.log(Level.WARNING, "An error occurred while reading history: ", e); 611b5840353SAdam Hornáček } 612b5840353SAdam Hornáček } 613b5840353SAdam Hornáček doc.add(new Field(QueryBuilder.DATE, date, string_ft_stored_nanalyzed_norms)); 614b5840353SAdam Hornáček doc.add(new SortedDocValuesField(QueryBuilder.DATE, new BytesRef(date))); 615b5840353SAdam Hornáček 616b5840353SAdam Hornáček // `path' is not null, as it was passed to Util.path2uid() above. 617b5840353SAdam Hornáček doc.add(new TextField(QueryBuilder.PATH, path, Store.YES)); 618b5840353SAdam Hornáček Project project = Project.getProject(path); 619b5840353SAdam Hornáček if (project != null) { 620b5840353SAdam Hornáček doc.add(new TextField(QueryBuilder.PROJECT, project.getPath(), Store.YES)); 621b5840353SAdam Hornáček } 622b5840353SAdam Hornáček 623b5840353SAdam Hornáček /* 624b5840353SAdam Hornáček * Use the parent of the path -- not the absolute file as is done for 625b5840353SAdam Hornáček * FULLPATH -- so that DIRPATH is the same convention as for PATH 626b5840353SAdam Hornáček * above. A StringField, however, is used instead of a TextField. 
627b5840353SAdam Hornáček */ 628b5840353SAdam Hornáček File fpath = new File(path); 629b5840353SAdam Hornáček String fileParent = fpath.getParent(); 630b5840353SAdam Hornáček if (fileParent != null && fileParent.length() > 0) { 631b5840353SAdam Hornáček String normalizedPath = QueryBuilder.normalizeDirPath(fileParent); 632b5840353SAdam Hornáček StringField npstring = new StringField(QueryBuilder.DIRPATH, 633b5840353SAdam Hornáček normalizedPath, Store.NO); 634b5840353SAdam Hornáček doc.add(npstring); 635b5840353SAdam Hornáček } 636b5840353SAdam Hornáček 637b5840353SAdam Hornáček if (fa != null) { 63857eefa47SKryštof Tulinger AbstractAnalyzer.Genre g = fa.getGenre(); 63957eefa47SKryštof Tulinger if (g == AbstractAnalyzer.Genre.PLAIN || g == AbstractAnalyzer.Genre.XREFABLE || g == AbstractAnalyzer.Genre.HTML) { 640a72324b1SAdam Hornáček doc.add(new Field(QueryBuilder.T, g.typeName(), string_ft_stored_nanalyzed_norms)); 641b5840353SAdam Hornáček } 642b5840353SAdam Hornáček fa.analyze(doc, StreamSource.fromFile(file), xrefOut); 643b5840353SAdam Hornáček 644b5840353SAdam Hornáček String type = fa.getFileTypeName(); 645b5840353SAdam Hornáček doc.add(new StringField(QueryBuilder.TYPE, type, Store.YES)); 646b5840353SAdam Hornáček } 647b5840353SAdam Hornáček } 648b5840353SAdam Hornáček 649b5840353SAdam Hornáček /** 650ff44f24aSAdam Hornáček * Write a browse-able version of the file. 
651b5840353SAdam Hornáček * 652b5840353SAdam Hornáček * @param factory The analyzer factory for this file type 653b5840353SAdam Hornáček * @param in The input stream containing the data 654b5840353SAdam Hornáček * @param out Where to write the result 655b5840353SAdam Hornáček * @param defs definitions for the source file, if available 656b5840353SAdam Hornáček * @param annotation Annotation information for the file 657b5840353SAdam Hornáček * @param project Project the file belongs to 658b5840353SAdam Hornáček * @throws java.io.IOException If an error occurs while creating the output 659b5840353SAdam Hornáček */ writeXref(AnalyzerFactory factory, Reader in, Writer out, Definitions defs, Annotation annotation, Project project)66057eefa47SKryštof Tulinger public static void writeXref(AnalyzerFactory factory, Reader in, 661b5840353SAdam Hornáček Writer out, Definitions defs, 662b5840353SAdam Hornáček Annotation annotation, Project project) 663b5840353SAdam Hornáček throws IOException { 664b5840353SAdam Hornáček Reader input = in; 66557eefa47SKryštof Tulinger if (factory.getGenre() == AbstractAnalyzer.Genre.PLAIN) { 666b5840353SAdam Hornáček // This is some kind of text file, so we need to expand tabs to 667b5840353SAdam Hornáček // spaces to match the project's tab settings. 
668b5840353SAdam Hornáček input = ExpandTabsReader.wrap(in, project); 669b5840353SAdam Hornáček } 670b5840353SAdam Hornáček 671b5840353SAdam Hornáček WriteXrefArgs args = new WriteXrefArgs(input, out); 672b5840353SAdam Hornáček args.setDefs(defs); 673b5840353SAdam Hornáček args.setAnnotation(annotation); 674b5840353SAdam Hornáček args.setProject(project); 675b5840353SAdam Hornáček 67657eefa47SKryštof Tulinger AbstractAnalyzer analyzer = factory.getAnalyzer(); 677b5840353SAdam Hornáček RuntimeEnvironment env = RuntimeEnvironment.getInstance(); 678b5840353SAdam Hornáček analyzer.setScopesEnabled(env.isScopesEnabled()); 679b5840353SAdam Hornáček analyzer.setFoldingEnabled(env.isFoldingEnabled()); 680b5840353SAdam Hornáček analyzer.writeXref(args); 681b5840353SAdam Hornáček } 682b5840353SAdam Hornáček 683b5840353SAdam Hornáček /** 684b5840353SAdam Hornáček * Writes a browse-able version of the file transformed for immediate 685b5840353SAdam Hornáček * serving to a web client. 686b5840353SAdam Hornáček * @param contextPath the web context path for 687b5840353SAdam Hornáček * {@link Util#dumpXref(java.io.Writer, java.io.Reader, java.lang.String)} 688b5840353SAdam Hornáček * @param factory the analyzer factory for this file type 689b5840353SAdam Hornáček * @param in the input stream containing the data 690b5840353SAdam Hornáček * @param out a defined instance to write 691b5840353SAdam Hornáček * @param defs definitions for the source file, if available 692b5840353SAdam Hornáček * @param annotation annotation information for the file 693b5840353SAdam Hornáček * @param project project the file belongs to 694b5840353SAdam Hornáček * @throws java.io.IOException if an error occurs while creating the output 695b5840353SAdam Hornáček */ writeDumpedXref(String contextPath, AnalyzerFactory factory, Reader in, Writer out, Definitions defs, Annotation annotation, Project project)696b5840353SAdam Hornáček public static void writeDumpedXref(String contextPath, 69757eefa47SKryštof 
Tulinger AnalyzerFactory factory, Reader in, Writer out, 698b5840353SAdam Hornáček Definitions defs, Annotation annotation, Project project) 699b5840353SAdam Hornáček throws IOException { 700b5840353SAdam Hornáček 701b5840353SAdam Hornáček File xrefTemp = File.createTempFile("ogxref", ".html"); 702b5840353SAdam Hornáček try { 703b5840353SAdam Hornáček try (FileWriter tmpout = new FileWriter(xrefTemp)) { 704b5840353SAdam Hornáček writeXref(factory, in, tmpout, defs, annotation, project); 705b5840353SAdam Hornáček } 706b5840353SAdam Hornáček Util.dumpXref(out, xrefTemp, false, contextPath); 707b5840353SAdam Hornáček } finally { 70853e96f58SVladimir Kotal Files.delete(xrefTemp.toPath()); 709b5840353SAdam Hornáček } 710b5840353SAdam Hornáček } 711b5840353SAdam Hornáček 712b5840353SAdam Hornáček /** 713ff44f24aSAdam Hornáček * Get the genre of a file. 714b5840353SAdam Hornáček * 715b5840353SAdam Hornáček * @param file The file to inspect 716b5840353SAdam Hornáček * @return The genre suitable to decide how to display the file 717b5840353SAdam Hornáček */ getGenre(String file)71857eefa47SKryštof Tulinger public static AbstractAnalyzer.Genre getGenre(String file) { 719b5840353SAdam Hornáček return getGenre(find(file)); 720b5840353SAdam Hornáček } 721b5840353SAdam Hornáček 722b5840353SAdam Hornáček /** 723ff44f24aSAdam Hornáček * Get the genre of a bulk of data. 
724b5840353SAdam Hornáček * 725b5840353SAdam Hornáček * @param in A stream containing the data 726b5840353SAdam Hornáček * @return The genre suitable to decide how to display the file 727b5840353SAdam Hornáček * @throws java.io.IOException If an error occurs while getting the content 728b5840353SAdam Hornáček */ getGenre(InputStream in)72957eefa47SKryštof Tulinger public static AbstractAnalyzer.Genre getGenre(InputStream in) throws IOException { 730b5840353SAdam Hornáček return getGenre(find(in)); 731b5840353SAdam Hornáček } 732b5840353SAdam Hornáček 733b5840353SAdam Hornáček /** 734ff44f24aSAdam Hornáček * Get the genre for a named class (this is most likely an analyzer). 735b5840353SAdam Hornáček * 736b5840353SAdam Hornáček * @param factory the analyzer factory to get the genre for 737b5840353SAdam Hornáček * @return The genre of this class (null if not found) 738b5840353SAdam Hornáček */ getGenre(AnalyzerFactory factory)73957eefa47SKryštof Tulinger public static AbstractAnalyzer.Genre getGenre(AnalyzerFactory factory) { 740b5840353SAdam Hornáček if (factory != null) { 741b5840353SAdam Hornáček return factory.getGenre(); 742b5840353SAdam Hornáček } 743b5840353SAdam Hornáček return null; 744b5840353SAdam Hornáček } 745b5840353SAdam Hornáček 746b5840353SAdam Hornáček /** 747ee13dbaeSChris Fraire * Finds a {@code FileAnalyzerFactory} for the specified 748ee13dbaeSChris Fraire * {@link FileAnalyzer#getFileTypeName()}. 749ee13dbaeSChris Fraire * @param fileTypeName a defined instance 750ee13dbaeSChris Fraire * @return a defined instance or {@code null} 751ee13dbaeSChris Fraire */ findByFileTypeName(String fileTypeName)75257eefa47SKryštof Tulinger public static AnalyzerFactory findByFileTypeName(String fileTypeName) { 753ee13dbaeSChris Fraire return FILETYPE_FACTORIES.get(fileTypeName); 754ee13dbaeSChris Fraire } 755ee13dbaeSChris Fraire 756ee13dbaeSChris Fraire /** 757b5840353SAdam Hornáček * Find a {@code FileAnalyzerFactory} with the specified class name. 
If one 758b5840353SAdam Hornáček * doesn't exist, create one and register it. Allow specification of either 759b5840353SAdam Hornáček * the complete class name (which includes the package name) or the simple 760b5840353SAdam Hornáček * name of the class. 761b5840353SAdam Hornáček * 762b5840353SAdam Hornáček * @param factoryClassName name of the factory class 763b5840353SAdam Hornáček * @return a file analyzer factory 764b5840353SAdam Hornáček * 765b5840353SAdam Hornáček * @throws ClassNotFoundException if there is no class with that name 766b5840353SAdam Hornáček * @throws ClassCastException if the class is not a subclass of {@code 767b5840353SAdam Hornáček * FileAnalyzerFactory} 768b5840353SAdam Hornáček * @throws IllegalAccessException if the constructor cannot be accessed 769b5840353SAdam Hornáček * @throws InstantiationException if the class cannot be instantiated 770b5840353SAdam Hornáček * @throws NoSuchMethodException if no-argument constructor could not be found 771b5840353SAdam Hornáček * @throws InvocationTargetException if the underlying constructor throws an exception 772b5840353SAdam Hornáček */ findFactory(String factoryClassName)77357eefa47SKryštof Tulinger public static AnalyzerFactory findFactory(String factoryClassName) 774b5840353SAdam Hornáček throws ClassNotFoundException, IllegalAccessException, InstantiationException, NoSuchMethodException, 775b5840353SAdam Hornáček InvocationTargetException { 776c5ef7ff6SChris Fraire Class<?> fcn; 777b5840353SAdam Hornáček try { 778b5840353SAdam Hornáček fcn = Class.forName(factoryClassName); 779b5840353SAdam Hornáček 780b5840353SAdam Hornáček } catch (ClassNotFoundException e) { 781b5840353SAdam Hornáček fcn = getFactoryClass(factoryClassName); 782b5840353SAdam Hornáček 783b5840353SAdam Hornáček if (fcn == null) { 784b5840353SAdam Hornáček throw new ClassNotFoundException("Unable to locate class " + factoryClassName); 785b5840353SAdam Hornáček } 786b5840353SAdam Hornáček } 787b5840353SAdam Hornáček 
788b5840353SAdam Hornáček return findFactory(fcn); 789b5840353SAdam Hornáček } 790b5840353SAdam Hornáček 791b5840353SAdam Hornáček /** 792b5840353SAdam Hornáček * Get Analyzer factory class using class simple name. 793b5840353SAdam Hornáček * 794b5840353SAdam Hornáček * @param simpleName which may be either the factory class 795b5840353SAdam Hornáček * simple name (eg. CAnalyzerFactory), the analyzer name 796b5840353SAdam Hornáček * (eg. CAnalyzer), or the language name (eg. C) 797b5840353SAdam Hornáček * 798b5840353SAdam Hornáček * @return the analyzer factory class, or null when not found. 799b5840353SAdam Hornáček */ getFactoryClass(String simpleName)800b5840353SAdam Hornáček public static Class<?> getFactoryClass(String simpleName) { 801b5840353SAdam Hornáček Class<?> factoryClass = null; 802b5840353SAdam Hornáček 803b5840353SAdam Hornáček // Build analysis package name list first time only 804b5840353SAdam Hornáček if (analysisPkgNames.isEmpty()) { 805b5840353SAdam Hornáček Package[] p = Package.getPackages(); 806b5840353SAdam Hornáček for (Package pp : p) { 807b5840353SAdam Hornáček String pname = pp.getName(); 808ff44f24aSAdam Hornáček if (pname.contains(".analysis.")) { 809b5840353SAdam Hornáček analysisPkgNames.add(pname); 810b5840353SAdam Hornáček } 811b5840353SAdam Hornáček } 812b5840353SAdam Hornáček } 813b5840353SAdam Hornáček 814b5840353SAdam Hornáček // This allows user to enter the language or analyzer name 815b5840353SAdam Hornáček // (eg. C or CAnalyzer vs. 
CAnalyzerFactory) 816b5840353SAdam Hornáček // Note that this assumes a regular naming scheme of 817b5840353SAdam Hornáček // all language parsers: 818b5840353SAdam Hornáček // <language>Analyzer, <language>AnalyzerFactory 819b5840353SAdam Hornáček 820ff44f24aSAdam Hornáček if (!simpleName.contains("Analyzer")) { 821b5840353SAdam Hornáček simpleName += "Analyzer"; 822b5840353SAdam Hornáček } 823b5840353SAdam Hornáček 824ff44f24aSAdam Hornáček if (!simpleName.contains("Factory")) { 825b5840353SAdam Hornáček simpleName += "Factory"; 826b5840353SAdam Hornáček } 827b5840353SAdam Hornáček 828b5840353SAdam Hornáček for (String aPackage : analysisPkgNames) { 829b5840353SAdam Hornáček try { 830b5840353SAdam Hornáček String fqn = aPackage + "." + simpleName; 831b5840353SAdam Hornáček factoryClass = Class.forName(fqn); 832b5840353SAdam Hornáček break; 833b5840353SAdam Hornáček } catch (ClassNotFoundException e) { 834b5840353SAdam Hornáček // Ignore 835b5840353SAdam Hornáček } 836b5840353SAdam Hornáček } 837b5840353SAdam Hornáček 838b5840353SAdam Hornáček return factoryClass; 839b5840353SAdam Hornáček } 840b5840353SAdam Hornáček 841b5840353SAdam Hornáček /** 842b5840353SAdam Hornáček * Find a {@code FileAnalyzerFactory} which is an instance of the specified 843b5840353SAdam Hornáček * class. If one doesn't exist, create one and register it. 
844b5840353SAdam Hornáček * 845b5840353SAdam Hornáček * @param factoryClass the factory class 846b5840353SAdam Hornáček * @return a file analyzer factory 847b5840353SAdam Hornáček * 848b5840353SAdam Hornáček * @throws ClassCastException if the class is not a subclass of {@code 849b5840353SAdam Hornáček * FileAnalyzerFactory} 850b5840353SAdam Hornáček * @throws IllegalAccessException if the constructor cannot be accessed 851b5840353SAdam Hornáček * @throws InstantiationException if the class cannot be instantiated 852b5840353SAdam Hornáček * @throws NoSuchMethodException if no-argument constructor could not be found 853b5840353SAdam Hornáček * @throws InvocationTargetException if the underlying constructor throws an exception 854b5840353SAdam Hornáček */ findFactory(Class<?> factoryClass)85557eefa47SKryštof Tulinger private static AnalyzerFactory findFactory(Class<?> factoryClass) 856b5840353SAdam Hornáček throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException { 85757eefa47SKryštof Tulinger for (AnalyzerFactory f : factories) { 858b5840353SAdam Hornáček if (f.getClass() == factoryClass) { 859b5840353SAdam Hornáček return f; 860b5840353SAdam Hornáček } 861b5840353SAdam Hornáček } 86257eefa47SKryštof Tulinger AnalyzerFactory f = (AnalyzerFactory) factoryClass.getDeclaredConstructor().newInstance(); 863b5840353SAdam Hornáček registerAnalyzer(f); 864b5840353SAdam Hornáček return f; 865b5840353SAdam Hornáček } 866b5840353SAdam Hornáček 867b5840353SAdam Hornáček /** 868b5840353SAdam Hornáček * Finds a suitable analyser class for file name. If the analyzer cannot be 869b5840353SAdam Hornáček * determined by the file extension, try to look at the data in the 870b5840353SAdam Hornáček * InputStream to find a suitable analyzer. 871b5840353SAdam Hornáček * 872b5840353SAdam Hornáček * Use if you just want to find file type. 
873b5840353SAdam Hornáček * 874b5840353SAdam Hornáček * @param in The input stream containing the data 875b5840353SAdam Hornáček * @param file The file name to get the analyzer for 876b5840353SAdam Hornáček * @return the analyzer factory to use 87753e96f58SVladimir Kotal * @throws java.io.IOException If a problem occurred while reading the data 878b5840353SAdam Hornáček */ find(InputStream in, String file)87953e96f58SVladimir Kotal public static AnalyzerFactory find(InputStream in, String file) throws IOException { 88057eefa47SKryštof Tulinger AnalyzerFactory factory = find(file); 88153e96f58SVladimir Kotal 882b5840353SAdam Hornáček if (factory != null) { 883b5840353SAdam Hornáček return factory; 884b5840353SAdam Hornáček } 885b5840353SAdam Hornáček return findForStream(in, file); 886b5840353SAdam Hornáček } 887b5840353SAdam Hornáček 888b5840353SAdam Hornáček /** 889b5840353SAdam Hornáček * Finds a suitable analyser class for file name. 890b5840353SAdam Hornáček * 891b5840353SAdam Hornáček * @param file The file name to get the analyzer for 892b5840353SAdam Hornáček * @return the analyzer factory to use 893b5840353SAdam Hornáček */ find(String file)89457eefa47SKryštof Tulinger public static AnalyzerFactory find(String file) { 895b5840353SAdam Hornáček String path = file; 896b5840353SAdam Hornáček int i; 897b5840353SAdam Hornáček 898b5840353SAdam Hornáček // Get basename of the file first. 899b5840353SAdam Hornáček if (((i = path.lastIndexOf(File.separatorChar)) > 0) 900b5840353SAdam Hornáček && (i + 1 < path.length())) { 901b5840353SAdam Hornáček path = path.substring(i + 1); 902b5840353SAdam Hornáček } 903b5840353SAdam Hornáček 904b5840353SAdam Hornáček int dotpos = path.lastIndexOf('.'); 905b5840353SAdam Hornáček if (dotpos >= 0) { 90657eefa47SKryštof Tulinger AnalyzerFactory factory; 907b5840353SAdam Hornáček 908b5840353SAdam Hornáček // Try matching the prefix. 
909b5840353SAdam Hornáček if (dotpos > 0) { 91052dccac1SChris Fraire factory = pre.get(path.substring(0, dotpos).toUpperCase(Locale.ROOT)); 911b5840353SAdam Hornáček if (factory != null) { 91230bba29fSChris Fraire if (LOGGER.isLoggable(Level.FINEST)) { 91330bba29fSChris Fraire LOGGER.log(Level.FINEST, "{0}: chosen by prefix: {1}", 914b5840353SAdam Hornáček new Object[]{file, 915b5840353SAdam Hornáček factory.getClass().getSimpleName() }); 916b5840353SAdam Hornáček } 917b5840353SAdam Hornáček return factory; 918b5840353SAdam Hornáček } 919b5840353SAdam Hornáček } 920b5840353SAdam Hornáček 921b5840353SAdam Hornáček // Now try matching the suffix. We kind of consider this order (first 922b5840353SAdam Hornáček // prefix then suffix) to be workable although for sure there can be 923b5840353SAdam Hornáček // cases when this does not work. 92452dccac1SChris Fraire factory = ext.get(path.substring(dotpos + 1).toUpperCase(Locale.ROOT)); 925b5840353SAdam Hornáček if (factory != null) { 92630bba29fSChris Fraire if (LOGGER.isLoggable(Level.FINEST)) { 92730bba29fSChris Fraire LOGGER.log(Level.FINEST, "{0}: chosen by suffix: {1}", 928b5840353SAdam Hornáček new Object[]{file, 929b5840353SAdam Hornáček factory.getClass().getSimpleName() }); 930b5840353SAdam Hornáček } 931b5840353SAdam Hornáček return factory; 932b5840353SAdam Hornáček } 933b5840353SAdam Hornáček } 934b5840353SAdam Hornáček 935b5840353SAdam Hornáček // file doesn't have any of the prefix or extensions we know, try full match 93652dccac1SChris Fraire return FILE_NAMES.get(path.toUpperCase(Locale.ROOT)); 937b5840353SAdam Hornáček } 938b5840353SAdam Hornáček 939b5840353SAdam Hornáček /** 940ff44f24aSAdam Hornáček * Finds a suitable analyzer class for the data in this stream. 
941b5840353SAdam Hornáček * 942b5840353SAdam Hornáček * @param in The stream containing the data to analyze 943b5840353SAdam Hornáček * @return the analyzer factory to use 944b5840353SAdam Hornáček * @throws java.io.IOException if an error occurs while reading data from 945b5840353SAdam Hornáček * the stream 946b5840353SAdam Hornáček */ find(InputStream in)94757eefa47SKryštof Tulinger public static AnalyzerFactory find(InputStream in) throws IOException { 948b5840353SAdam Hornáček return findForStream(in, "<anonymous>"); 949b5840353SAdam Hornáček } 950b5840353SAdam Hornáček 951b5840353SAdam Hornáček /** 952b5840353SAdam Hornáček * Finds a suitable analyzer class for the data in this stream 953ff44f24aSAdam Hornáček * corresponding to a file of the specified name. 954b5840353SAdam Hornáček * 955b5840353SAdam Hornáček * @param in The stream containing the data to analyze 956b5840353SAdam Hornáček * @param file The file name to get the analyzer for 957b5840353SAdam Hornáček * @return the analyzer factory to use 958b5840353SAdam Hornáček * @throws java.io.IOException if an error occurs while reading data from 959b5840353SAdam Hornáček * the stream 960b5840353SAdam Hornáček */ findForStream(InputStream in, String file)96157eefa47SKryštof Tulinger private static AnalyzerFactory findForStream(InputStream in, 962b5840353SAdam Hornáček String file) throws IOException { 963b5840353SAdam Hornáček 964b5840353SAdam Hornáček in.mark(MAGIC_BYTES_NUM); 965b5840353SAdam Hornáček byte[] content = new byte[MAGIC_BYTES_NUM]; 966b5840353SAdam Hornáček int len = in.read(content); 967b5840353SAdam Hornáček in.reset(); 968b5840353SAdam Hornáček 969b5840353SAdam Hornáček if (len < MAGIC_BYTES_NUM) { 970b5840353SAdam Hornáček /* 971b5840353SAdam Hornáček * Need at least 4 bytes to perform magic string matching. 
972b5840353SAdam Hornáček */ 973b5840353SAdam Hornáček if (len < 4) { 974b5840353SAdam Hornáček return null; 975b5840353SAdam Hornáček } 976b5840353SAdam Hornáček content = Arrays.copyOf(content, len); 977b5840353SAdam Hornáček } 978b5840353SAdam Hornáček 97957eefa47SKryštof Tulinger AnalyzerFactory fac; 980b5840353SAdam Hornáček 981b5840353SAdam Hornáček // First, do precise-magic Matcher matching 982b5840353SAdam Hornáček for (FileAnalyzerFactory.Matcher matcher : matchers) { 983*750b3115SChris Fraire if (matcher.isPreciseMagic()) { 984b5840353SAdam Hornáček fac = matcher.isMagic(content, in); 985b5840353SAdam Hornáček if (fac != null) { 98630bba29fSChris Fraire if (LOGGER.isLoggable(Level.FINEST)) { 98730bba29fSChris Fraire LOGGER.log(Level.FINEST, 988b5840353SAdam Hornáček "{0}: chosen by precise magic: {1}", new Object[]{ 989b5840353SAdam Hornáček file, fac.getClass().getSimpleName() }); 990b5840353SAdam Hornáček } 991b5840353SAdam Hornáček return fac; 992b5840353SAdam Hornáček } 993b5840353SAdam Hornáček } 994b5840353SAdam Hornáček } 995b5840353SAdam Hornáček 996b5840353SAdam Hornáček // Next, look for magic strings 997b5840353SAdam Hornáček String opening = readOpening(in, content); 998b5840353SAdam Hornáček fac = findMagicString(opening, file); 999b5840353SAdam Hornáček if (fac != null) { 1000b5840353SAdam Hornáček return fac; 1001b5840353SAdam Hornáček } 1002b5840353SAdam Hornáček 1003b5840353SAdam Hornáček // Last, do imprecise-magic Matcher matching 1004b5840353SAdam Hornáček for (FileAnalyzerFactory.Matcher matcher : matchers) { 1005*750b3115SChris Fraire if (!matcher.isPreciseMagic()) { 1006b5840353SAdam Hornáček fac = matcher.isMagic(content, in); 1007b5840353SAdam Hornáček if (fac != null) { 100830bba29fSChris Fraire if (LOGGER.isLoggable(Level.FINEST)) { 100930bba29fSChris Fraire LOGGER.log(Level.FINEST, 1010b5840353SAdam Hornáček "{0}: chosen by imprecise magic: {1}", 1011b5840353SAdam Hornáček new Object[]{file, 1012b5840353SAdam Hornáček 
fac.getClass().getSimpleName() }); 1013b5840353SAdam Hornáček } 1014b5840353SAdam Hornáček return fac; 1015b5840353SAdam Hornáček } 1016b5840353SAdam Hornáček } 1017b5840353SAdam Hornáček } 1018b5840353SAdam Hornáček 1019b5840353SAdam Hornáček return null; 1020b5840353SAdam Hornáček } 1021b5840353SAdam Hornáček findMagicString(String opening, String file)1022c5ef7ff6SChris Fraire private static AnalyzerFactory findMagicString(String opening, String file) { 1023b5840353SAdam Hornáček 1024b5840353SAdam Hornáček // first, try to look up two words in magics 1025b5840353SAdam Hornáček String fragment = getWords(opening, 2); 102657eefa47SKryštof Tulinger AnalyzerFactory fac = magics.get(fragment); 1027b5840353SAdam Hornáček if (fac != null) { 102830bba29fSChris Fraire if (LOGGER.isLoggable(Level.FINEST)) { 102930bba29fSChris Fraire LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}", 1030b5840353SAdam Hornáček new Object[]{file, fac.getClass().getSimpleName(), 1031b5840353SAdam Hornáček fragment}); 1032b5840353SAdam Hornáček } 1033b5840353SAdam Hornáček return fac; 1034b5840353SAdam Hornáček } 1035b5840353SAdam Hornáček 1036b5840353SAdam Hornáček // second, try to look up one word in magics 1037b5840353SAdam Hornáček fragment = getWords(opening, 1); 1038b5840353SAdam Hornáček fac = magics.get(fragment); 1039b5840353SAdam Hornáček if (fac != null) { 104030bba29fSChris Fraire if (LOGGER.isLoggable(Level.FINEST)) { 104130bba29fSChris Fraire LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}", 1042b5840353SAdam Hornáček new Object[]{file, fac.getClass().getSimpleName(), 1043b5840353SAdam Hornáček fragment}); 1044b5840353SAdam Hornáček } 1045b5840353SAdam Hornáček return fac; 1046b5840353SAdam Hornáček } 1047b5840353SAdam Hornáček 1048b5840353SAdam Hornáček // try to match initial substrings (DESC strlen) 104957eefa47SKryštof Tulinger for (Map.Entry<String, AnalyzerFactory> entry : 1050b5840353SAdam Hornáček magics.entrySet()) { 1051b5840353SAdam Hornáček String 
magic = entry.getKey(); 1052b5840353SAdam Hornáček if (opening.startsWith(magic)) { 1053b5840353SAdam Hornáček fac = entry.getValue(); 105430bba29fSChris Fraire if (LOGGER.isLoggable(Level.FINEST)) { 105530bba29fSChris Fraire LOGGER.log(Level.FINEST, 1056b5840353SAdam Hornáček "{0}: chosen by magic(substr) {2}: {1}", new Object[]{ 1057b5840353SAdam Hornáček file, fac.getClass().getSimpleName(), magic}); 1058b5840353SAdam Hornáček } 1059b5840353SAdam Hornáček return fac; 1060b5840353SAdam Hornáček } 1061b5840353SAdam Hornáček } 1062b5840353SAdam Hornáček 1063b5840353SAdam Hornáček return null; 1064b5840353SAdam Hornáček } 1065b5840353SAdam Hornáček 1066b5840353SAdam Hornáček /** 1067b5840353SAdam Hornáček * Extract initial words from a String, or take the entire 1068b5840353SAdam Hornáček * <code>value</code> if not enough words can be identified. (If 1069b5840353SAdam Hornáček * <code>n</code> is not 1 or more, returns an empty String.) (A "word" 1070b5840353SAdam Hornáček * ends at each and every space character.) 1071b5840353SAdam Hornáček * 1072b5840353SAdam Hornáček * @param value The source from which words are cut 1073b5840353SAdam Hornáček * @param n The number of words to try to extract 1074b5840353SAdam Hornáček * @return The extracted words or <code>""</code> 1075b5840353SAdam Hornáček */ getWords(String value, int n)1076b5840353SAdam Hornáček private static String getWords(String value, int n) { 1077a72324b1SAdam Hornáček if (n < 1) { 1078a72324b1SAdam Hornáček return ""; 1079a72324b1SAdam Hornáček } 1080b5840353SAdam Hornáček int l = 0; 1081b5840353SAdam Hornáček while (n-- > 0) { 1082b5840353SAdam Hornáček int o = l > 0 ? 
l + 1 : l; 1083b5840353SAdam Hornáček int i = value.indexOf(' ', o); 1084a72324b1SAdam Hornáček if (i == -1) { 1085a72324b1SAdam Hornáček return value; 1086a72324b1SAdam Hornáček } 1087b5840353SAdam Hornáček l = i; 1088b5840353SAdam Hornáček } 1089b5840353SAdam Hornáček return value.substring(0, l); 1090b5840353SAdam Hornáček } 1091b5840353SAdam Hornáček 1092b5840353SAdam Hornáček /** 1093b5840353SAdam Hornáček * Extract an opening string from the input stream, past any BOM, and past 1094b5840353SAdam Hornáček * any initial whitespace, but only up to <code>OPENING_MAX_CHARS</code> or 1095b5840353SAdam Hornáček * to the first <code>\n</code> after any non-whitespace. (Hashbang, #!, 1096b5840353SAdam Hornáček * openings will have superfluous space removed.) 1097b5840353SAdam Hornáček * 1098b5840353SAdam Hornáček * @param in The input stream containing the data 1099b5840353SAdam Hornáček * @param sig The initial sequence of bytes in the input stream 1100b5840353SAdam Hornáček * @return The extracted string or <code>""</code> 1101b5840353SAdam Hornáček * @throws java.io.IOException in case of any read error 1102b5840353SAdam Hornáček */ readOpening(InputStream in, byte[] sig)1103b5840353SAdam Hornáček private static String readOpening(InputStream in, byte[] sig) 1104b5840353SAdam Hornáček throws IOException { 1105b5840353SAdam Hornáček 1106b5840353SAdam Hornáček in.mark(MARK_READ_LIMIT); 1107b5840353SAdam Hornáček 1108b5840353SAdam Hornáček String encoding = IOUtils.findBOMEncoding(sig); 1109b5840353SAdam Hornáček if (encoding == null) { 1110b5840353SAdam Hornáček // SRCROOT is read with UTF-8 as a default. 
1111b5840353SAdam Hornáček encoding = StandardCharsets.UTF_8.name(); 1112b5840353SAdam Hornáček } else { 1113b5840353SAdam Hornáček int skipForBOM = IOUtils.skipForBOM(sig); 1114b5840353SAdam Hornáček if (in.skip(skipForBOM) < skipForBOM) { 1115b5840353SAdam Hornáček in.reset(); 1116b5840353SAdam Hornáček return ""; 1117b5840353SAdam Hornáček } 1118b5840353SAdam Hornáček } 1119b5840353SAdam Hornáček 1120b5840353SAdam Hornáček int nRead = 0; 1121b5840353SAdam Hornáček boolean sawNonWhitespace = false; 1122b5840353SAdam Hornáček boolean lastWhitespace = false; 1123b5840353SAdam Hornáček boolean postHashbang = false; 1124b5840353SAdam Hornáček int r; 1125b5840353SAdam Hornáček 1126b5840353SAdam Hornáček StringBuilder opening = new StringBuilder(); 112753e96f58SVladimir Kotal BufferedReader readr = new BufferedReader(new InputStreamReader(in, encoding), OPENING_MAX_CHARS); 1128b5840353SAdam Hornáček while ((r = readr.read()) != -1) { 1129a72324b1SAdam Hornáček if (++nRead > OPENING_MAX_CHARS) { 1130a72324b1SAdam Hornáček break; 1131a72324b1SAdam Hornáček } 1132b5840353SAdam Hornáček char c = (char) r; 1133b5840353SAdam Hornáček boolean isWhitespace = Character.isWhitespace(c); 1134b5840353SAdam Hornáček if (!sawNonWhitespace) { 1135a72324b1SAdam Hornáček if (isWhitespace) { 1136a72324b1SAdam Hornáček continue; 1137a72324b1SAdam Hornáček } 1138b5840353SAdam Hornáček sawNonWhitespace = true; 1139b5840353SAdam Hornáček } 1140a72324b1SAdam Hornáček if (c == '\n') { 1141a72324b1SAdam Hornáček break; 1142a72324b1SAdam Hornáček } 1143b5840353SAdam Hornáček 1144b5840353SAdam Hornáček if (isWhitespace) { 1145b5840353SAdam Hornáček // Track `lastWhitespace' to condense stretches of whitespace, 1146b5840353SAdam Hornáček // and use ' ' regardless of actual whitespace character to 1147b5840353SAdam Hornáček // accord with magic string definitions. 
1148a72324b1SAdam Hornáček if (!lastWhitespace && !postHashbang) { 1149a72324b1SAdam Hornáček opening.append(' '); 1150a72324b1SAdam Hornáček } 1151b5840353SAdam Hornáček } else { 1152b5840353SAdam Hornáček opening.append(c); 1153b5840353SAdam Hornáček postHashbang = false; 1154b5840353SAdam Hornáček } 1155b5840353SAdam Hornáček lastWhitespace = isWhitespace; 1156b5840353SAdam Hornáček 1157b5840353SAdam Hornáček // If the opening starts with "#!", then track so that any 1158b5840353SAdam Hornáček // trailing whitespace after the hashbang is ignored. 115953e96f58SVladimir Kotal if (opening.length() == 2 && opening.charAt(0) == '#' && opening.charAt(1) == '!') { 1160b5840353SAdam Hornáček postHashbang = true; 1161b5840353SAdam Hornáček } 1162b5840353SAdam Hornáček } 1163b5840353SAdam Hornáček 1164b5840353SAdam Hornáček in.reset(); 1165b5840353SAdam Hornáček return opening.toString(); 1166b5840353SAdam Hornáček } 1167b5840353SAdam Hornáček addCustomizationKey(String k)1168b5840353SAdam Hornáček private static void addCustomizationKey(String k) { 1169b5840353SAdam Hornáček CUSTOMIZATION_KEYS.add(k); 1170b5840353SAdam Hornáček Object[] keys = CUSTOMIZATION_KEYS.toArray(); 1171b5840353SAdam Hornáček customizationHashCode = Objects.hash(keys); 1172b5840353SAdam Hornáček } 1173b5840353SAdam Hornáček factoriesDifferent(AnalyzerFactory a, AnalyzerFactory b)117453e96f58SVladimir Kotal private static boolean factoriesDifferent(AnalyzerFactory a, AnalyzerFactory b) { 117553e96f58SVladimir Kotal String aName = null; 1176b5840353SAdam Hornáček if (a != null) { 117753e96f58SVladimir Kotal aName = a.getName(); 117853e96f58SVladimir Kotal if (aName == null) { 117953e96f58SVladimir Kotal aName = a.getClass().getSimpleName(); 1180b5840353SAdam Hornáček } 1181b5840353SAdam Hornáček } 118253e96f58SVladimir Kotal String bName = null; 1183b5840353SAdam Hornáček if (b != null) { 118453e96f58SVladimir Kotal bName = b.getName(); 118553e96f58SVladimir Kotal if (bName == null) { 
118653e96f58SVladimir Kotal bName = b.getClass().getSimpleName(); 1187b5840353SAdam Hornáček } 1188b5840353SAdam Hornáček } 118953e96f58SVladimir Kotal if (aName == null && bName == null) { 1190b5840353SAdam Hornáček return false; 1191b5840353SAdam Hornáček } 119253e96f58SVladimir Kotal return aName == null || !aName.equals(bName); 1193b5840353SAdam Hornáček } 1194b5840353SAdam Hornáček } 1195