/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2017, 2021, Chris Fraire <cfraire@me.com>.
 */
package org.opengrok.indexer.analysis;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Writer;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.util.BytesRef;
import org.opengrok.indexer.analysis.FileAnalyzerFactory.Matcher;
import org.opengrok.indexer.analysis.ada.AdaAnalyzerFactory;
import org.opengrok.indexer.analysis.archive.BZip2AnalyzerFactory;
import org.opengrok.indexer.analysis.archive.GZIPAnalyzerFactory;
import org.opengrok.indexer.analysis.archive.TarAnalyzerFactory;
import org.opengrok.indexer.analysis.archive.ZipAnalyzerFactory;
import org.opengrok.indexer.analysis.asm.AsmAnalyzerFactory;
import org.opengrok.indexer.analysis.c.CAnalyzerFactory;
import org.opengrok.indexer.analysis.c.CxxAnalyzerFactory;
import org.opengrok.indexer.analysis.clojure.ClojureAnalyzerFactory;
import org.opengrok.indexer.analysis.csharp.CSharpAnalyzerFactory;
import org.opengrok.indexer.analysis.data.IgnorantAnalyzerFactory;
import org.opengrok.indexer.analysis.data.ImageAnalyzerFactory;
import org.opengrok.indexer.analysis.document.MandocAnalyzerFactory;
import org.opengrok.indexer.analysis.document.TroffAnalyzerFactory;
import org.opengrok.indexer.analysis.eiffel.EiffelAnalyzerFactory;
import org.opengrok.indexer.analysis.erlang.ErlangAnalyzerFactory;
import org.opengrok.indexer.analysis.executables.ELFAnalyzerFactory;
import org.opengrok.indexer.analysis.executables.JarAnalyzerFactory;
import org.opengrok.indexer.analysis.executables.JavaClassAnalyzerFactory;
import org.opengrok.indexer.analysis.fortran.FortranAnalyzerFactory;
import org.opengrok.indexer.analysis.golang.GolangAnalyzerFactory;
import org.opengrok.indexer.analysis.haskell.HaskellAnalyzerFactory;
import org.opengrok.indexer.analysis.hcl.HCLAnalyzerFactory;
import org.opengrok.indexer.analysis.java.JavaAnalyzerFactory;
import org.opengrok.indexer.analysis.javascript.JavaScriptAnalyzerFactory;
import org.opengrok.indexer.analysis.json.JsonAnalyzerFactory;
import org.opengrok.indexer.analysis.kotlin.KotlinAnalyzerFactory;
import org.opengrok.indexer.analysis.lisp.LispAnalyzerFactory;
import org.opengrok.indexer.analysis.lua.LuaAnalyzerFactory;
import org.opengrok.indexer.analysis.pascal.PascalAnalyzerFactory;
import org.opengrok.indexer.analysis.perl.PerlAnalyzerFactory;
import org.opengrok.indexer.analysis.php.PhpAnalyzerFactory;
import org.opengrok.indexer.analysis.plain.PlainAnalyzerFactory;
import org.opengrok.indexer.analysis.plain.XMLAnalyzerFactory;
import org.opengrok.indexer.analysis.powershell.PowershellAnalyzerFactory;
import org.opengrok.indexer.analysis.python.PythonAnalyzerFactory;
import org.opengrok.indexer.analysis.r.RAnalyzerFactory;
import org.opengrok.indexer.analysis.ruby.RubyAnalyzerFactory;
import org.opengrok.indexer.analysis.rust.RustAnalyzerFactory;
import org.opengrok.indexer.analysis.scala.ScalaAnalyzerFactory;
import org.opengrok.indexer.analysis.sh.ShAnalyzerFactory;
import org.opengrok.indexer.analysis.sql.PLSQLAnalyzerFactory;
import org.opengrok.indexer.analysis.sql.SQLAnalyzerFactory;
import org.opengrok.indexer.analysis.swift.SwiftAnalyzerFactory;
import org.opengrok.indexer.analysis.tcl.TclAnalyzerFactory;
import org.opengrok.indexer.analysis.terraform.TerraformAnalyzerFactory;
import org.opengrok.indexer.analysis.typescript.TypeScriptAnalyzerFactory;
import org.opengrok.indexer.analysis.uue.UuencodeAnalyzerFactory;
import org.opengrok.indexer.analysis.vb.VBAnalyzerFactory;
import org.opengrok.indexer.analysis.verilog.VerilogAnalyzerFactory;
import org.opengrok.indexer.configuration.Project;
import org.opengrok.indexer.configuration.RuntimeEnvironment;
import org.opengrok.indexer.history.Annotation;
import org.opengrok.indexer.history.History;
import org.opengrok.indexer.history.HistoryEntry;
import org.opengrok.indexer.history.HistoryException;
import org.opengrok.indexer.history.HistoryGuru;
import org.opengrok.indexer.history.HistoryReader;
import org.opengrok.indexer.logger.LoggerFactory;
import org.opengrok.indexer.search.QueryBuilder;
import org.opengrok.indexer.util.IOUtils;
import org.opengrok.indexer.web.Util;

/**
 * Manages and provides Analyzers as needed. Please see
 * <a href="https://github.com/oracle/opengrok/wiki/Internals">
 * this</a> page for a great description of the purpose of the AnalyzerGuru.
 *
 * Created on September 22, 2005
 *
 * @author Chandan
 */
public class AnalyzerGuru {

    /**
     * The maximum number of characters (multi-byte if a BOM is identified) to
     * read from the input stream to be used for magic string matching.
     */
    private static final int OPENING_MAX_CHARS = 100;

    /**
     * Set to 16K -- though debugging shows it would do with only 8K+3
     * (standard buffer for Java BufferedInputStream plus 3 bytes for largest UTF BOM).
     */
    private static final int MARK_READ_LIMIT = 1024 * 16;

    /**
     * The number of bytes read from the start of the file for magic number or
     * string analysis. Some {@link FileAnalyzerFactory.Matcher}
     * implementations may read more data subsequently, but this field defines
     * the number of bytes initially read for general matching.
     */
    private static final int MAGIC_BYTES_NUM = 8;

    private static final Logger LOGGER = LoggerFactory.getLogger(AnalyzerGuru.class);

    /**
     * The default {@code FileAnalyzerFactory} instance.
     */
    private static final AnalyzerFactory DEFAULT_ANALYZER_FACTORY = new FileAnalyzerFactory();

    /**
     * Map from file names to analyzer factories.
     */
    private static final Map<String, AnalyzerFactory> FILE_NAMES = new HashMap<>();

    /**
     * Map from file extensions to analyzer factories.
     */
    private static final Map<String, AnalyzerFactory> ext = new HashMap<>();

    /**
     * Map from file prefixes to analyzer factories.
     */
    private static final Map<String, AnalyzerFactory> pre = new HashMap<>();

    /**
     * Appended when
     * {@link #addExtension(java.lang.String, AnalyzerFactory)}
     * or
     * {@link #addPrefix(java.lang.String, AnalyzerFactory)}
     * are called to augment the value in {@link #getVersionNo()}.
     */
    private static final TreeSet<String> CUSTOMIZATION_KEYS = new TreeSet<>();

    // Hash of CUSTOMIZATION_KEYS; non-zero once any user customization exists.
    private static int customizationHashCode;

    /**
     * Descending string length comparator for magics.
     */
    private static final Comparator<String> descStrlenComparator = (s1, s2) -> {
        // DESC: s2 length <=> s1 length
        int cmp = Integer.compare(s2.length(), s1.length());
        if (cmp != 0) {
            return cmp;
        }

        // the Comparator must also be "consistent with equals", so check
        // string contents too when (length)cmp == 0. (ASC: s1 <=> s2.)
        cmp = s1.compareTo(s2);
        return cmp;
    };

    /**
     * Map from magic strings to analyzer factories.
     * Ordered longest-magic-first so the most specific magic wins.
     */
    private static final SortedMap<String, AnalyzerFactory> magics =
            new TreeMap<>(descStrlenComparator);

    /**
     * List of matcher objects which can be used to determine which analyzer
     * factory to use.
     */
    private static final List<FileAnalyzerFactory.Matcher> matchers = new ArrayList<>();

    /**
     * List of all registered {@code FileAnalyzerFactory} instances.
     */
    private static final List<AnalyzerFactory> factories = new ArrayList<>();

    /**
     * Names of all analysis packages.
     */
    private static final List<String> analysisPkgNames = new ArrayList<>();

    public static final FieldType string_ft_stored_nanalyzed_norms = new FieldType(StringField.TYPE_STORED);
    public static final FieldType string_ft_nstored_nanalyzed_norms = new FieldType(StringField.TYPE_NOT_STORED);

    // Maps file type name -> human-readable description shown in the UI.
    private static final Map<String, String> fileTypeDescriptions = new TreeMap<>();

    /**
     * Maps from {@link FileAnalyzer#getFileTypeName()} to
     * {@link FileAnalyzerFactory}.
     */
    private static final Map<String, AnalyzerFactory> FILETYPE_FACTORIES =
            new HashMap<>();

    /**
     * Maps from {@link FileAnalyzer#getFileTypeName()} to
     * {@link FileAnalyzer#getVersionNo()}.
     */
    private static final Map<String, Long> ANALYZER_VERSIONS = new HashMap<>();

    // User-configured language mappings take precedence over defaultLangMap.
    private static final LangTreeMap langMap = new LangTreeMap();
    private static final LangTreeMap defaultLangMap = new LangTreeMap();

    /*
     * If you write your own analyzer please register it here. The order is
     * important for any factory that uses a FileAnalyzerFactory.Matcher
     * implementation, as those are run in the same order as defined below --
     * though precise Matchers are run before imprecise ones.
     */
    static {
        try {
            // Registration order matters: Matchers run in this order (precise
            // before imprecise), so PlainAnalyzer must stay last.
            AnalyzerFactory[] analyzers = {
                DEFAULT_ANALYZER_FACTORY,
                new IgnorantAnalyzerFactory(),
                new BZip2AnalyzerFactory(),
                new XMLAnalyzerFactory(),
                MandocAnalyzerFactory.DEFAULT_INSTANCE,
                TroffAnalyzerFactory.DEFAULT_INSTANCE,
                new ELFAnalyzerFactory(),
                JavaClassAnalyzerFactory.DEFAULT_INSTANCE,
                new ImageAnalyzerFactory(),
                JarAnalyzerFactory.DEFAULT_INSTANCE,
                ZipAnalyzerFactory.DEFAULT_INSTANCE,
                new TarAnalyzerFactory(),
                new CAnalyzerFactory(),
                new CSharpAnalyzerFactory(),
                new VBAnalyzerFactory(),
                new CxxAnalyzerFactory(),
                new ErlangAnalyzerFactory(),
                new ShAnalyzerFactory(),
                new PowershellAnalyzerFactory(),
                new UuencodeAnalyzerFactory(),
                new GZIPAnalyzerFactory(),
                new JavaAnalyzerFactory(),
                new JavaScriptAnalyzerFactory(),
                new KotlinAnalyzerFactory(),
                new SwiftAnalyzerFactory(),
                new JsonAnalyzerFactory(),
                new PythonAnalyzerFactory(),
                new RustAnalyzerFactory(),
                new PerlAnalyzerFactory(),
                new PhpAnalyzerFactory(),
                new LispAnalyzerFactory(),
                new TclAnalyzerFactory(),
                new ScalaAnalyzerFactory(),
                new ClojureAnalyzerFactory(),
                new SQLAnalyzerFactory(),
                new PLSQLAnalyzerFactory(),
                new FortranAnalyzerFactory(),
                new HaskellAnalyzerFactory(),
                new GolangAnalyzerFactory(),
                new LuaAnalyzerFactory(),
                new PascalAnalyzerFactory(),
                new AdaAnalyzerFactory(),
                new RubyAnalyzerFactory(),
                new EiffelAnalyzerFactory(),
                new VerilogAnalyzerFactory(),
                new TypeScriptAnalyzerFactory(),
                new AsmAnalyzerFactory(),
                new HCLAnalyzerFactory(),
                new TerraformAnalyzerFactory(),
                new RAnalyzerFactory(),
                // Keep PlainAnalyzer last, with its lone, quite fuzzy matcher.
                PlainAnalyzerFactory.DEFAULT_INSTANCE
            };

            for (AnalyzerFactory analyzer : analyzers) {
                registerAnalyzer(analyzer);
            }

            // Record UI descriptions for factories that declare a name.
            for (AnalyzerFactory analyzer : analyzers) {
                if (analyzer.getName() != null && !analyzer.getName().isEmpty()) {
                    fileTypeDescriptions.put(analyzer.getAnalyzer().getFileTypeName(), analyzer.getName());
                }
            }

            string_ft_stored_nanalyzed_norms.setOmitNorms(false);
            string_ft_nstored_nanalyzed_norms.setOmitNorms(false);
        } catch (Throwable t) {
            // Re-throw after logging: a broken static init must fail class loading loudly.
            LOGGER.log(Level.SEVERE,
                    "exception hit when constructing AnalyzerGuru static", t);
            throw t;
        }
    }

    /**
     * Gets a version number to be used to tag documents examined by the guru so
     * that {@link AbstractAnalyzer} selection can be re-done later if a stored
     * version number is different from the current implementation or if guru
     * factory registrations are modified by the user to change the guru
     * operation.
     * <p>
     * The static part of the version is bumped in a release when e.g. new
     * {@link FileAnalyzerFactory} subclasses are registered or when existing
     * {@link FileAnalyzerFactory} subclasses are revised to target more or
     * different files.
     * @return a value whose lower 32-bits are a static value
     * 20201003_00
     * for the current implementation and whose higher-32 bits are non-zero if
     * {@link #addExtension(java.lang.String, AnalyzerFactory)}
     * or
     * {@link #addPrefix(java.lang.String, AnalyzerFactory)}
     * has been called.
     */
    public static long getVersionNo() {
        final int ver32 = 20201003_00; // Edit comment above too!
        long ver = ver32;
        if (customizationHashCode != 0) {
            // High 32 bits carry the customization hash when present.
            ver |= (long) customizationHashCode << 32;
        }
        return ver;
    }

    /**
     * Gets a version number according to a registered
     * {@link FileAnalyzer#getVersionNo()} for a {@code fileTypeName} according
     * to {@link FileAnalyzer#getFileTypeName()}.
     * @param fileTypeName a defined instance
     * @return a registered value or {@link Long#MIN_VALUE} if
     * {@code fileTypeName} is unknown
     */
    public static long getAnalyzerVersionNo(String fileTypeName) {
        return ANALYZER_VERSIONS.getOrDefault(fileTypeName, Long.MIN_VALUE);
    }

    /** @return unmodifiable view of all registered analyzer version numbers. */
    public static Map<String, Long> getAnalyzersVersionNos() {
        return Collections.unmodifiableMap(ANALYZER_VERSIONS);
    }

    /** @return unmodifiable view of the extension-to-factory map. */
    public static Map<String, AnalyzerFactory> getExtensionsMap() {
        return Collections.unmodifiableMap(ext);
    }

    /** @return unmodifiable view of the prefix-to-factory map. */
    public static Map<String, AnalyzerFactory> getPrefixesMap() {
        return Collections.unmodifiableMap(pre);
    }

    /** @return unmodifiable view of the magic-string-to-factory map. */
    public static Map<String, AnalyzerFactory> getMagicsMap() {
        return Collections.unmodifiableMap(magics);
    }

    /** @return unmodifiable view of the registered content matchers. */
    public static List<Matcher> getAnalyzerFactoryMatchers() {
        return Collections.unmodifiableList(matchers);
    }

    /** @return unmodifiable view of the file type description map. */
    public static Map<String, String> getfileTypeDescriptions() {
        return Collections.unmodifiableMap(fileTypeDescriptions);
    }

    /** @return unmodifiable view of all registered analyzer factories. */
    public static List<AnalyzerFactory> getAnalyzerFactories() {
        return Collections.unmodifiableList(factories);
    }

    private static final String USED_IN_MULTIPLE_MSG = "' used in multiple analyzers";
401 */ registerAnalyzer(AnalyzerFactory factory)402 private static void registerAnalyzer(AnalyzerFactory factory) { 403 for (String name : factory.getFileNames()) { 404 AnalyzerFactory old = FILE_NAMES.put(name, factory); 405 assert old == null : 406 "name '" + name + USED_IN_MULTIPLE_MSG; 407 } 408 for (String prefix : factory.getPrefixes()) { 409 AnalyzerFactory old = pre.put(prefix, factory); 410 assert old == null : 411 "prefix '" + prefix + USED_IN_MULTIPLE_MSG; 412 } 413 for (String suffix : factory.getSuffixes()) { 414 AnalyzerFactory old = ext.put(suffix, factory); 415 assert old == null : 416 "suffix '" + suffix + USED_IN_MULTIPLE_MSG; 417 } 418 for (String magic : factory.getMagicStrings()) { 419 AnalyzerFactory old = magics.put(magic, factory); 420 assert old == null : 421 "magic '" + magic + USED_IN_MULTIPLE_MSG; 422 } 423 matchers.addAll(factory.getMatchers()); 424 factories.add(factory); 425 426 AbstractAnalyzer fa = factory.getAnalyzer(); 427 String fileTypeName = fa.getFileTypeName(); 428 FILETYPE_FACTORIES.put(fileTypeName, factory); 429 ANALYZER_VERSIONS.put(fileTypeName, fa.getVersionNo()); 430 431 // Possibly configure default LANG mappings for the factory. 432 String ctagsLang = factory.getAnalyzer().getCtagsLang(); 433 if (ctagsLang != null) { 434 List<String> prefixes = factory.getPrefixes(); 435 if (prefixes != null) { 436 for (String prefix : prefixes) { 437 defaultLangMap.add(prefix, ctagsLang); 438 } 439 } 440 441 List<String> suffixes = factory.getSuffixes(); 442 if (suffixes != null) { 443 for (String suffix : suffixes) { 444 // LangMap needs a "." to signify a file extension. 445 defaultLangMap.add("." + suffix, ctagsLang); 446 } 447 } 448 } 449 } 450 451 /** 452 * Instruct the AnalyzerGuru to use a given analyzer for a given file 453 * prefix. 
454 * 455 * @param prefix the file prefix to add 456 * @param factory a factory which creates the analyzer to use for the given 457 * extension (if you pass null as the analyzer, you will disable the 458 * analyzer used for that extension) 459 */ addPrefix(String prefix, AnalyzerFactory factory)460 public static void addPrefix(String prefix, AnalyzerFactory factory) { 461 AnalyzerFactory oldFactory; 462 if (factory == null) { 463 oldFactory = pre.remove(prefix); 464 langMap.exclude(prefix); 465 } else { 466 oldFactory = pre.put(prefix, factory); 467 langMap.add(prefix, factory.getAnalyzer().getCtagsLang()); 468 } 469 470 if (factoriesDifferent(factory, oldFactory)) { 471 addCustomizationKey("p:" + prefix); 472 } 473 } 474 475 /** 476 * Instruct the AnalyzerGuru to use a given analyzer for a given file 477 * extension. 478 * 479 * @param extension the file-extension to add 480 * @param factory a factory which creates the analyzer to use for the given 481 * extension (if you pass null as the analyzer, you will disable the 482 * analyzer used for that extension) 483 * @throws IllegalArgumentException if {@code extension} contains a period 484 */ addExtension(String extension, AnalyzerFactory factory)485 public static void addExtension(String extension, AnalyzerFactory factory) { 486 if (extension.contains(".")) { 487 throw new IllegalArgumentException("extension contains a '.'"); 488 } 489 490 // LangMap fileSpec requires a leading period to indicate an extension. 491 String langMapExtension = "." 
+ extension; 492 493 AnalyzerFactory oldFactory; 494 if (factory == null) { 495 oldFactory = ext.remove(extension); 496 langMap.exclude(langMapExtension); 497 } else { 498 oldFactory = ext.put(extension, factory); 499 langMap.add(langMapExtension, factory.getAnalyzer().getCtagsLang()); 500 } 501 502 if (factoriesDifferent(factory, oldFactory)) { 503 addCustomizationKey("e:" + extension); 504 } 505 } 506 507 /** 508 * Gets an unmodifiable view of the language mappings resulting from 509 * {@link #addExtension(String, AnalyzerFactory)} and 510 * {@link #addPrefix(String, AnalyzerFactory)} merged with default language 511 * mappings of OpenGrok's analyzers. 512 */ getLangMap()513 public static LangMap getLangMap() { 514 return langMap.mergeSecondary(defaultLangMap).unmodifiable(); 515 } 516 517 /** 518 * Get the default Analyzer. 519 * 520 * @return default FileAnalyzer 521 */ getAnalyzer()522 public static AbstractAnalyzer getAnalyzer() { 523 return DEFAULT_ANALYZER_FACTORY.getAnalyzer(); 524 } 525 526 /** 527 * Gets an analyzer for the specified {@code fileTypeName} if it accords 528 * with a known {@link FileAnalyzer#getFileTypeName()}. 529 * @param fileTypeName a defined name 530 * @return a defined instance if known or otherwise {@code null} 531 */ getAnalyzer(String fileTypeName)532 public static AbstractAnalyzer getAnalyzer(String fileTypeName) { 533 AnalyzerFactory factory = FILETYPE_FACTORIES.get(fileTypeName); 534 return factory == null ? null : factory.getAnalyzer(); 535 } 536 537 /** 538 * Get an analyzer suited to analyze a file. This function will reuse 539 * analyzers since they are costly. 540 * 541 * @param in Input stream containing data to be analyzed 542 * @param file Name of the file to be analyzed 543 * @return An analyzer suited for that file content 544 * @throws java.io.IOException If an error occurs while accessing the data 545 * in the input stream. 
546 */ getAnalyzer(InputStream in, String file)547 public static AbstractAnalyzer getAnalyzer(InputStream in, String file) throws IOException { 548 AnalyzerFactory factory = find(in, file); 549 if (factory == null) { 550 AbstractAnalyzer defaultAnalyzer = getAnalyzer(); 551 if (LOGGER.isLoggable(Level.FINEST)) { 552 LOGGER.log(Level.FINEST, "{0}: fallback {1}", 553 new Object[]{file, 554 defaultAnalyzer.getClass().getSimpleName() }); 555 } 556 return defaultAnalyzer; 557 } 558 return factory.getAnalyzer(); 559 } 560 561 /** 562 * Free resources associated with all registered analyzers. 563 */ returnAnalyzers()564 public static void returnAnalyzers() { 565 for (AnalyzerFactory analyzer : factories) { 566 analyzer.returnAnalyzer(); 567 } 568 } 569 570 /** 571 * Populate a Lucene document with the required fields. 572 * 573 * @param doc The document to populate 574 * @param file The file to index 575 * @param path Where the file is located (from source root) 576 * @param fa The analyzer to use on the file 577 * @param xrefOut Where to write the xref (possibly {@code null}) 578 * @throws IOException If an exception occurs while collecting the data 579 * @throws InterruptedException if a timeout occurs 580 */ populateDocument(Document doc, File file, String path, AbstractAnalyzer fa, Writer xrefOut)581 public void populateDocument(Document doc, File file, String path, AbstractAnalyzer fa, Writer xrefOut) 582 throws IOException, InterruptedException { 583 584 String date = DateTools.timeToString(file.lastModified(), 585 DateTools.Resolution.MILLISECOND); 586 path = Util.fixPathIfWindows(path); 587 doc.add(new Field(QueryBuilder.U, Util.path2uid(path, date), 588 string_ft_stored_nanalyzed_norms)); 589 doc.add(new Field(QueryBuilder.FULLPATH, file.getAbsolutePath(), 590 string_ft_nstored_nanalyzed_norms)); 591 doc.add(new SortedDocValuesField(QueryBuilder.FULLPATH, 592 new BytesRef(file.getAbsolutePath()))); 593 594 if (RuntimeEnvironment.getInstance().isHistoryEnabled()) 
{ 595 try { 596 HistoryGuru histGuru = HistoryGuru.getInstance(); 597 HistoryReader hr = histGuru.getHistoryReader(file); 598 if (hr != null) { 599 doc.add(new TextField(QueryBuilder.HIST, hr)); 600 History history; 601 if ((history = histGuru.getHistory(file)) != null) { 602 List<HistoryEntry> historyEntries = history.getHistoryEntries(1, 0); 603 if (!historyEntries.isEmpty()) { 604 HistoryEntry histEntry = historyEntries.get(0); 605 doc.add(new TextField(QueryBuilder.LASTREV, histEntry.getRevision(), Store.YES)); 606 } 607 } 608 } 609 } catch (HistoryException e) { 610 LOGGER.log(Level.WARNING, "An error occurred while reading history: ", e); 611 } 612 } 613 doc.add(new Field(QueryBuilder.DATE, date, string_ft_stored_nanalyzed_norms)); 614 doc.add(new SortedDocValuesField(QueryBuilder.DATE, new BytesRef(date))); 615 616 // `path' is not null, as it was passed to Util.path2uid() above. 617 doc.add(new TextField(QueryBuilder.PATH, path, Store.YES)); 618 Project project = Project.getProject(path); 619 if (project != null) { 620 doc.add(new TextField(QueryBuilder.PROJECT, project.getPath(), Store.YES)); 621 } 622 623 /* 624 * Use the parent of the path -- not the absolute file as is done for 625 * FULLPATH -- so that DIRPATH is the same convention as for PATH 626 * above. A StringField, however, is used instead of a TextField. 
627 */ 628 File fpath = new File(path); 629 String fileParent = fpath.getParent(); 630 if (fileParent != null && fileParent.length() > 0) { 631 String normalizedPath = QueryBuilder.normalizeDirPath(fileParent); 632 StringField npstring = new StringField(QueryBuilder.DIRPATH, 633 normalizedPath, Store.NO); 634 doc.add(npstring); 635 } 636 637 if (fa != null) { 638 AbstractAnalyzer.Genre g = fa.getGenre(); 639 if (g == AbstractAnalyzer.Genre.PLAIN || g == AbstractAnalyzer.Genre.XREFABLE || g == AbstractAnalyzer.Genre.HTML) { 640 doc.add(new Field(QueryBuilder.T, g.typeName(), string_ft_stored_nanalyzed_norms)); 641 } 642 fa.analyze(doc, StreamSource.fromFile(file), xrefOut); 643 644 String type = fa.getFileTypeName(); 645 doc.add(new StringField(QueryBuilder.TYPE, type, Store.YES)); 646 } 647 } 648 649 /** 650 * Write a browse-able version of the file. 651 * 652 * @param factory The analyzer factory for this file type 653 * @param in The input stream containing the data 654 * @param out Where to write the result 655 * @param defs definitions for the source file, if available 656 * @param annotation Annotation information for the file 657 * @param project Project the file belongs to 658 * @throws java.io.IOException If an error occurs while creating the output 659 */ writeXref(AnalyzerFactory factory, Reader in, Writer out, Definitions defs, Annotation annotation, Project project)660 public static void writeXref(AnalyzerFactory factory, Reader in, 661 Writer out, Definitions defs, 662 Annotation annotation, Project project) 663 throws IOException { 664 Reader input = in; 665 if (factory.getGenre() == AbstractAnalyzer.Genre.PLAIN) { 666 // This is some kind of text file, so we need to expand tabs to 667 // spaces to match the project's tab settings. 
668 input = ExpandTabsReader.wrap(in, project); 669 } 670 671 WriteXrefArgs args = new WriteXrefArgs(input, out); 672 args.setDefs(defs); 673 args.setAnnotation(annotation); 674 args.setProject(project); 675 676 AbstractAnalyzer analyzer = factory.getAnalyzer(); 677 RuntimeEnvironment env = RuntimeEnvironment.getInstance(); 678 analyzer.setScopesEnabled(env.isScopesEnabled()); 679 analyzer.setFoldingEnabled(env.isFoldingEnabled()); 680 analyzer.writeXref(args); 681 } 682 683 /** 684 * Writes a browse-able version of the file transformed for immediate 685 * serving to a web client. 686 * @param contextPath the web context path for 687 * {@link Util#dumpXref(java.io.Writer, java.io.Reader, java.lang.String)} 688 * @param factory the analyzer factory for this file type 689 * @param in the input stream containing the data 690 * @param out a defined instance to write 691 * @param defs definitions for the source file, if available 692 * @param annotation annotation information for the file 693 * @param project project the file belongs to 694 * @throws java.io.IOException if an error occurs while creating the output 695 */ writeDumpedXref(String contextPath, AnalyzerFactory factory, Reader in, Writer out, Definitions defs, Annotation annotation, Project project)696 public static void writeDumpedXref(String contextPath, 697 AnalyzerFactory factory, Reader in, Writer out, 698 Definitions defs, Annotation annotation, Project project) 699 throws IOException { 700 701 File xrefTemp = File.createTempFile("ogxref", ".html"); 702 try { 703 try (FileWriter tmpout = new FileWriter(xrefTemp)) { 704 writeXref(factory, in, tmpout, defs, annotation, project); 705 } 706 Util.dumpXref(out, xrefTemp, false, contextPath); 707 } finally { 708 Files.delete(xrefTemp.toPath()); 709 } 710 } 711 712 /** 713 * Get the genre of a file. 
714 * 715 * @param file The file to inspect 716 * @return The genre suitable to decide how to display the file 717 */ getGenre(String file)718 public static AbstractAnalyzer.Genre getGenre(String file) { 719 return getGenre(find(file)); 720 } 721 722 /** 723 * Get the genre of a bulk of data. 724 * 725 * @param in A stream containing the data 726 * @return The genre suitable to decide how to display the file 727 * @throws java.io.IOException If an error occurs while getting the content 728 */ getGenre(InputStream in)729 public static AbstractAnalyzer.Genre getGenre(InputStream in) throws IOException { 730 return getGenre(find(in)); 731 } 732 733 /** 734 * Get the genre for a named class (this is most likely an analyzer). 735 * 736 * @param factory the analyzer factory to get the genre for 737 * @return The genre of this class (null if not found) 738 */ getGenre(AnalyzerFactory factory)739 public static AbstractAnalyzer.Genre getGenre(AnalyzerFactory factory) { 740 if (factory != null) { 741 return factory.getGenre(); 742 } 743 return null; 744 } 745 746 /** 747 * Finds a {@code FileAnalyzerFactory} for the specified 748 * {@link FileAnalyzer#getFileTypeName()}. 749 * @param fileTypeName a defined instance 750 * @return a defined instance or {@code null} 751 */ findByFileTypeName(String fileTypeName)752 public static AnalyzerFactory findByFileTypeName(String fileTypeName) { 753 return FILETYPE_FACTORIES.get(fileTypeName); 754 } 755 756 /** 757 * Find a {@code FileAnalyzerFactory} with the specified class name. If one 758 * doesn't exist, create one and register it. Allow specification of either 759 * the complete class name (which includes the package name) or the simple 760 * name of the class. 
761 * 762 * @param factoryClassName name of the factory class 763 * @return a file analyzer factory 764 * 765 * @throws ClassNotFoundException if there is no class with that name 766 * @throws ClassCastException if the class is not a subclass of {@code 767 * FileAnalyzerFactory} 768 * @throws IllegalAccessException if the constructor cannot be accessed 769 * @throws InstantiationException if the class cannot be instantiated 770 * @throws NoSuchMethodException if no-argument constructor could not be found 771 * @throws InvocationTargetException if the underlying constructor throws an exception 772 */ findFactory(String factoryClassName)773 public static AnalyzerFactory findFactory(String factoryClassName) 774 throws ClassNotFoundException, IllegalAccessException, InstantiationException, NoSuchMethodException, 775 InvocationTargetException { 776 Class<?> fcn; 777 try { 778 fcn = Class.forName(factoryClassName); 779 780 } catch (ClassNotFoundException e) { 781 fcn = getFactoryClass(factoryClassName); 782 783 if (fcn == null) { 784 throw new ClassNotFoundException("Unable to locate class " + factoryClassName); 785 } 786 } 787 788 return findFactory(fcn); 789 } 790 791 /** 792 * Get Analyzer factory class using class simple name. 793 * 794 * @param simpleName which may be either the factory class 795 * simple name (eg. CAnalyzerFactory), the analyzer name 796 * (eg. CAnalyzer), or the language name (eg. C) 797 * 798 * @return the analyzer factory class, or null when not found. 
799 */ getFactoryClass(String simpleName)800 public static Class<?> getFactoryClass(String simpleName) { 801 Class<?> factoryClass = null; 802 803 // Build analysis package name list first time only 804 if (analysisPkgNames.isEmpty()) { 805 Package[] p = Package.getPackages(); 806 for (Package pp : p) { 807 String pname = pp.getName(); 808 if (pname.contains(".analysis.")) { 809 analysisPkgNames.add(pname); 810 } 811 } 812 } 813 814 // This allows user to enter the language or analyzer name 815 // (eg. C or CAnalyzer vs. CAnalyzerFactory) 816 // Note that this assumes a regular naming scheme of 817 // all language parsers: 818 // <language>Analyzer, <language>AnalyzerFactory 819 820 if (!simpleName.contains("Analyzer")) { 821 simpleName += "Analyzer"; 822 } 823 824 if (!simpleName.contains("Factory")) { 825 simpleName += "Factory"; 826 } 827 828 for (String aPackage : analysisPkgNames) { 829 try { 830 String fqn = aPackage + "." + simpleName; 831 factoryClass = Class.forName(fqn); 832 break; 833 } catch (ClassNotFoundException e) { 834 // Ignore 835 } 836 } 837 838 return factoryClass; 839 } 840 841 /** 842 * Find a {@code FileAnalyzerFactory} which is an instance of the specified 843 * class. If one doesn't exist, create one and register it. 
844 * 845 * @param factoryClass the factory class 846 * @return a file analyzer factory 847 * 848 * @throws ClassCastException if the class is not a subclass of {@code 849 * FileAnalyzerFactory} 850 * @throws IllegalAccessException if the constructor cannot be accessed 851 * @throws InstantiationException if the class cannot be instantiated 852 * @throws NoSuchMethodException if no-argument constructor could not be found 853 * @throws InvocationTargetException if the underlying constructor throws an exception 854 */ findFactory(Class<?> factoryClass)855 private static AnalyzerFactory findFactory(Class<?> factoryClass) 856 throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException { 857 for (AnalyzerFactory f : factories) { 858 if (f.getClass() == factoryClass) { 859 return f; 860 } 861 } 862 AnalyzerFactory f = (AnalyzerFactory) factoryClass.getDeclaredConstructor().newInstance(); 863 registerAnalyzer(f); 864 return f; 865 } 866 867 /** 868 * Finds a suitable analyser class for file name. If the analyzer cannot be 869 * determined by the file extension, try to look at the data in the 870 * InputStream to find a suitable analyzer. 871 * 872 * Use if you just want to find file type. 873 * 874 * @param in The input stream containing the data 875 * @param file The file name to get the analyzer for 876 * @return the analyzer factory to use 877 * @throws java.io.IOException If a problem occurred while reading the data 878 */ find(InputStream in, String file)879 public static AnalyzerFactory find(InputStream in, String file) throws IOException { 880 AnalyzerFactory factory = find(file); 881 882 if (factory != null) { 883 return factory; 884 } 885 return findForStream(in, file); 886 } 887 888 /** 889 * Finds a suitable analyser class for file name. 
890 * 891 * @param file The file name to get the analyzer for 892 * @return the analyzer factory to use 893 */ find(String file)894 public static AnalyzerFactory find(String file) { 895 String path = file; 896 int i; 897 898 // Get basename of the file first. 899 if (((i = path.lastIndexOf(File.separatorChar)) > 0) 900 && (i + 1 < path.length())) { 901 path = path.substring(i + 1); 902 } 903 904 int dotpos = path.lastIndexOf('.'); 905 if (dotpos >= 0) { 906 AnalyzerFactory factory; 907 908 // Try matching the prefix. 909 if (dotpos > 0) { 910 factory = pre.get(path.substring(0, dotpos).toUpperCase(Locale.ROOT)); 911 if (factory != null) { 912 if (LOGGER.isLoggable(Level.FINEST)) { 913 LOGGER.log(Level.FINEST, "{0}: chosen by prefix: {1}", 914 new Object[]{file, 915 factory.getClass().getSimpleName() }); 916 } 917 return factory; 918 } 919 } 920 921 // Now try matching the suffix. We kind of consider this order (first 922 // prefix then suffix) to be workable although for sure there can be 923 // cases when this does not work. 924 factory = ext.get(path.substring(dotpos + 1).toUpperCase(Locale.ROOT)); 925 if (factory != null) { 926 if (LOGGER.isLoggable(Level.FINEST)) { 927 LOGGER.log(Level.FINEST, "{0}: chosen by suffix: {1}", 928 new Object[]{file, 929 factory.getClass().getSimpleName() }); 930 } 931 return factory; 932 } 933 } 934 935 // file doesn't have any of the prefix or extensions we know, try full match 936 return FILE_NAMES.get(path.toUpperCase(Locale.ROOT)); 937 } 938 939 /** 940 * Finds a suitable analyzer class for the data in this stream. 
941 * 942 * @param in The stream containing the data to analyze 943 * @return the analyzer factory to use 944 * @throws java.io.IOException if an error occurs while reading data from 945 * the stream 946 */ find(InputStream in)947 public static AnalyzerFactory find(InputStream in) throws IOException { 948 return findForStream(in, "<anonymous>"); 949 } 950 951 /** 952 * Finds a suitable analyzer class for the data in this stream 953 * corresponding to a file of the specified name. 954 * 955 * @param in The stream containing the data to analyze 956 * @param file The file name to get the analyzer for 957 * @return the analyzer factory to use 958 * @throws java.io.IOException if an error occurs while reading data from 959 * the stream 960 */ findForStream(InputStream in, String file)961 private static AnalyzerFactory findForStream(InputStream in, 962 String file) throws IOException { 963 964 in.mark(MAGIC_BYTES_NUM); 965 byte[] content = new byte[MAGIC_BYTES_NUM]; 966 int len = in.read(content); 967 in.reset(); 968 969 if (len < MAGIC_BYTES_NUM) { 970 /* 971 * Need at least 4 bytes to perform magic string matching. 
972 */ 973 if (len < 4) { 974 return null; 975 } 976 content = Arrays.copyOf(content, len); 977 } 978 979 AnalyzerFactory fac; 980 981 // First, do precise-magic Matcher matching 982 for (FileAnalyzerFactory.Matcher matcher : matchers) { 983 if (matcher.isPreciseMagic()) { 984 fac = matcher.isMagic(content, in); 985 if (fac != null) { 986 if (LOGGER.isLoggable(Level.FINEST)) { 987 LOGGER.log(Level.FINEST, 988 "{0}: chosen by precise magic: {1}", new Object[]{ 989 file, fac.getClass().getSimpleName() }); 990 } 991 return fac; 992 } 993 } 994 } 995 996 // Next, look for magic strings 997 String opening = readOpening(in, content); 998 fac = findMagicString(opening, file); 999 if (fac != null) { 1000 return fac; 1001 } 1002 1003 // Last, do imprecise-magic Matcher matching 1004 for (FileAnalyzerFactory.Matcher matcher : matchers) { 1005 if (!matcher.isPreciseMagic()) { 1006 fac = matcher.isMagic(content, in); 1007 if (fac != null) { 1008 if (LOGGER.isLoggable(Level.FINEST)) { 1009 LOGGER.log(Level.FINEST, 1010 "{0}: chosen by imprecise magic: {1}", 1011 new Object[]{file, 1012 fac.getClass().getSimpleName() }); 1013 } 1014 return fac; 1015 } 1016 } 1017 } 1018 1019 return null; 1020 } 1021 findMagicString(String opening, String file)1022 private static AnalyzerFactory findMagicString(String opening, String file) { 1023 1024 // first, try to look up two words in magics 1025 String fragment = getWords(opening, 2); 1026 AnalyzerFactory fac = magics.get(fragment); 1027 if (fac != null) { 1028 if (LOGGER.isLoggable(Level.FINEST)) { 1029 LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}", 1030 new Object[]{file, fac.getClass().getSimpleName(), 1031 fragment}); 1032 } 1033 return fac; 1034 } 1035 1036 // second, try to look up one word in magics 1037 fragment = getWords(opening, 1); 1038 fac = magics.get(fragment); 1039 if (fac != null) { 1040 if (LOGGER.isLoggable(Level.FINEST)) { 1041 LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}", 1042 new Object[]{file, 
fac.getClass().getSimpleName(), 1043 fragment}); 1044 } 1045 return fac; 1046 } 1047 1048 // try to match initial substrings (DESC strlen) 1049 for (Map.Entry<String, AnalyzerFactory> entry : 1050 magics.entrySet()) { 1051 String magic = entry.getKey(); 1052 if (opening.startsWith(magic)) { 1053 fac = entry.getValue(); 1054 if (LOGGER.isLoggable(Level.FINEST)) { 1055 LOGGER.log(Level.FINEST, 1056 "{0}: chosen by magic(substr) {2}: {1}", new Object[]{ 1057 file, fac.getClass().getSimpleName(), magic}); 1058 } 1059 return fac; 1060 } 1061 } 1062 1063 return null; 1064 } 1065 1066 /** 1067 * Extract initial words from a String, or take the entire 1068 * <code>value</code> if not enough words can be identified. (If 1069 * <code>n</code> is not 1 or more, returns an empty String.) (A "word" 1070 * ends at each and every space character.) 1071 * 1072 * @param value The source from which words are cut 1073 * @param n The number of words to try to extract 1074 * @return The extracted words or <code>""</code> 1075 */ getWords(String value, int n)1076 private static String getWords(String value, int n) { 1077 if (n < 1) { 1078 return ""; 1079 } 1080 int l = 0; 1081 while (n-- > 0) { 1082 int o = l > 0 ? l + 1 : l; 1083 int i = value.indexOf(' ', o); 1084 if (i == -1) { 1085 return value; 1086 } 1087 l = i; 1088 } 1089 return value.substring(0, l); 1090 } 1091 1092 /** 1093 * Extract an opening string from the input stream, past any BOM, and past 1094 * any initial whitespace, but only up to <code>OPENING_MAX_CHARS</code> or 1095 * to the first <code>\n</code> after any non-whitespace. (Hashbang, #!, 1096 * openings will have superfluous space removed.) 
1097 * 1098 * @param in The input stream containing the data 1099 * @param sig The initial sequence of bytes in the input stream 1100 * @return The extracted string or <code>""</code> 1101 * @throws java.io.IOException in case of any read error 1102 */ readOpening(InputStream in, byte[] sig)1103 private static String readOpening(InputStream in, byte[] sig) 1104 throws IOException { 1105 1106 in.mark(MARK_READ_LIMIT); 1107 1108 String encoding = IOUtils.findBOMEncoding(sig); 1109 if (encoding == null) { 1110 // SRCROOT is read with UTF-8 as a default. 1111 encoding = StandardCharsets.UTF_8.name(); 1112 } else { 1113 int skipForBOM = IOUtils.skipForBOM(sig); 1114 if (in.skip(skipForBOM) < skipForBOM) { 1115 in.reset(); 1116 return ""; 1117 } 1118 } 1119 1120 int nRead = 0; 1121 boolean sawNonWhitespace = false; 1122 boolean lastWhitespace = false; 1123 boolean postHashbang = false; 1124 int r; 1125 1126 StringBuilder opening = new StringBuilder(); 1127 BufferedReader readr = new BufferedReader(new InputStreamReader(in, encoding), OPENING_MAX_CHARS); 1128 while ((r = readr.read()) != -1) { 1129 if (++nRead > OPENING_MAX_CHARS) { 1130 break; 1131 } 1132 char c = (char) r; 1133 boolean isWhitespace = Character.isWhitespace(c); 1134 if (!sawNonWhitespace) { 1135 if (isWhitespace) { 1136 continue; 1137 } 1138 sawNonWhitespace = true; 1139 } 1140 if (c == '\n') { 1141 break; 1142 } 1143 1144 if (isWhitespace) { 1145 // Track `lastWhitespace' to condense stretches of whitespace, 1146 // and use ' ' regardless of actual whitespace character to 1147 // accord with magic string definitions. 1148 if (!lastWhitespace && !postHashbang) { 1149 opening.append(' '); 1150 } 1151 } else { 1152 opening.append(c); 1153 postHashbang = false; 1154 } 1155 lastWhitespace = isWhitespace; 1156 1157 // If the opening starts with "#!", then track so that any 1158 // trailing whitespace after the hashbang is ignored. 
1159 if (opening.length() == 2 && opening.charAt(0) == '#' && opening.charAt(1) == '!') { 1160 postHashbang = true; 1161 } 1162 } 1163 1164 in.reset(); 1165 return opening.toString(); 1166 } 1167 addCustomizationKey(String k)1168 private static void addCustomizationKey(String k) { 1169 CUSTOMIZATION_KEYS.add(k); 1170 Object[] keys = CUSTOMIZATION_KEYS.toArray(); 1171 customizationHashCode = Objects.hash(keys); 1172 } 1173 factoriesDifferent(AnalyzerFactory a, AnalyzerFactory b)1174 private static boolean factoriesDifferent(AnalyzerFactory a, AnalyzerFactory b) { 1175 String aName = null; 1176 if (a != null) { 1177 aName = a.getName(); 1178 if (aName == null) { 1179 aName = a.getClass().getSimpleName(); 1180 } 1181 } 1182 String bName = null; 1183 if (b != null) { 1184 bName = b.getName(); 1185 if (bName == null) { 1186 bName = b.getClass().getSimpleName(); 1187 } 1188 } 1189 if (aName == null && bName == null) { 1190 return false; 1191 } 1192 return aName == null || !aName.equals(bName); 1193 } 1194 } 1195