1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * See LICENSE.txt included in this distribution for the specific 9 * language governing permissions and limitations under the License. 10 * 11 * When distributing Covered Code, include this CDDL HEADER in each 12 * file and include the License file at LICENSE.txt. 13 * If applicable, add the following below this CDDL HEADER, with the 14 * fields enclosed by brackets "[]" replaced with your own identifying 15 * information: Portions Copyright [yyyy] [name of copyright owner] 16 * 17 * CDDL HEADER END 18 */ 19 20 /* 21 * Copyright (c) 2005, 2019, Oracle and/or its affiliates. All rights reserved. Use is subject to license terms. 22 * Portions Copyright (c) 2017, 2020, Chris Fraire <cfraire@me.com>. 23 */ 24 package org.opengrok.indexer.analysis; 25 26 import java.io.IOException; 27 import java.io.Writer; 28 import java.util.Locale; 29 import java.util.function.Supplier; 30 import java.util.logging.Level; 31 import java.util.logging.Logger; 32 import org.apache.lucene.analysis.Analyzer; 33 import org.apache.lucene.analysis.LowerCaseFilter; 34 import org.apache.lucene.analysis.TokenStream; 35 import org.apache.lucene.document.Document; 36 import org.apache.lucene.document.StoredField; 37 import org.opengrok.indexer.analysis.plain.PlainFullTokenizer; 38 import org.opengrok.indexer.analysis.plain.PlainSymbolTokenizer; 39 import org.opengrok.indexer.logger.LoggerFactory; 40 import org.opengrok.indexer.search.QueryBuilder; 41 42 /** 43 * Base class for all different File Analyzers. 44 * 45 * An Analyzer for a filetype provides 46 * <ol> 47 * <li>the file extensions and magic numbers it analyzes</li> 48 * <li>a lucene document listing the fields it can support</li> 49 * <li>TokenStreams for each of the field it said requires tokenizing in 2</li> 50 * <li>cross reference in HTML format</li> 51 * <li>The type of file data, plain text etc</li> 52 * </ol> 53 * 54 * Created on September 21, 2005 55 * 56 * @author Chandan 57 */ 58 public class FileAnalyzer extends AbstractAnalyzer { 59 60 private static final Logger LOGGER = LoggerFactory.getLogger(FileAnalyzer.class); 61 62 /** 63 * @return {@code null} as there is no aligned language 64 */ 65 @Override getCtagsLang()66 public String getCtagsLang() { 67 return null; 68 } 69 70 /** 71 * Gets a version number to be used to tag processed documents so that 72 * re-analysis can be re-done later if a stored version number is different 73 * from the current implementation. 74 * <p> 75 * The value is the union of a {@link FileAnalyzer} root version and the 76 * value from {@link #getSpecializedVersionNo()}. Changing the root version 77 * affects all analyzers simultaneously; while subclasses can override 78 * {@link #getSpecializedVersionNo()} to allow changes that affect a few. 79 * @return (20061115_01 << 32) | {@link #getSpecializedVersionNo()} 80 */ 81 @Override getVersionNo()82 public final long getVersionNo() { 83 final int rootVersionNo = 20061115_01; // Edit comment above too! 84 return ((long) rootVersionNo << 32) | getSpecializedVersionNo(); 85 } 86 87 @Override supportsScopes()88 protected boolean supportsScopes() { 89 return false; 90 } 91 92 /** 93 * Creates a new instance of FileAnalyzer. 94 * 95 * @param factory defined instance for the analyzer 96 */ FileAnalyzer(AnalyzerFactory factory)97 public FileAnalyzer(AnalyzerFactory factory) { 98 super(Analyzer.PER_FIELD_REUSE_STRATEGY); 99 if (factory == null) { 100 throw new IllegalArgumentException("`factory' is null"); 101 } 102 this.factory = factory; 103 this.symbolTokenizerFactory = this::createPlainSymbolTokenizer; 104 } 105 106 /** 107 * Creates a new instance of {@link FileAnalyzer}. 108 * 109 * @param factory defined instance for the analyzer 110 * @param symbolTokenizerFactory a defined instance relevant for the file 111 */ FileAnalyzer(AnalyzerFactory factory, Supplier<JFlexTokenizer> symbolTokenizerFactory)112 protected FileAnalyzer(AnalyzerFactory factory, 113 Supplier<JFlexTokenizer> symbolTokenizerFactory) { 114 115 super(Analyzer.PER_FIELD_REUSE_STRATEGY); 116 if (factory == null) { 117 throw new IllegalArgumentException("`factory' is null"); 118 } 119 if (symbolTokenizerFactory == null) { 120 throw new IllegalArgumentException("symbolTokenizerFactory is null"); 121 } 122 this.factory = factory; 123 this.symbolTokenizerFactory = symbolTokenizerFactory; 124 } 125 126 /** 127 * Returns the normalized name of the analyzer, which should corresponds to 128 * the file type. Example: The analyzer for the C language (CAnalyzer) would 129 * return “c”. 130 * 131 * @return Normalized name of the analyzer. 132 */ 133 @Override getFileTypeName()134 public String getFileTypeName() { 135 String name = this.getClass().getSimpleName().toLowerCase(Locale.ROOT); 136 String suffix = "analyzer"; 137 138 if (name.endsWith(suffix)) { 139 return name.substring(0, name.length() - suffix.length()); 140 } 141 142 return name; 143 } 144 145 /** 146 * Analyze the contents of a source file. This includes populating the 147 * Lucene document with fields to add to the index, and writing the 148 * cross-referenced data to the specified destination. 149 * 150 * @param doc the Lucene document 151 * @param src the input data source 152 * @param xrefOut where to write the xref (may be {@code null}) 153 * @throws IOException if any I/O error 154 * @throws InterruptedException if a timeout occurs 155 */ 156 @Override analyze(Document doc, StreamSource src, Writer xrefOut)157 public void analyze(Document doc, StreamSource src, Writer xrefOut) 158 throws IOException, InterruptedException { 159 // not used 160 } 161 162 /** 163 * Derived classes should override to write a cross referenced HTML file for 164 * the specified args. 165 * 166 * @param args a defined instance 167 * @return the instance used to write the cross-referencing 168 * @throws java.io.IOException if an error occurs 169 */ 170 @Override writeXref(WriteXrefArgs args)171 public Xrefer writeXref(WriteXrefArgs args) throws IOException { 172 throw new UnsupportedOperationException( 173 "Base FileAnalyzer cannot write xref"); 174 } 175 176 @Override createComponents(String fieldName)177 protected TokenStreamComponents createComponents(String fieldName) { 178 switch (fieldName) { 179 case QueryBuilder.FULL: 180 return new TokenStreamComponents(createPlainFullTokenizer()); 181 case QueryBuilder.PATH: 182 case QueryBuilder.PROJECT: 183 return new TokenStreamComponents(new PathTokenizer()); 184 case QueryBuilder.HIST: 185 try (HistoryAnalyzer historyAnalyzer = new HistoryAnalyzer()) { 186 return historyAnalyzer.createComponents(fieldName); 187 } 188 //below is set by PlainAnalyzer to workaround #1376 symbols search works like full text search 189 case QueryBuilder.REFS: { 190 return new TokenStreamComponents(symbolTokenizerFactory.get()); 191 } 192 case QueryBuilder.DEFS: 193 return new TokenStreamComponents(createPlainSymbolTokenizer()); 194 case QueryBuilder.LASTREV: 195 return new TokenStreamComponents(createPlainFullTokenizer()); 196 default: 197 LOGGER.log( 198 Level.WARNING, "Have no analyzer for: {0}", fieldName); 199 return null; 200 } 201 } 202 203 /** 204 * Add fields to store document number-of-lines and lines-of-code (LOC). 205 */ 206 @Override addNumLinesLOC(Document doc, NumLinesLOC counts)207 protected void addNumLinesLOC(Document doc, NumLinesLOC counts) { 208 doc.add(new StoredField(QueryBuilder.NUML, counts.getNumLines())); 209 doc.add(new StoredField(QueryBuilder.LOC, counts.getLOC())); 210 211 if (countsAggregator != null) { 212 countsAggregator.register(counts); 213 } 214 } 215 createPlainSymbolTokenizer()216 private JFlexTokenizer createPlainSymbolTokenizer() { 217 return new JFlexTokenizer(new PlainSymbolTokenizer( 218 AbstractAnalyzer.DUMMY_READER)); 219 } 220 createPlainFullTokenizer()221 private JFlexTokenizer createPlainFullTokenizer() { 222 return new JFlexTokenizer(new PlainFullTokenizer( 223 AbstractAnalyzer.DUMMY_READER)); 224 } 225 226 @Override normalize(String fieldName, TokenStream in)227 protected TokenStream normalize(String fieldName, TokenStream in) { 228 switch (fieldName) { 229 case QueryBuilder.DEFS: 230 case QueryBuilder.REFS: 231 return in; 232 default: 233 return new LowerCaseFilter(in); 234 } 235 } 236 } 237