xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/FileAnalyzer.java (revision 0e4c55544f8ea0a68e8bae37b0e502097e008ec1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * See LICENSE.txt included in this distribution for the specific
9  * language governing permissions and limitations under the License.
10  *
11  * When distributing Covered Code, include this CDDL HEADER in each
12  * file and include the License file at LICENSE.txt.
13  * If applicable, add the following below this CDDL HEADER, with the
14  * fields enclosed by brackets "[]" replaced with your own identifying
15  * information: Portions Copyright [yyyy] [name of copyright owner]
16  *
17  * CDDL HEADER END
18  */
19 
20 /*
21  * Copyright (c) 2005, 2019, Oracle and/or its affiliates. All rights reserved. Use is subject to license terms.
22  * Portions Copyright (c) 2017, 2020, Chris Fraire <cfraire@me.com>.
23  */
24 package org.opengrok.indexer.analysis;
25 
26 import java.io.IOException;
27 import java.io.Writer;
28 import java.util.Locale;
29 import java.util.function.Supplier;
30 import java.util.logging.Level;
31 import java.util.logging.Logger;
32 import org.apache.lucene.analysis.Analyzer;
33 import org.apache.lucene.analysis.LowerCaseFilter;
34 import org.apache.lucene.analysis.TokenStream;
35 import org.apache.lucene.document.Document;
36 import org.apache.lucene.document.StoredField;
37 import org.opengrok.indexer.analysis.plain.PlainFullTokenizer;
38 import org.opengrok.indexer.analysis.plain.PlainSymbolTokenizer;
39 import org.opengrok.indexer.logger.LoggerFactory;
40 import org.opengrok.indexer.search.QueryBuilder;
41 
42 /**
43  * Base class for all different File Analyzers.
44  *
45  * An Analyzer for a filetype provides
46  * <ol>
47  * <li>the file extensions and magic numbers it analyzes</li>
48  * <li>a lucene document listing the fields it can support</li>
49  * <li>TokenStreams for each of the field it said requires tokenizing in 2</li>
50  * <li>cross reference in HTML format</li>
51  * <li>The type of file data, plain text etc</li>
52  * </ol>
53  *
54  * Created on September 21, 2005
55  *
56  * @author Chandan
57  */
58 public class FileAnalyzer extends AbstractAnalyzer {
59 
60     private static final Logger LOGGER = LoggerFactory.getLogger(FileAnalyzer.class);
61 
62     /**
63      * @return {@code null} as there is no aligned language
64      */
65     @Override
getCtagsLang()66     public String getCtagsLang() {
67         return null;
68     }
69 
70     /**
71      * Gets a version number to be used to tag processed documents so that
72      * re-analysis can be re-done later if a stored version number is different
73      * from the current implementation.
74      * <p>
75      * The value is the union of a {@link FileAnalyzer} root version and the
76      * value from {@link #getSpecializedVersionNo()}. Changing the root version
77      * affects all analyzers simultaneously; while subclasses can override
78      * {@link #getSpecializedVersionNo()} to allow changes that affect a few.
79      * @return (20061115_01 &lt;&lt; 32) | {@link #getSpecializedVersionNo()}
80      */
81     @Override
getVersionNo()82     public final long getVersionNo() {
83         final int rootVersionNo = 20061115_01; // Edit comment above too!
84         return ((long) rootVersionNo << 32) | getSpecializedVersionNo();
85     }
86 
87     @Override
supportsScopes()88     protected boolean supportsScopes() {
89         return false;
90     }
91 
92     /**
93      * Creates a new instance of FileAnalyzer.
94      *
95      * @param factory defined instance for the analyzer
96      */
FileAnalyzer(AnalyzerFactory factory)97     public FileAnalyzer(AnalyzerFactory factory) {
98         super(Analyzer.PER_FIELD_REUSE_STRATEGY);
99         if (factory == null) {
100             throw new IllegalArgumentException("`factory' is null");
101         }
102         this.factory = factory;
103         this.symbolTokenizerFactory = this::createPlainSymbolTokenizer;
104     }
105 
106     /**
107      * Creates a new instance of {@link FileAnalyzer}.
108      *
109      * @param factory defined instance for the analyzer
110      * @param symbolTokenizerFactory a defined instance relevant for the file
111      */
FileAnalyzer(AnalyzerFactory factory, Supplier<JFlexTokenizer> symbolTokenizerFactory)112     protected FileAnalyzer(AnalyzerFactory factory,
113             Supplier<JFlexTokenizer> symbolTokenizerFactory) {
114 
115         super(Analyzer.PER_FIELD_REUSE_STRATEGY);
116         if (factory == null) {
117             throw new IllegalArgumentException("`factory' is null");
118         }
119         if (symbolTokenizerFactory == null) {
120             throw new IllegalArgumentException("symbolTokenizerFactory is null");
121         }
122         this.factory = factory;
123         this.symbolTokenizerFactory = symbolTokenizerFactory;
124     }
125 
126     /**
127      * Returns the normalized name of the analyzer, which should corresponds to
128      * the file type. Example: The analyzer for the C language (CAnalyzer) would
129      * return “c”.
130      *
131      * @return Normalized name of the analyzer.
132      */
133     @Override
getFileTypeName()134     public String getFileTypeName() {
135         String name = this.getClass().getSimpleName().toLowerCase(Locale.ROOT);
136         String suffix = "analyzer";
137 
138         if (name.endsWith(suffix)) {
139             return name.substring(0, name.length() - suffix.length());
140         }
141 
142         return name;
143     }
144 
145     /**
146      * Analyze the contents of a source file. This includes populating the
147      * Lucene document with fields to add to the index, and writing the
148      * cross-referenced data to the specified destination.
149      *
150      * @param doc the Lucene document
151      * @param src the input data source
152      * @param xrefOut where to write the xref (may be {@code null})
153      * @throws IOException if any I/O error
154      * @throws InterruptedException if a timeout occurs
155      */
156     @Override
analyze(Document doc, StreamSource src, Writer xrefOut)157     public void analyze(Document doc, StreamSource src, Writer xrefOut)
158             throws IOException, InterruptedException {
159         // not used
160     }
161 
162     /**
163      * Derived classes should override to write a cross referenced HTML file for
164      * the specified args.
165      *
166      * @param args a defined instance
167      * @return the instance used to write the cross-referencing
168      * @throws java.io.IOException if an error occurs
169      */
170     @Override
writeXref(WriteXrefArgs args)171     public Xrefer writeXref(WriteXrefArgs args) throws IOException {
172         throw new UnsupportedOperationException(
173                 "Base FileAnalyzer cannot write xref");
174     }
175 
176     @Override
createComponents(String fieldName)177     protected TokenStreamComponents createComponents(String fieldName) {
178         switch (fieldName) {
179             case QueryBuilder.FULL:
180                 return new TokenStreamComponents(createPlainFullTokenizer());
181             case QueryBuilder.PATH:
182             case QueryBuilder.PROJECT:
183                 return new TokenStreamComponents(new PathTokenizer());
184             case QueryBuilder.HIST:
185                 try (HistoryAnalyzer historyAnalyzer = new HistoryAnalyzer()) {
186                     return historyAnalyzer.createComponents(fieldName);
187                 }
188             //below is set by PlainAnalyzer to workaround #1376 symbols search works like full text search
189             case QueryBuilder.REFS: {
190                 return new TokenStreamComponents(symbolTokenizerFactory.get());
191             }
192             case QueryBuilder.DEFS:
193                 return new TokenStreamComponents(createPlainSymbolTokenizer());
194             case QueryBuilder.LASTREV:
195                 return new TokenStreamComponents(createPlainFullTokenizer());
196             default:
197                 LOGGER.log(
198                         Level.WARNING, "Have no analyzer for: {0}", fieldName);
199                 return null;
200         }
201     }
202 
203     /**
204      * Add fields to store document number-of-lines and lines-of-code (LOC).
205      */
206     @Override
addNumLinesLOC(Document doc, NumLinesLOC counts)207     protected void addNumLinesLOC(Document doc, NumLinesLOC counts) {
208         doc.add(new StoredField(QueryBuilder.NUML, counts.getNumLines()));
209         doc.add(new StoredField(QueryBuilder.LOC, counts.getLOC()));
210 
211         if (countsAggregator != null) {
212             countsAggregator.register(counts);
213         }
214     }
215 
createPlainSymbolTokenizer()216     private JFlexTokenizer createPlainSymbolTokenizer() {
217         return new JFlexTokenizer(new PlainSymbolTokenizer(
218                 AbstractAnalyzer.DUMMY_READER));
219     }
220 
createPlainFullTokenizer()221     private JFlexTokenizer createPlainFullTokenizer() {
222         return new JFlexTokenizer(new PlainFullTokenizer(
223                 AbstractAnalyzer.DUMMY_READER));
224     }
225 
226     @Override
normalize(String fieldName, TokenStream in)227     protected TokenStream normalize(String fieldName, TokenStream in) {
228         switch (fieldName) {
229             case QueryBuilder.DEFS:
230             case QueryBuilder.REFS:
231                 return in;
232             default:
233                 return new LowerCaseFilter(in);
234         }
235     }
236 }
237