xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/plain/PlainAnalyzerFactory.java (revision 750b3115a5b8976536ee4dccce497eb97b7a4c9b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * See LICENSE.txt included in this distribution for the specific
9  * language governing permissions and limitations under the License.
10  *
11  * When distributing Covered Code, include this CDDL HEADER in each
12  * file and include the License file at LICENSE.txt.
13  * If applicable, add the following below this CDDL HEADER, with the
14  * fields enclosed by brackets "[]" replaced with your own identifying
15  * information: Portions Copyright [yyyy] [name of copyright owner]
16  *
17  * CDDL HEADER END
18  */
19 
20 /*
21  * Copyright (c) 2007, 2021, Oracle and/or its affiliates. All rights reserved.
22  * Portions Copyright (c) 2017, 2021, Chris Fraire <cfraire@me.com>.
23  */
24 package org.opengrok.indexer.analysis.plain;
25 
26 import java.io.IOException;
27 import java.io.InputStream;
28 import java.nio.ByteBuffer;
29 import java.nio.CharBuffer;
30 import java.nio.charset.CharsetDecoder;
31 import java.nio.charset.CoderResult;
32 import java.nio.charset.CodingErrorAction;
33 import java.nio.charset.StandardCharsets;
34 import java.util.Arrays;
35 
36 import org.opengrok.indexer.analysis.AbstractAnalyzer;
37 import org.opengrok.indexer.analysis.AnalyzerFactory;
38 import org.opengrok.indexer.analysis.FileAnalyzerFactory;
39 import org.opengrok.indexer.util.IOUtils;
40 
41 /**
42  * Represents a subclass of {@link FileAnalyzerFactory} for plain-text
43  * files in ASCII, UTF-8, or UTF-16.
44  */
45 public final class PlainAnalyzerFactory extends FileAnalyzerFactory {
46 
47     private static final String NAME = "Plain Text";
48 
49     private static final int MIN_CHARS_WHILE_REMAINING = 20;
50 
51     // Up to 4 octets per UTF-8 character
52     private static final int TRY_UTF8_BYTES = MIN_CHARS_WHILE_REMAINING * 4;
53 
54     /**
55      * The reentrant {@link Matcher} implementation for plain-text files.
56      */
57     public static final Matcher MATCHER = new Matcher() {
58             @Override
59             public String description() {
60                 return "UTF-8, UTF-16BE, or UTF-16LE Byte Order Mark is present; or initial " +
61                         "bytes are all UTF-8-encoded graphic characters or whitespace";
62             }
63 
64             @Override
65             public AnalyzerFactory isMagic(byte[] content, InputStream in) throws IOException {
66                 int lengthBOM = IOUtils.skipForBOM(content);
67                 if (lengthBOM > 0) {
68                     return DEFAULT_INSTANCE;
69                 }
70                 if (readSomePlainCharactersUTF8noBOMwithoutError(in)) {
71                     return DEFAULT_INSTANCE;
72                 }
73                 return null;
74             }
75 
76             @Override
77             public AnalyzerFactory forFactory() {
78                 return DEFAULT_INSTANCE;
79             }
80     };
81 
82     /**
83      * Gets the singleton, factory instance that associates
84      * {@link PlainAnalyzer} with files whose initial bytes are the UTF-8,
85      * UTF-16BE, or UTF-16LE Byte Order Mark; or whose initial bytes are all
86      * UTF-8-encoded graphic characters or whitespace.
87      */
88     public static final PlainAnalyzerFactory DEFAULT_INSTANCE = new PlainAnalyzerFactory();
89 
PlainAnalyzerFactory()90     private PlainAnalyzerFactory() {
91         super(null, null, null, null, MATCHER, "text/plain", AbstractAnalyzer.Genre.PLAIN, NAME);
92     }
93 
94     @Override
newAnalyzer()95     protected AbstractAnalyzer newAnalyzer() {
96         return new PlainAnalyzer(this);
97     }
98 
readSomePlainCharactersUTF8noBOMwithoutError(InputStream in)99     private static boolean readSomePlainCharactersUTF8noBOMwithoutError(InputStream in)
100             throws IOException {
101 
102         boolean isEOF = false;
103         byte[] bytes = new byte[TRY_UTF8_BYTES];
104         in.mark(TRY_UTF8_BYTES);
105         int len = in.read(bytes);
106         in.reset();
107         if (len < 1) {
108             return false;
109         }
110         if (len != TRY_UTF8_BYTES) {
111             bytes = Arrays.copyOf(bytes, len);
112             isEOF = true;
113         }
114 
115         /*
116          * Decode one character at a time until either a decoding error occurs
117          * (failure) or the minimum number of required, valid characters is
118          * reached (success).
119          *
120          * "Decode bytes to chars one at a time"
121          * answered by https://stackoverflow.com/users/1831293/evgeniy-dorofeev
122          * https://stackoverflow.com/questions/17227331/decode-bytes-to-chars-one-at-a-time
123          * asked by https://stackoverflow.com/users/244360/kong
124          *
125          * Used under CC 4 with modifications noted as follows as required by
126          * license:
127          * * 2021-08-15 -- cfraire@me.com, revised to check for errors.
128          */
129         CharsetDecoder cd = StandardCharsets.UTF_8.newDecoder().
130                 onMalformedInput(CodingErrorAction.REPORT).
131                 onUnmappableCharacter(CodingErrorAction.REPORT);
132         ByteBuffer bin = ByteBuffer.wrap(bytes);
133         CharBuffer out = CharBuffer.allocate(MIN_CHARS_WHILE_REMAINING);
134         int numCharacters = 0;
135         CoderResult decodeResult = cd.decode(bin, out, isEOF);
136         if (decodeResult.isError()) {
137             return false;
138         }
139 
140         int numChars = out.position();
141         out.position(0);
142         for (int i = 0; i < numChars; ++i) {
143             char c = out.charAt(i);
144             if (Character.isISOControl(c) && !Character.isWhitespace(c)) {
145                 return false;
146             }
147             if (++numCharacters >= MIN_CHARS_WHILE_REMAINING) {
148                 return true;
149             }
150         }
151         /*
152          * At this point, as no error has occurred, then if any character was
153          * read, consider the input as plain text.
154          */
155         return (numCharacters > 0);
156     }
157 }
158