/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2007, 2021, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2017, 2021, Chris Fraire <cfraire@me.com>.
 */
package org.opengrok.indexer.analysis.plain;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.opengrok.indexer.analysis.AbstractAnalyzer;
import org.opengrok.indexer.analysis.AnalyzerFactory;
import org.opengrok.indexer.analysis.FileAnalyzerFactory;
import org.opengrok.indexer.util.IOUtils;

/**
 * Represents a subclass of {@link FileAnalyzerFactory} for plain-text
 * files in ASCII, UTF-8, or UTF-16.
 */
public final class PlainAnalyzerFactory extends FileAnalyzerFactory {

    private static final String NAME = "Plain Text";

    /**
     * Maximum number of decoded characters that are inspected when sniffing
     * content that has no Byte Order Mark.
     */
    private static final int MIN_CHARS_WHILE_REMAINING = 20;

    // Up to 4 octets per UTF-8 character
    private static final int TRY_UTF8_BYTES = MIN_CHARS_WHILE_REMAINING * 4;

    /**
     * The reentrant {@link Matcher} implementation for plain-text files.
     */
    public static final Matcher MATCHER = new Matcher() {
        @Override
        public String description() {
            return "UTF-8, UTF-16BE, or UTF-16LE Byte Order Mark is present; or initial " +
                    "bytes are all UTF-8-encoded graphic characters or whitespace";
        }

        /**
         * Reports {@link #DEFAULT_INSTANCE} when {@code content} starts with
         * a Byte Order Mark or when the initial bytes of {@code in} decode as
         * UTF-8 graphic characters or whitespace.
         * @param content the initial bytes of the file
         * @param in a stream positioned at the start of the file; it is
         * marked and reset while sniffing
         * @return {@link #DEFAULT_INSTANCE} on a match or {@code null}
         * otherwise
         * @throws IOException if reading from or resetting {@code in} fails
         */
        @Override
        public AnalyzerFactory isMagic(byte[] content, InputStream in) throws IOException {
            // A positive result is treated as "a BOM is present".
            if (IOUtils.skipForBOM(content) > 0) {
                return DEFAULT_INSTANCE;
            }
            if (readSomePlainCharactersUTF8noBOMwithoutError(in)) {
                return DEFAULT_INSTANCE;
            }
            return null;
        }

        @Override
        public AnalyzerFactory forFactory() {
            return DEFAULT_INSTANCE;
        }
    };

    /**
     * Gets the singleton, factory instance that associates
     * {@link PlainAnalyzer} with files whose initial bytes are the UTF-8,
     * UTF-16BE, or UTF-16LE Byte Order Mark; or whose initial bytes are all
     * UTF-8-encoded graphic characters or whitespace.
     */
    public static final PlainAnalyzerFactory DEFAULT_INSTANCE = new PlainAnalyzerFactory();

    private PlainAnalyzerFactory() {
        super(null, null, null, null, MATCHER, "text/plain", AbstractAnalyzer.Genre.PLAIN, NAME);
    }

    /**
     * Creates a new {@link PlainAnalyzer} bound to this factory.
     * @return a new analyzer instance
     */
    @Override
    protected AbstractAnalyzer newAnalyzer() {
        return new PlainAnalyzer(this);
    }

    /**
     * Peeks at up to {@link #TRY_UTF8_BYTES} bytes of {@code in} and decides
     * whether they look like BOM-less UTF-8 plain text. The stream is marked
     * before reading and reset afterward.
     * @param in a stream that is marked and reset by this method
     * @return {@code true} if at least one character was decoded without
     * error and every inspected character is either whitespace or not an ISO
     * control character; {@code false} otherwise, including for empty input
     * @throws IOException if reading from or resetting {@code in} fails
     */
    private static boolean readSomePlainCharactersUTF8noBOMwithoutError(InputStream in)
            throws IOException {

        byte[] bytes = new byte[TRY_UTF8_BYTES];
        in.mark(TRY_UTF8_BYTES);
        int len = readUntilFullOrEOF(in, bytes);
        in.reset();
        if (len < 1) {
            return false;
        }

        /*
         * The buffer is only left partially filled when the stream really
         * ended, so a trailing, truncated multi-byte sequence is reported as
         * malformed only when it is truly the end of the input.
         */
        boolean isEOF = len < TRY_UTF8_BYTES;
        if (isEOF) {
            bytes = Arrays.copyOf(bytes, len);
        }

        /*
         * Decode up to MIN_CHARS_WHILE_REMAINING characters, where a decoding
         * error means failure.
         *
         * "Decode bytes to chars one at a time"
         * answered by https://stackoverflow.com/users/1831293/evgeniy-dorofeev
         * https://stackoverflow.com/questions/17227331/decode-bytes-to-chars-one-at-a-time
         * asked by https://stackoverflow.com/users/244360/kong
         *
         * Used under CC 4 with modifications noted as follows as required by
         * license:
         * * 2021-08-15 -- cfraire@me.com, revised to check for errors.
         */
        CharsetDecoder cd = StandardCharsets.UTF_8.newDecoder().
                onMalformedInput(CodingErrorAction.REPORT).
                onUnmappableCharacter(CodingErrorAction.REPORT);
        ByteBuffer bin = ByteBuffer.wrap(bytes);
        CharBuffer out = CharBuffer.allocate(MIN_CHARS_WHILE_REMAINING);
        CoderResult decodeResult = cd.decode(bin, out, isEOF);
        if (decodeResult.isError()) {
            return false;
        }

        // The buffer's capacity caps this at MIN_CHARS_WHILE_REMAINING.
        out.flip();
        int numChars = out.remaining();
        for (int i = 0; i < numChars; ++i) {
            char c = out.charAt(i);
            if (Character.isISOControl(c) && !Character.isWhitespace(c)) {
                return false;
            }
        }
        /*
         * At this point, as no error has occurred, then if any character was
         * read, consider the input as plain text.
         */
        return numChars > 0;
    }

    /**
     * Reads from {@code in} until {@code buffer} is full or the end of the
     * stream is reached, because a single {@link InputStream#read(byte[])}
     * call may return fewer bytes than requested before the end of stream.
     * @param in the stream to read
     * @param buffer the destination, filled from index zero
     * @return the number of bytes stored, which is less than
     * {@code buffer.length} only if the end of the stream was reached
     * @throws IOException if reading fails
     */
    private static int readUntilFullOrEOF(InputStream in, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int n = in.read(buffer, total, buffer.length - total);
            if (n < 0) {
                break;
            }
            total += n;
        }
        return total;
    }
}