1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * See LICENSE.txt included in this distribution for the specific 9 * language governing permissions and limitations under the License. 10 * 11 * When distributing Covered Code, include this CDDL HEADER in each 12 * file and include the License file at LICENSE.txt. 13 * If applicable, add the following below this CDDL HEADER, with the 14 * fields enclosed by brackets "[]" replaced with your own identifying 15 * information: Portions Copyright [yyyy] [name of copyright owner] 16 * 17 * CDDL HEADER END 18 */ 19 20 /* 21 * Copyright (c) 2021, Chris Fraire <cfraire@me.com>. 22 */ 23 package org.opengrok.indexer.analysis.plain; 24 25 import org.junit.jupiter.api.Test; 26 import org.opengrok.indexer.util.IOUtils; 27 28 import java.io.ByteArrayInputStream; 29 import java.io.ByteArrayOutputStream; 30 import java.io.IOException; 31 import java.nio.charset.Charset; 32 import java.nio.charset.StandardCharsets; 33 import java.util.Arrays; 34 35 import static org.junit.jupiter.api.Assertions.assertFalse; 36 import static org.junit.jupiter.api.Assertions.assertTrue; 37 38 /** 39 * Represents a container for tests of {@link PlainAnalyzerFactory}. 40 */ 41 class PlainAnalyzerFactoryTest { 42 43 @Test shouldMatchStrictASCII()44 void shouldMatchStrictASCII() throws IOException { 45 byte[] fileBytes = "The contents of this file are subject to the terms of the". 46 getBytes(StandardCharsets.US_ASCII); 47 boolean isMatch = checkIsPlainMatch(fileBytes); 48 assertTrue(isMatch, "should match strict ASCII content"); 49 } 50 51 @Test shouldMatchShortStrictASCII()52 void shouldMatchShortStrictASCII() throws IOException { 53 byte[] fileBytes = "The".getBytes(StandardCharsets.US_ASCII); 54 boolean isMatch = checkIsPlainMatch(fileBytes); 55 assertTrue(isMatch, "should match strict ASCII short content"); 56 } 57 58 @Test shouldNotMatchASCIIWithNonWhitespaceControl()59 void shouldNotMatchASCIIWithNonWhitespaceControl() throws IOException { 60 byte[] fileBytes = "The\u0001contents of this file are subject to the terms of the". 61 getBytes(StandardCharsets.US_ASCII); 62 boolean isMatch = checkIsPlainMatch(fileBytes); 63 assertFalse(isMatch, "should not match ASCII with non-whitespace control character"); 64 } 65 66 @Test shouldMatchNonASCIIUTF8WithoutBOM()67 void shouldMatchNonASCIIUTF8WithoutBOM() throws IOException { 68 byte[] fileBytes = "ゲーム盤の生成(h:縦,w:横,m:爆弾の数)".getBytes(StandardCharsets.UTF_8); 69 boolean isMatch = checkIsPlainMatch(fileBytes); 70 assertTrue(isMatch, "should match non-ASCII UTF-8 without BOM"); 71 } 72 73 @Test shouldMatchNonASCIIUTF8WithBOM()74 void shouldMatchNonASCIIUTF8WithBOM() throws IOException { 75 byte[] fileBytes = enc(IOUtils.utf8Bom(), "ゲーム盤の生成(h:縦,w:横,m:爆弾の数)", 76 StandardCharsets.UTF_8); 77 boolean isMatch = checkIsPlainMatch(fileBytes); 78 assertTrue(isMatch, "should match non-ASCII UTF-8 with BOM"); 79 } 80 81 @Test shouldMatchUTF16BEWithBOM()82 void shouldMatchUTF16BEWithBOM() throws IOException { 83 byte[] fileBytes = enc(IOUtils.utf16BeBom(), "The contents of this file are subject to", 84 StandardCharsets.UTF_16BE); 85 boolean isMatch = checkIsPlainMatch(fileBytes); 86 assertTrue(isMatch, "should match UTF-16BE content with BOM"); 87 } 88 89 @Test shouldNotMatchUTF16BEWithoutBOM()90 void shouldNotMatchUTF16BEWithoutBOM() throws IOException { 91 byte[] fileBytes = "The contents of this file are subject to".getBytes( 92 StandardCharsets.UTF_16BE); 93 boolean isMatch = checkIsPlainMatch(fileBytes); 94 assertFalse(isMatch, "should not match UTF-16BE content without BOM"); 95 } 96 97 @Test shouldMatchUTF16LEWithBOM()98 void shouldMatchUTF16LEWithBOM() throws IOException { 99 byte[] fileBytes = enc(IOUtils.utf16LeBom(), "The contents of this file are subject to", 100 StandardCharsets.UTF_16LE); 101 boolean isMatch = checkIsPlainMatch(fileBytes); 102 assertTrue(isMatch, "should match UTF-16LE content"); 103 } 104 105 @Test shouldNotMatchUTF16LEWithoutBOM()106 void shouldNotMatchUTF16LEWithoutBOM() throws IOException { 107 byte[] fileBytes = "The contents of this file are subject to".getBytes( 108 StandardCharsets.UTF_16LE); 109 boolean isMatch = checkIsPlainMatch(fileBytes); 110 assertFalse(isMatch, "should not match UTF-16LE content without BOM"); 111 } 112 113 @Test shouldNotMatchUtfEbcdic()114 void shouldNotMatchUtfEbcdic() throws IOException { 115 /* 116 * 4-byte UTF-EBCDIC BOM plus 2-byte UTF-EBCDIC 'H' 'i'. EBCDIC 'H' 'i' 117 * on its own would be mis-identified as extended ASCII plain text. 118 */ 119 byte[] fileBytes = new byte[]{(byte) 0xDD, (byte) 0x73, (byte) 0x66, (byte) 0x73, 120 (byte) 0xC8, (byte) 0x89}; 121 boolean isMatch = checkIsPlainMatch(fileBytes); 122 assertFalse(isMatch, "should not match UTF-EBCDIC content"); 123 } 124 checkIsPlainMatch(byte[] fileBytes)125 private static boolean checkIsPlainMatch(byte[] fileBytes) throws IOException { 126 byte[] leadingContent = Arrays.copyOf(fileBytes, Math.max(8, fileBytes.length)); 127 ByteArrayInputStream bin = new ByteArrayInputStream(fileBytes); 128 129 return (PlainAnalyzerFactory.MATCHER.isMagic(leadingContent, bin) != null); 130 } 131 enc(byte[] bom, String value, Charset charset)132 private static byte[] enc(byte[] bom, String value, Charset charset) throws IOException { 133 ByteArrayOutputStream bout = new ByteArrayOutputStream(); 134 bout.write(bom); 135 bout.write(value.getBytes(charset)); 136 return bout.toByteArray(); 137 } 138 } 139