xref: /OpenGrok/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/plain/PlainAnalyzerFactoryTest.java (revision d6df19e1b22784c78f567cf74c42f18e3901b900)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * See LICENSE.txt included in this distribution for the specific
9  * language governing permissions and limitations under the License.
10  *
11  * When distributing Covered Code, include this CDDL HEADER in each
12  * file and include the License file at LICENSE.txt.
13  * If applicable, add the following below this CDDL HEADER, with the
14  * fields enclosed by brackets "[]" replaced with your own identifying
15  * information: Portions Copyright [yyyy] [name of copyright owner]
16  *
17  * CDDL HEADER END
18  */
19 
20 /*
21  * Copyright (c) 2021, Chris Fraire <cfraire@me.com>.
22  */
23 package org.opengrok.indexer.analysis.plain;
24 
25 import org.junit.jupiter.api.Test;
26 import org.opengrok.indexer.util.IOUtils;
27 
28 import java.io.ByteArrayInputStream;
29 import java.io.ByteArrayOutputStream;
30 import java.io.IOException;
31 import java.nio.charset.Charset;
32 import java.nio.charset.StandardCharsets;
33 import java.util.Arrays;
34 
35 import static org.junit.jupiter.api.Assertions.assertFalse;
36 import static org.junit.jupiter.api.Assertions.assertTrue;
37 
38 /**
39  * Represents a container for tests of {@link PlainAnalyzerFactory}.
40  */
41 class PlainAnalyzerFactoryTest {
42 
43     @Test
shouldMatchStrictASCII()44     void shouldMatchStrictASCII() throws IOException {
45         byte[] fileBytes = "The contents of this file are subject to the terms of the".
46                 getBytes(StandardCharsets.US_ASCII);
47         boolean isMatch = checkIsPlainMatch(fileBytes);
48         assertTrue(isMatch, "should match strict ASCII content");
49     }
50 
51     @Test
shouldMatchShortStrictASCII()52     void shouldMatchShortStrictASCII() throws IOException {
53         byte[] fileBytes = "The".getBytes(StandardCharsets.US_ASCII);
54         boolean isMatch = checkIsPlainMatch(fileBytes);
55         assertTrue(isMatch, "should match strict ASCII short content");
56     }
57 
58     @Test
shouldNotMatchASCIIWithNonWhitespaceControl()59     void shouldNotMatchASCIIWithNonWhitespaceControl() throws IOException {
60         byte[] fileBytes = "The\u0001contents of this file are subject to the terms of the".
61                 getBytes(StandardCharsets.US_ASCII);
62         boolean isMatch = checkIsPlainMatch(fileBytes);
63         assertFalse(isMatch, "should not match ASCII with non-whitespace control character");
64     }
65 
66     @Test
shouldMatchNonASCIIUTF8WithoutBOM()67     void shouldMatchNonASCIIUTF8WithoutBOM() throws IOException {
68         byte[] fileBytes = "ゲーム盤の生成(h:縦,w:横,m:爆弾の数)".getBytes(StandardCharsets.UTF_8);
69         boolean isMatch = checkIsPlainMatch(fileBytes);
70         assertTrue(isMatch, "should match non-ASCII UTF-8 without BOM");
71     }
72 
73     @Test
shouldMatchNonASCIIUTF8WithBOM()74     void shouldMatchNonASCIIUTF8WithBOM() throws IOException {
75         byte[] fileBytes = enc(IOUtils.utf8Bom(), "ゲーム盤の生成(h:縦,w:横,m:爆弾の数)",
76                 StandardCharsets.UTF_8);
77         boolean isMatch = checkIsPlainMatch(fileBytes);
78         assertTrue(isMatch, "should match non-ASCII UTF-8 with BOM");
79     }
80 
81     @Test
shouldMatchUTF16BEWithBOM()82     void shouldMatchUTF16BEWithBOM() throws IOException {
83         byte[] fileBytes = enc(IOUtils.utf16BeBom(), "The contents of this file are subject to",
84                 StandardCharsets.UTF_16BE);
85         boolean isMatch = checkIsPlainMatch(fileBytes);
86         assertTrue(isMatch, "should match UTF-16BE content with BOM");
87     }
88 
89     @Test
shouldNotMatchUTF16BEWithoutBOM()90     void shouldNotMatchUTF16BEWithoutBOM() throws IOException {
91         byte[] fileBytes = "The contents of this file are subject to".getBytes(
92                 StandardCharsets.UTF_16BE);
93         boolean isMatch = checkIsPlainMatch(fileBytes);
94         assertFalse(isMatch, "should not match UTF-16BE content without BOM");
95     }
96 
97     @Test
shouldMatchUTF16LEWithBOM()98     void shouldMatchUTF16LEWithBOM() throws IOException {
99         byte[] fileBytes = enc(IOUtils.utf16LeBom(), "The contents of this file are subject to",
100                 StandardCharsets.UTF_16LE);
101         boolean isMatch = checkIsPlainMatch(fileBytes);
102         assertTrue(isMatch, "should match UTF-16LE content");
103     }
104 
105     @Test
shouldNotMatchUTF16LEWithoutBOM()106     void shouldNotMatchUTF16LEWithoutBOM() throws IOException {
107         byte[] fileBytes = "The contents of this file are subject to".getBytes(
108                 StandardCharsets.UTF_16LE);
109         boolean isMatch = checkIsPlainMatch(fileBytes);
110         assertFalse(isMatch, "should not match UTF-16LE content without BOM");
111     }
112 
113     @Test
shouldNotMatchUtfEbcdic()114     void shouldNotMatchUtfEbcdic() throws IOException {
115         /*
116          * 4-byte UTF-EBCDIC BOM plus 2-byte UTF-EBCDIC 'H' 'i'. EBCDIC 'H' 'i'
117          * on its own would be mis-identified as extended ASCII plain text.
118          */
119         byte[] fileBytes = new byte[]{(byte) 0xDD, (byte) 0x73, (byte) 0x66, (byte) 0x73,
120                 (byte) 0xC8, (byte) 0x89};
121         boolean isMatch = checkIsPlainMatch(fileBytes);
122         assertFalse(isMatch, "should not match UTF-EBCDIC content");
123     }
124 
checkIsPlainMatch(byte[] fileBytes)125     private static boolean checkIsPlainMatch(byte[] fileBytes) throws IOException {
126         byte[] leadingContent = Arrays.copyOf(fileBytes, Math.max(8, fileBytes.length));
127         ByteArrayInputStream bin = new ByteArrayInputStream(fileBytes);
128 
129         return (PlainAnalyzerFactory.MATCHER.isMagic(leadingContent, bin) != null);
130     }
131 
enc(byte[] bom, String value, Charset charset)132     private static byte[] enc(byte[] bom, String value, Charset charset) throws IOException {
133         ByteArrayOutputStream bout = new ByteArrayOutputStream();
134         bout.write(bom);
135         bout.write(value.getBytes(charset));
136         return bout.toByteArray();
137     }
138 }
139