xref: /OpenGrok/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/TextAnalyzerTest.java (revision 52d10766ed1db3b0fd2c59a0da7292a32f244b50)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, 2021, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2017, Chris Fraire <cfraire@me.com>.
 */
249805b761SAdam Hornáček package org.opengrok.indexer.analysis;
25b5840353SAdam Hornáček 
26b5840353SAdam Hornáček import java.io.ByteArrayInputStream;
27b5840353SAdam Hornáček import java.io.IOException;
28b5840353SAdam Hornáček import java.io.InputStream;
29b5840353SAdam Hornáček import java.io.InputStreamReader;
30b5840353SAdam Hornáček import java.io.Reader;
31b5840353SAdam Hornáček import java.io.Writer;
32b5840353SAdam Hornáček import java.nio.ByteBuffer;
33d051e170SChris Fraire import java.nio.charset.StandardCharsets;
34b5840353SAdam Hornáček 
35b5840353SAdam Hornáček import org.apache.lucene.document.Document;
36*52d10766SAdam Hornacek import org.junit.jupiter.api.Test;
379805b761SAdam Hornáček import org.opengrok.indexer.analysis.plain.PlainAnalyzerFactory;
38b5840353SAdam Hornáček 
39*52d10766SAdam Hornacek import static org.junit.jupiter.api.Assertions.assertEquals;
40*52d10766SAdam Hornacek 
41b5840353SAdam Hornáček public class TextAnalyzerTest {
42b5840353SAdam Hornáček 
43b5840353SAdam Hornáček     private String encoding;
44b5840353SAdam Hornáček     private String contents;
45b5840353SAdam Hornáček 
getStreamSource(final byte[] bytes)46b5840353SAdam Hornáček     private static StreamSource getStreamSource(final byte[] bytes) {
47b5840353SAdam Hornáček         return new StreamSource() {
48b5840353SAdam Hornáček             @Override
49b5840353SAdam Hornáček             public InputStream getStream() throws IOException {
50b5840353SAdam Hornáček                 return new ByteArrayInputStream(bytes);
51b5840353SAdam Hornáček             }
52b5840353SAdam Hornáček         };
53b5840353SAdam Hornáček     }
54b5840353SAdam Hornáček 
55b5840353SAdam Hornáček     @Test
56b5840353SAdam Hornáček     public void resetsStreamOnShortInput() throws IOException {
57b5840353SAdam Hornáček         new TestableTextAnalyzer().analyze(new Document(),
58b5840353SAdam Hornáček                 getStreamSource("hi".getBytes()), null);
59b5840353SAdam Hornáček 
60b5840353SAdam Hornáček         assertEquals("hi", contents);
61b5840353SAdam Hornáček     }
62b5840353SAdam Hornáček 
63b5840353SAdam Hornáček     @Test
64b5840353SAdam Hornáček     public void utf8WithBOM() throws IOException {
65b5840353SAdam Hornáček         byte[] buffer = new byte[]{(byte) 239, (byte) 187, (byte) 191, 'h', 'e', 'l', 'l', 'o'};
66b5840353SAdam Hornáček         new TestableTextAnalyzer().analyze(new Document(),
67b5840353SAdam Hornáček                 getStreamSource(buffer), null);
68b5840353SAdam Hornáček 
69b5840353SAdam Hornáček         assertEquals("hello", contents);
70b5840353SAdam Hornáček         assertEquals("UTF8", encoding);
71b5840353SAdam Hornáček     }
72b5840353SAdam Hornáček 
73b5840353SAdam Hornáček     @Test
74b5840353SAdam Hornáček     public void utf16WithBOM() throws IOException {
75d051e170SChris Fraire         final ByteBuffer utf16str = StandardCharsets.UTF_16.encode("hello");
76b5840353SAdam Hornáček         byte[] bytes = new byte[utf16str.remaining()];
77b5840353SAdam Hornáček         utf16str.get(bytes, 0, bytes.length);
78b5840353SAdam Hornáček 
79b5840353SAdam Hornáček         new TestableTextAnalyzer().analyze(new Document(),
80b5840353SAdam Hornáček                 getStreamSource(bytes), null);
81b5840353SAdam Hornáček 
82b5840353SAdam Hornáček         assertEquals("UTF-16", encoding);
83b5840353SAdam Hornáček 
84b5840353SAdam Hornáček         assertEquals("hello", contents);
85b5840353SAdam Hornáček     }
86b5840353SAdam Hornáček 
87b5840353SAdam Hornáček     @Test
88b5840353SAdam Hornáček     public void utf16WithBOMAlternate() throws IOException {
89d051e170SChris Fraire         final ByteBuffer utf16str = StandardCharsets.UTF_16.encode("hello");
90b5840353SAdam Hornáček         byte[] bytes = new byte[utf16str.remaining()];
91b5840353SAdam Hornáček         utf16str.get(bytes, 0, bytes.length);
92b5840353SAdam Hornáček 
93b5840353SAdam Hornáček         for (int i = 0; i < bytes.length; i += 2) {
94b5840353SAdam Hornáček             byte b = bytes[i];
95b5840353SAdam Hornáček             bytes[i] = bytes[i + 1];
96b5840353SAdam Hornáček             bytes[i + 1] = b;
97b5840353SAdam Hornáček         }
98b5840353SAdam Hornáček 
99b5840353SAdam Hornáček         new TestableTextAnalyzer().analyze(new Document(),
100b5840353SAdam Hornáček                 getStreamSource(bytes), null);
101b5840353SAdam Hornáček 
102b5840353SAdam Hornáček         assertEquals("UTF-16", encoding);
103b5840353SAdam Hornáček 
104b5840353SAdam Hornáček         assertEquals("hello", contents);
105b5840353SAdam Hornáček     }
106b5840353SAdam Hornáček 
107b5840353SAdam Hornáček     public class TestableTextAnalyzer extends TextAnalyzer {
108b5840353SAdam Hornáček 
109b5840353SAdam Hornáček         public TestableTextAnalyzer() {
110b5840353SAdam Hornáček             // Using PlainAnalyzerFactory.DEFAULT_INSTANCE is OK for this test.
111b5840353SAdam Hornáček             super(PlainAnalyzerFactory.DEFAULT_INSTANCE);
112b5840353SAdam Hornáček         }
113b5840353SAdam Hornáček 
114b5840353SAdam Hornáček         @Override
115b5840353SAdam Hornáček         public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException {
116b5840353SAdam Hornáček             try (Reader r = getReader(src.getStream())) {
117d051e170SChris Fraire                 // Gross and fragile but testing of encoding is needed.
118d051e170SChris Fraire                 if (r instanceof ZeroReader) {
119d051e170SChris Fraire                     encoding = ((ZeroReader) r).getUnderlyingEncoding();
120d051e170SChris Fraire                 } else {
121b5840353SAdam Hornáček                     encoding = ((InputStreamReader) r).getEncoding();
122d051e170SChris Fraire                 }
123b5840353SAdam Hornáček 
124b5840353SAdam Hornáček                 StringBuilder sb = new StringBuilder();
125b5840353SAdam Hornáček                 int c;
126b5840353SAdam Hornáček                 while ((c = r.read()) != -1) {
127b5840353SAdam Hornáček                     sb.append((char) c);
128b5840353SAdam Hornáček                 }
129b5840353SAdam Hornáček 
130b5840353SAdam Hornáček                 contents = sb.toString();
131b5840353SAdam Hornáček             }
132b5840353SAdam Hornáček         }
133b5840353SAdam Hornáček 
134b5840353SAdam Hornáček         @Override
135b5840353SAdam Hornáček         protected JFlexXref newXref(Reader reader) {
136b5840353SAdam Hornáček             throw new UnsupportedOperationException("Not needed by test.");
137b5840353SAdam Hornáček         }
138b5840353SAdam Hornáček     }
139b5840353SAdam Hornáček }
140