1b5840353SAdam Hornáček /* 2b5840353SAdam Hornáček * CDDL HEADER START 3b5840353SAdam Hornáček * 4b5840353SAdam Hornáček * The contents of this file are subject to the terms of the 5b5840353SAdam Hornáček * Common Development and Distribution License (the "License"). 6b5840353SAdam Hornáček * You may not use this file except in compliance with the License. 7b5840353SAdam Hornáček * 8b5840353SAdam Hornáček * See LICENSE.txt included in this distribution for the specific 9b5840353SAdam Hornáček * language governing permissions and limitations under the License. 10b5840353SAdam Hornáček * 11b5840353SAdam Hornáček * When distributing Covered Code, include this CDDL HEADER in each 12b5840353SAdam Hornáček * file and include the License file at LICENSE.txt. 13b5840353SAdam Hornáček * If applicable, add the following below this CDDL HEADER, with the 14b5840353SAdam Hornáček * fields enclosed by brackets "[]" replaced with your own identifying 15b5840353SAdam Hornáček * information: Portions Copyright [yyyy] [name of copyright owner] 16b5840353SAdam Hornáček * 17b5840353SAdam Hornáček * CDDL HEADER END 18b5840353SAdam Hornáček */ 19b5840353SAdam Hornáček 20b5840353SAdam Hornáček /* 21*52d10766SAdam Hornacek * Copyright (c) 2010, 2021, Oracle and/or its affiliates. All rights reserved. 22b5840353SAdam Hornáček * Portions Copyright (c) 2017, Chris Fraire <cfraire@me.com>. 23b5840353SAdam Hornáček */ 249805b761SAdam Hornáček package org.opengrok.indexer.analysis; 25b5840353SAdam Hornáček 26b5840353SAdam Hornáček import java.io.ByteArrayInputStream; 27b5840353SAdam Hornáček import java.io.IOException; 28b5840353SAdam Hornáček import java.io.InputStream; 29b5840353SAdam Hornáček import java.io.InputStreamReader; 30b5840353SAdam Hornáček import java.io.Reader; 31b5840353SAdam Hornáček import java.io.Writer; 32b5840353SAdam Hornáček import java.nio.ByteBuffer; 33d051e170SChris Fraire import java.nio.charset.StandardCharsets; 34b5840353SAdam Hornáček 35b5840353SAdam Hornáček import org.apache.lucene.document.Document; 36*52d10766SAdam Hornacek import org.junit.jupiter.api.Test; 379805b761SAdam Hornáček import org.opengrok.indexer.analysis.plain.PlainAnalyzerFactory; 38b5840353SAdam Hornáček 39*52d10766SAdam Hornacek import static org.junit.jupiter.api.Assertions.assertEquals; 40*52d10766SAdam Hornacek 41b5840353SAdam Hornáček public class TextAnalyzerTest { 42b5840353SAdam Hornáček 43b5840353SAdam Hornáček private String encoding; 44b5840353SAdam Hornáček private String contents; 45b5840353SAdam Hornáček getStreamSource(final byte[] bytes)46b5840353SAdam Hornáček private static StreamSource getStreamSource(final byte[] bytes) { 47b5840353SAdam Hornáček return new StreamSource() { 48b5840353SAdam Hornáček @Override 49b5840353SAdam Hornáček public InputStream getStream() throws IOException { 50b5840353SAdam Hornáček return new ByteArrayInputStream(bytes); 51b5840353SAdam Hornáček } 52b5840353SAdam Hornáček }; 53b5840353SAdam Hornáček } 54b5840353SAdam Hornáček 55b5840353SAdam Hornáček @Test 56b5840353SAdam Hornáček public void resetsStreamOnShortInput() throws IOException { 57b5840353SAdam Hornáček new TestableTextAnalyzer().analyze(new Document(), 58b5840353SAdam Hornáček getStreamSource("hi".getBytes()), null); 59b5840353SAdam Hornáček 60b5840353SAdam Hornáček assertEquals("hi", contents); 61b5840353SAdam Hornáček } 62b5840353SAdam Hornáček 63b5840353SAdam Hornáček @Test 64b5840353SAdam Hornáček public void utf8WithBOM() throws IOException { 65b5840353SAdam Hornáček byte[] buffer = new byte[]{(byte) 239, (byte) 187, (byte) 191, 'h', 'e', 'l', 'l', 'o'}; 66b5840353SAdam Hornáček new TestableTextAnalyzer().analyze(new Document(), 67b5840353SAdam Hornáček getStreamSource(buffer), null); 68b5840353SAdam Hornáček 69b5840353SAdam Hornáček assertEquals("hello", contents); 70b5840353SAdam Hornáček assertEquals("UTF8", encoding); 71b5840353SAdam Hornáček } 72b5840353SAdam Hornáček 73b5840353SAdam Hornáček @Test 74b5840353SAdam Hornáček public void utf16WithBOM() throws IOException { 75d051e170SChris Fraire final ByteBuffer utf16str = StandardCharsets.UTF_16.encode("hello"); 76b5840353SAdam Hornáček byte[] bytes = new byte[utf16str.remaining()]; 77b5840353SAdam Hornáček utf16str.get(bytes, 0, bytes.length); 78b5840353SAdam Hornáček 79b5840353SAdam Hornáček new TestableTextAnalyzer().analyze(new Document(), 80b5840353SAdam Hornáček getStreamSource(bytes), null); 81b5840353SAdam Hornáček 82b5840353SAdam Hornáček assertEquals("UTF-16", encoding); 83b5840353SAdam Hornáček 84b5840353SAdam Hornáček assertEquals("hello", contents); 85b5840353SAdam Hornáček } 86b5840353SAdam Hornáček 87b5840353SAdam Hornáček @Test 88b5840353SAdam Hornáček public void utf16WithBOMAlternate() throws IOException { 89d051e170SChris Fraire final ByteBuffer utf16str = StandardCharsets.UTF_16.encode("hello"); 90b5840353SAdam Hornáček byte[] bytes = new byte[utf16str.remaining()]; 91b5840353SAdam Hornáček utf16str.get(bytes, 0, bytes.length); 92b5840353SAdam Hornáček 93b5840353SAdam Hornáček for (int i = 0; i < bytes.length; i += 2) { 94b5840353SAdam Hornáček byte b = bytes[i]; 95b5840353SAdam Hornáček bytes[i] = bytes[i + 1]; 96b5840353SAdam Hornáček bytes[i + 1] = b; 97b5840353SAdam Hornáček } 98b5840353SAdam Hornáček 99b5840353SAdam Hornáček new TestableTextAnalyzer().analyze(new Document(), 100b5840353SAdam Hornáček getStreamSource(bytes), null); 101b5840353SAdam Hornáček 102b5840353SAdam Hornáček assertEquals("UTF-16", encoding); 103b5840353SAdam Hornáček 104b5840353SAdam Hornáček assertEquals("hello", contents); 105b5840353SAdam Hornáček } 106b5840353SAdam Hornáček 107b5840353SAdam Hornáček public class TestableTextAnalyzer extends TextAnalyzer { 108b5840353SAdam Hornáček 109b5840353SAdam Hornáček public TestableTextAnalyzer() { 110b5840353SAdam Hornáček // Using PlainAnalyzerFactory.DEFAULT_INSTANCE is OK for this test. 111b5840353SAdam Hornáček super(PlainAnalyzerFactory.DEFAULT_INSTANCE); 112b5840353SAdam Hornáček } 113b5840353SAdam Hornáček 114b5840353SAdam Hornáček @Override 115b5840353SAdam Hornáček public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException { 116b5840353SAdam Hornáček try (Reader r = getReader(src.getStream())) { 117d051e170SChris Fraire // Gross and fragile but testing of encoding is needed. 118d051e170SChris Fraire if (r instanceof ZeroReader) { 119d051e170SChris Fraire encoding = ((ZeroReader) r).getUnderlyingEncoding(); 120d051e170SChris Fraire } else { 121b5840353SAdam Hornáček encoding = ((InputStreamReader) r).getEncoding(); 122d051e170SChris Fraire } 123b5840353SAdam Hornáček 124b5840353SAdam Hornáček StringBuilder sb = new StringBuilder(); 125b5840353SAdam Hornáček int c; 126b5840353SAdam Hornáček while ((c = r.read()) != -1) { 127b5840353SAdam Hornáček sb.append((char) c); 128b5840353SAdam Hornáček } 129b5840353SAdam Hornáček 130b5840353SAdam Hornáček contents = sb.toString(); 131b5840353SAdam Hornáček } 132b5840353SAdam Hornáček } 133b5840353SAdam Hornáček 134b5840353SAdam Hornáček @Override 135b5840353SAdam Hornáček protected JFlexXref newXref(Reader reader) { 136b5840353SAdam Hornáček throw new UnsupportedOperationException("Not needed by test."); 137b5840353SAdam Hornáček } 138b5840353SAdam Hornáček } 139b5840353SAdam Hornáček } 140