1 /* 2 Egothor Software License version 1.00 3 Copyright (C) 1997-2004 Leo Galambos. 4 Copyright (C) 2002-2004 "Egothor developers" 5 on behalf of the Egothor Project. 6 All rights reserved. 7 8 This software is copyrighted by the "Egothor developers". If this 9 license applies to a single file or document, the "Egothor developers" 10 are the people or entities mentioned as copyright holders in that file 11 or document. If this license applies to the Egothor project as a 12 whole, the copyright holders are the people or entities mentioned in 13 the file CREDITS. This file can be found in the same location as this 14 license in the distribution. 15 16 Redistribution and use in source and binary forms, with or without 17 modification, are permitted provided that the following conditions are 18 met: 19 1. Redistributions of source code must retain the above copyright 20 notice, the list of contributors, this list of conditions, and the 21 following disclaimer. 22 2. Redistributions in binary form must reproduce the above copyright 23 notice, the list of contributors, this list of conditions, and the 24 disclaimer that follows these conditions in the documentation 25 and/or other materials provided with the distribution. 26 3. The name "Egothor" must not be used to endorse or promote products 27 derived from this software without prior written permission. For 28 written permission, please contact Leo.G@seznam.cz 29 4. Products derived from this software may not be called "Egothor", 30 nor may "Egothor" appear in their name, without prior written 31 permission from Leo.G@seznam.cz. 32 33 In addition, we request that you include in the end-user documentation 34 provided with the redistribution and/or in the software itself an 35 acknowledgement equivalent to the following: 36 "This product includes software developed by the Egothor Project. 37 http://egothor.sf.net/" 38 39 THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED 40 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 41 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 42 IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE 43 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 44 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 45 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 46 BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 47 WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 48 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 49 IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 50 51 This software consists of voluntary contributions made by many 52 individuals on behalf of the Egothor Project and was originally 53 created by Leo Galambos (Leo.G@seznam.cz). 54 */ 55 package org.egothor.stemmer; 56 57 import java.io.BufferedOutputStream; 58 import java.io.DataOutputStream; 59 import java.io.LineNumberReader; 60 import java.nio.charset.Charset; 61 import java.nio.file.Files; 62 import java.nio.file.Paths; 63 import java.util.Locale; 64 import java.util.StringTokenizer; 65 import org.apache.lucene.util.SuppressForbidden; 66 67 /** The Compile class is used to compile a stemmer table. */ 68 public class Compile { 69 70 static boolean backward; 71 static boolean multi; 72 static Trie trie; 73 74 /** no instantiation */ Compile()75 private Compile() {} 76 77 /** 78 * Entry point to the Compile application. 79 * 80 * <p>This program takes any number of arguments: the first is the name of the desired stemming 81 * algorithm to use (a list is available in the package description) , all of the rest should be 82 * the path or paths to a file or files containing a stemmer table to compile. 83 * 84 * @param args the command line arguments 85 */ 86 @SuppressForbidden(reason = "System.out required: command line tool") main(java.lang.String[] args)87 public static void main(java.lang.String[] args) throws Exception { 88 if (args.length < 1) { 89 return; 90 } 91 92 args[0].toUpperCase(Locale.ROOT); 93 94 backward = args[0].charAt(0) == '-'; 95 int qq = (backward) ? 1 : 0; 96 boolean storeorig = false; 97 98 if (args[0].charAt(qq) == '0') { 99 storeorig = true; 100 qq++; 101 } 102 103 multi = args[0].charAt(qq) == 'M'; 104 if (multi) { 105 qq++; 106 } 107 108 String charset = System.getProperty("egothor.stemmer.charset", "UTF-8"); 109 110 char[] optimizer = new char[args[0].length() - qq]; 111 for (int i = 0; i < optimizer.length; i++) { 112 optimizer[i] = args[0].charAt(qq + i); 113 } 114 115 for (int i = 1; i < args.length; i++) { 116 // System.out.println("[" + args[i] + "]"); 117 Diff diff = new Diff(); 118 119 allocTrie(); 120 121 System.out.println(args[i]); 122 try (LineNumberReader in = 123 new LineNumberReader( 124 Files.newBufferedReader(Paths.get(args[i]), Charset.forName(charset)))) { 125 for (String line = in.readLine(); line != null; line = in.readLine()) { 126 try { 127 line = line.toLowerCase(Locale.ROOT); 128 StringTokenizer st = new StringTokenizer(line); 129 String stem = st.nextToken(); 130 if (storeorig) { 131 trie.add(stem, "-a"); 132 } 133 while (st.hasMoreTokens()) { 134 String token = st.nextToken(); 135 if (token.equals(stem) == false) { 136 trie.add(token, diff.exec(token, stem)); 137 } 138 } 139 } catch ( 140 @SuppressWarnings("unused") 141 java.util.NoSuchElementException x) { 142 // no base token (stem) on a line 143 } 144 } 145 } 146 147 Optimizer o = new Optimizer(); 148 Optimizer2 o2 = new Optimizer2(); 149 Lift l = new Lift(true); 150 Lift e = new Lift(false); 151 Gener g = new Gener(); 152 153 for (int j = 0; j < optimizer.length; j++) { 154 String prefix; 155 switch (optimizer[j]) { 156 case 'G': 157 trie = trie.reduce(g); 158 prefix = "G: "; 159 break; 160 case 'L': 161 trie = trie.reduce(l); 162 prefix = "L: "; 163 break; 164 case 'E': 165 trie = trie.reduce(e); 166 prefix = "E: "; 167 break; 168 case '2': 169 trie = trie.reduce(o2); 170 prefix = "2: "; 171 break; 172 case '1': 173 trie = trie.reduce(o); 174 prefix = "1: "; 175 break; 176 default: 177 continue; 178 } 179 trie.printInfo(System.out, prefix + " "); 180 } 181 182 try (DataOutputStream os = 183 new DataOutputStream( 184 new BufferedOutputStream(Files.newOutputStream(Paths.get(args[i] + ".out"))))) { 185 os.writeUTF(args[0]); 186 trie.store(os); 187 } 188 } 189 } 190 allocTrie()191 static void allocTrie() { 192 if (multi) { 193 trie = new MultiTrie2(!backward); 194 } else { 195 trie = new Trie(!backward); 196 } 197 } 198 } 199