1 /* 2 Egothor Software License version 1.00 3 Copyright (C) 1997-2004 Leo Galambos. 4 Copyright (C) 2002-2004 "Egothor developers" 5 on behalf of the Egothor Project. 6 All rights reserved. 7 8 This software is copyrighted by the "Egothor developers". If this 9 license applies to a single file or document, the "Egothor developers" 10 are the people or entities mentioned as copyright holders in that file 11 or document. If this license applies to the Egothor project as a 12 whole, the copyright holders are the people or entities mentioned in 13 the file CREDITS. This file can be found in the same location as this 14 license in the distribution. 15 16 Redistribution and use in source and binary forms, with or without 17 modification, are permitted provided that the following conditions are 18 met: 19 1. Redistributions of source code must retain the above copyright 20 notice, the list of contributors, this list of conditions, and the 21 following disclaimer. 22 2. Redistributions in binary form must reproduce the above copyright 23 notice, the list of contributors, this list of conditions, and the 24 disclaimer that follows these conditions in the documentation 25 and/or other materials provided with the distribution. 26 3. The name "Egothor" must not be used to endorse or promote products 27 derived from this software without prior written permission. For 28 written permission, please contact Leo.G@seznam.cz 29 4. Products derived from this software may not be called "Egothor", 30 nor may "Egothor" appear in their name, without prior written 31 permission from Leo.G@seznam.cz. 32 33 In addition, we request that you include in the end-user documentation 34 provided with the redistribution and/or in the software itself an 35 acknowledgement equivalent to the following: 36 "This product includes software developed by the Egothor Project. 37 http://egothor.sf.net/" 38 39 THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED 40 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 41 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 42 IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE 43 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 44 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 45 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 46 BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 47 WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 48 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 49 IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 50 51 This software consists of voluntary contributions made by many 52 individuals on behalf of the Egothor Project and was originally 53 created by Leo Galambos (Leo.G@seznam.cz). 54 */ 55 package org.egothor.stemmer; 56 57 import java.io.DataInput; 58 import java.io.DataOutput; 59 import java.io.IOException; 60 import java.util.ArrayList; 61 import java.util.List; 62 63 /** 64 * The MultiTrie is a Trie of Tries. 65 * 66 * <p>It stores words and their associated patch commands. The MultiTrie handles patch commands 67 * broken into their constituent parts, as a MultiTrie does, but the commands are delimited by the 68 * skip command. 69 */ 70 public class MultiTrie2 extends MultiTrie { 71 /** 72 * Constructor for the MultiTrie object. 73 * 74 * @param is the input stream 75 * @exception IOException if an I/O error occurs 76 */ MultiTrie2(DataInput is)77 public MultiTrie2(DataInput is) throws IOException { 78 super(is); 79 } 80 81 /** 82 * Constructor for the MultiTrie2 object 83 * 84 * @param forward set to <code>true</code> if the elements should be read left to right 85 */ MultiTrie2(boolean forward)86 public MultiTrie2(boolean forward) { 87 super(forward); 88 } 89 90 /** 91 * Return the element that is stored in a cell associated with the given key. 92 * 93 * @param key the key to the cell holding the desired element 94 * @return the element 95 */ 96 @Override getFully(CharSequence key)97 public CharSequence getFully(CharSequence key) { 98 StringBuilder result = new StringBuilder(tries.size() * 2); 99 try { 100 CharSequence lastkey = key; 101 CharSequence[] p = new CharSequence[tries.size()]; 102 char lastch = ' '; 103 for (int i = 0; i < tries.size(); i++) { 104 CharSequence r = tries.get(i).getFully(lastkey); 105 if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) { 106 return result; 107 } 108 if (cannotFollow(lastch, r.charAt(0))) { 109 return result; 110 } else { 111 lastch = r.charAt(r.length() - 2); 112 } 113 // key=key.substring(lengthPP(r)); 114 p[i] = r; 115 if (p[i].charAt(0) == '-') { 116 if (i > 0) { 117 key = skip(key, lengthPP(p[i - 1])); 118 } 119 key = skip(key, lengthPP(p[i])); 120 } 121 // key = skip(key, lengthPP(r)); 122 result.append(r); 123 if (key.length() != 0) { 124 lastkey = key; 125 } 126 } 127 } catch ( 128 @SuppressWarnings("unused") 129 IndexOutOfBoundsException x) { 130 } 131 return result; 132 } 133 134 /** 135 * Return the element that is stored as last on a path belonging to the given key. 136 * 137 * @param key the key associated with the desired element 138 * @return the element that is stored as last on a path 139 */ 140 @Override getLastOnPath(CharSequence key)141 public CharSequence getLastOnPath(CharSequence key) { 142 StringBuilder result = new StringBuilder(tries.size() * 2); 143 try { 144 CharSequence lastkey = key; 145 CharSequence[] p = new CharSequence[tries.size()]; 146 char lastch = ' '; 147 for (int i = 0; i < tries.size(); i++) { 148 CharSequence r = tries.get(i).getLastOnPath(lastkey); 149 if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) { 150 return result; 151 } 152 // System.err.println("LP:"+key+" last:"+lastch+" new:"+r); 153 if (cannotFollow(lastch, r.charAt(0))) { 154 return result; 155 } else { 156 lastch = r.charAt(r.length() - 2); 157 } 158 // key=key.substring(lengthPP(r)); 159 p[i] = r; 160 if (p[i].charAt(0) == '-') { 161 if (i > 0) { 162 key = skip(key, lengthPP(p[i - 1])); 163 } 164 key = skip(key, lengthPP(p[i])); 165 } 166 // key = skip(key, lengthPP(r)); 167 result.append(r); 168 if (key.length() != 0) { 169 lastkey = key; 170 } 171 } 172 } catch ( 173 @SuppressWarnings("unused") 174 IndexOutOfBoundsException x) { 175 } 176 return result; 177 } 178 179 /** 180 * Write this data structure to the given output stream. 181 * 182 * @param os the output stream 183 * @exception IOException if an I/O error occurs 184 */ 185 @Override store(DataOutput os)186 public void store(DataOutput os) throws IOException { 187 super.store(os); 188 } 189 190 /** 191 * Add an element to this structure consisting of the given key and patch command. 192 * 193 * <p>This method will return without executing if the <code>cmd</code> parameter's length is 0. 194 * 195 * @param key the key 196 * @param cmd the patch command 197 */ 198 @Override add(CharSequence key, CharSequence cmd)199 public void add(CharSequence key, CharSequence cmd) { 200 if (cmd.length() == 0) { 201 return; 202 } 203 // System.err.println( cmd ); 204 CharSequence[] p = decompose(cmd); 205 int levels = p.length; 206 // System.err.println("levels "+key+" cmd "+cmd+"|"+levels); 207 while (levels >= tries.size()) { 208 tries.add(new Trie(forward)); 209 } 210 CharSequence lastkey = key; 211 for (int i = 0; i < levels; i++) { 212 if (key.length() > 0) { 213 tries.get(i).add(key, p[i]); 214 lastkey = key; 215 } else { 216 tries.get(i).add(lastkey, p[i]); 217 } 218 // System.err.println("-"+key+" "+p[i]+"|"+key.length()); 219 /* 220 * key=key.substring(lengthPP(p[i])); 221 */ 222 if (p[i].length() > 0 && p[i].charAt(0) == '-') { 223 if (i > 0) { 224 key = skip(key, lengthPP(p[i - 1])); 225 } 226 key = skip(key, lengthPP(p[i])); 227 } 228 // System.err.println("--->"+key); 229 } 230 if (key.length() > 0) { 231 tries.get(levels).add(key, EOM_NODE); 232 } else { 233 tries.get(levels).add(lastkey, EOM_NODE); 234 } 235 } 236 237 /** 238 * Break the given patch command into its constituent pieces. The pieces are delimited by NOOP 239 * commands. 240 * 241 * @param cmd the patch command 242 * @return an array containing the pieces of the command 243 */ decompose(CharSequence cmd)244 public CharSequence[] decompose(CharSequence cmd) { 245 int parts = 0; 246 247 for (int i = 0; 0 <= i && i < cmd.length(); ) { 248 int next = dashEven(cmd, i); 249 if (i == next) { 250 parts++; 251 i = next + 2; 252 } else { 253 parts++; 254 i = next; 255 } 256 } 257 258 CharSequence[] part = new CharSequence[parts]; 259 int x = 0; 260 261 for (int i = 0; 0 <= i && i < cmd.length(); ) { 262 int next = dashEven(cmd, i); 263 if (i == next) { 264 part[x++] = cmd.subSequence(i, i + 2); 265 i = next + 2; 266 } else { 267 part[x++] = (next < 0) ? cmd.subSequence(i, cmd.length()) : cmd.subSequence(i, next); 268 i = next; 269 } 270 } 271 return part; 272 } 273 274 /** 275 * Remove empty rows from the given Trie and return the newly reduced Trie. 276 * 277 * @param by the Trie to reduce 278 * @return the newly reduced Trie 279 */ 280 @Override reduce(Reduce by)281 public Trie reduce(Reduce by) { 282 List<Trie> h = new ArrayList<>(); 283 for (Trie trie : tries) h.add(trie.reduce(by)); 284 285 MultiTrie2 m = new MultiTrie2(forward); 286 m.tries = h; 287 return m; 288 } 289 cannotFollow(char after, char goes)290 private boolean cannotFollow(char after, char goes) { 291 switch (after) { 292 case '-': 293 case 'D': 294 return after == goes; 295 } 296 return false; 297 } 298 skip(CharSequence in, int count)299 private CharSequence skip(CharSequence in, int count) { 300 if (forward) { 301 return in.subSequence(count, in.length()); 302 } else { 303 return in.subSequence(0, in.length() - count); 304 } 305 } 306 dashEven(CharSequence in, int from)307 private int dashEven(CharSequence in, int from) { 308 while (from < in.length()) { 309 if (in.charAt(from) == '-') { 310 return from; 311 } else { 312 from += 2; 313 } 314 } 315 return -1; 316 } 317 318 @SuppressWarnings("fallthrough") lengthPP(CharSequence cmd)319 private int lengthPP(CharSequence cmd) { 320 int len = 0; 321 for (int i = 0; i < cmd.length(); i++) { 322 switch (cmd.charAt(i++)) { 323 case '-': 324 case 'D': 325 len += cmd.charAt(i) - 'a' + 1; 326 break; 327 case 'R': 328 len++; /* intentional fallthrough */ 329 case 'I': 330 break; 331 } 332 } 333 return len; 334 } 335 } 336