1 /* 2 Egothor Software License version 1.00 3 Copyright (C) 1997-2004 Leo Galambos. 4 Copyright (C) 2002-2004 "Egothor developers" 5 on behalf of the Egothor Project. 6 All rights reserved. 7 8 This software is copyrighted by the "Egothor developers". If this 9 license applies to a single file or document, the "Egothor developers" 10 are the people or entities mentioned as copyright holders in that file 11 or document. If this license applies to the Egothor project as a 12 whole, the copyright holders are the people or entities mentioned in 13 the file CREDITS. This file can be found in the same location as this 14 license in the distribution. 15 16 Redistribution and use in source and binary forms, with or without 17 modification, are permitted provided that the following conditions are 18 met: 19 1. Redistributions of source code must retain the above copyright 20 notice, the list of contributors, this list of conditions, and the 21 following disclaimer. 22 2. Redistributions in binary form must reproduce the above copyright 23 notice, the list of contributors, this list of conditions, and the 24 disclaimer that follows these conditions in the documentation 25 and/or other materials provided with the distribution. 26 3. The name "Egothor" must not be used to endorse or promote products 27 derived from this software without prior written permission. For 28 written permission, please contact Leo.G@seznam.cz 29 4. Products derived from this software may not be called "Egothor", 30 nor may "Egothor" appear in their name, without prior written 31 permission from Leo.G@seznam.cz. 32 33 In addition, we request that you include in the end-user documentation 34 provided with the redistribution and/or in the software itself an 35 acknowledgement equivalent to the following: 36 "This product includes software developed by the Egothor Project. 37 http://egothor.sf.net/" 38 39 THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED 40 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 41 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 42 IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE 43 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 44 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 45 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 46 BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 47 WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 48 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 49 IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 50 51 This software consists of voluntary contributions made by many 52 individuals on behalf of the Egothor Project and was originally 53 created by Leo Galambos (Leo.G@seznam.cz). 54 */ 55 package org.egothor.stemmer; 56 57 import java.io.DataInput; 58 import java.io.DataOutput; 59 import java.io.IOException; 60 import java.io.PrintStream; 61 import java.util.ArrayList; 62 import java.util.List; 63 64 /** 65 * A Trie is used to store a dictionary of words and their stems. 66 * 67 * <p>Actually, what is stored are words with their respective patch commands. A trie can be termed 68 * forward (keys read from left to right) or backward (keys read from right to left). This property 69 * will vary depending on the language for which a Trie is constructed. 70 */ 71 public class Trie { 72 List<Row> rows = new ArrayList<>(); 73 List<CharSequence> cmds = new ArrayList<>(); 74 int root; 75 76 boolean forward = false; 77 78 /** 79 * Constructor for the Trie object. 80 * 81 * @param is the input stream 82 * @exception IOException if an I/O error occurs 83 */ Trie(DataInput is)84 public Trie(DataInput is) throws IOException { 85 forward = is.readBoolean(); 86 root = is.readInt(); 87 for (int i = is.readInt(); i > 0; i--) { 88 cmds.add(is.readUTF()); 89 } 90 for (int i = is.readInt(); i > 0; i--) { 91 rows.add(new Row(is)); 92 } 93 } 94 95 /** 96 * Constructor for the Trie object. 97 * 98 * @param forward set to <code>true</code> 99 */ Trie(boolean forward)100 public Trie(boolean forward) { 101 rows.add(new Row()); 102 root = 0; 103 this.forward = forward; 104 } 105 106 /** 107 * Constructor for the Trie object. 108 * 109 * @param forward <code>true</code> if read left to right, <code>false</code> if read right to 110 * left 111 * @param root index of the row that is the root node 112 * @param cmds the patch commands to store 113 * @param rows a Vector of Vectors. Each inner Vector is a node of this Trie 114 */ Trie(boolean forward, int root, List<CharSequence> cmds, List<Row> rows)115 public Trie(boolean forward, int root, List<CharSequence> cmds, List<Row> rows) { 116 this.rows = rows; 117 this.cmds = cmds; 118 this.root = root; 119 this.forward = forward; 120 } 121 122 /** 123 * Gets the all attribute of the Trie object 124 * 125 * @param key Description of the Parameter 126 * @return The all value 127 */ getAll(CharSequence key)128 public CharSequence[] getAll(CharSequence key) { 129 int[] res = new int[key.length()]; 130 int resc = 0; 131 Row now = getRow(root); 132 int w; 133 StrEnum e = new StrEnum(key, forward); 134 boolean br = false; 135 136 for (int i = 0; i < key.length() - 1; i++) { 137 Character ch = e.next(); 138 w = now.getCmd(ch); 139 if (w >= 0) { 140 int n = w; 141 for (int j = 0; j < resc; j++) { 142 if (n == res[j]) { 143 n = -1; 144 break; 145 } 146 } 147 if (n >= 0) { 148 res[resc++] = n; 149 } 150 } 151 w = now.getRef(ch); 152 if (w >= 0) { 153 now = getRow(w); 154 } else { 155 br = true; 156 break; 157 } 158 } 159 if (br == false) { 160 w = now.getCmd(e.next()); 161 if (w >= 0) { 162 int n = w; 163 for (int j = 0; j < resc; j++) { 164 if (n == res[j]) { 165 n = -1; 166 break; 167 } 168 } 169 if (n >= 0) { 170 res[resc++] = n; 171 } 172 } 173 } 174 175 if (resc < 1) { 176 return null; 177 } 178 CharSequence[] R = new CharSequence[resc]; 179 for (int j = 0; j < resc; j++) { 180 R[j] = cmds.get(res[j]); 181 } 182 return R; 183 } 184 185 /** 186 * Return the number of cells in this Trie object. 187 * 188 * @return the number of cells 189 */ getCells()190 public int getCells() { 191 int size = 0; 192 for (Row row : rows) size += row.getCells(); 193 return size; 194 } 195 196 /** 197 * Gets the cellsPnt attribute of the Trie object 198 * 199 * @return The cellsPnt value 200 */ getCellsPnt()201 public int getCellsPnt() { 202 int size = 0; 203 for (Row row : rows) size += row.getCellsPnt(); 204 return size; 205 } 206 207 /** 208 * Gets the cellsVal attribute of the Trie object 209 * 210 * @return The cellsVal value 211 */ getCellsVal()212 public int getCellsVal() { 213 int size = 0; 214 for (Row row : rows) size += row.getCellsVal(); 215 return size; 216 } 217 218 /** 219 * Return the element that is stored in a cell associated with the given key. 220 * 221 * @param key the key 222 * @return the associated element 223 */ getFully(CharSequence key)224 public CharSequence getFully(CharSequence key) { 225 Row now = getRow(root); 226 int w; 227 Cell c; 228 int cmd = -1; 229 StrEnum e = new StrEnum(key, forward); 230 Character ch = null; 231 232 for (int i = 0; i < key.length(); ) { 233 ch = e.next(); 234 i++; 235 236 c = now.at(ch); 237 if (c == null) { 238 return null; 239 } 240 241 cmd = c.cmd; 242 243 for (int skip = c.skip; skip > 0; skip--) { 244 if (i < key.length()) { 245 e.next(); 246 } else { 247 return null; 248 } 249 i++; 250 } 251 252 w = now.getRef(ch); 253 if (w >= 0) { 254 now = getRow(w); 255 } else if (i < key.length()) { 256 return null; 257 } 258 } 259 return (cmd == -1) ? null : cmds.get(cmd); 260 } 261 262 /** 263 * Return the element that is stored as last on a path associated with the given key. 264 * 265 * @param key the key associated with the desired element 266 * @return the last on path element 267 */ getLastOnPath(CharSequence key)268 public CharSequence getLastOnPath(CharSequence key) { 269 Row now = getRow(root); 270 int w; 271 CharSequence last = null; 272 StrEnum e = new StrEnum(key, forward); 273 274 for (int i = 0; i < key.length() - 1; i++) { 275 Character ch = e.next(); 276 w = now.getCmd(ch); 277 if (w >= 0) { 278 last = cmds.get(w); 279 } 280 w = now.getRef(ch); 281 if (w >= 0) { 282 now = getRow(w); 283 } else { 284 return last; 285 } 286 } 287 w = now.getCmd(e.next()); 288 return (w >= 0) ? cmds.get(w) : last; 289 } 290 291 /** 292 * Return the Row at the given index. 293 * 294 * @param index the index containing the desired Row 295 * @return the Row 296 */ getRow(int index)297 private Row getRow(int index) { 298 if (index < 0 || index >= rows.size()) { 299 return null; 300 } 301 return rows.get(index); 302 } 303 304 /** 305 * Write this Trie to the given output stream. 306 * 307 * @param os the output stream 308 * @exception IOException if an I/O error occurs 309 */ store(DataOutput os)310 public void store(DataOutput os) throws IOException { 311 os.writeBoolean(forward); 312 os.writeInt(root); 313 os.writeInt(cmds.size()); 314 for (CharSequence cmd : cmds) os.writeUTF(cmd.toString()); 315 316 os.writeInt(rows.size()); 317 for (Row row : rows) row.store(os); 318 } 319 320 /** 321 * Add the given key associated with the given patch command. If either parameter is null this 322 * method will return without executing. 323 * 324 * @param key the key 325 * @param cmd the patch command 326 */ add(CharSequence key, CharSequence cmd)327 void add(CharSequence key, CharSequence cmd) { 328 if (key == null || cmd == null) { 329 return; 330 } 331 if (cmd.length() == 0) { 332 return; 333 } 334 int id_cmd = cmds.indexOf(cmd); 335 if (id_cmd == -1) { 336 id_cmd = cmds.size(); 337 cmds.add(cmd); 338 } 339 340 int node = root; 341 Row r = getRow(node); 342 343 StrEnum e = new StrEnum(key, forward); 344 345 for (int i = 0; i < e.length() - 1; i++) { 346 Character ch = e.next(); 347 node = r.getRef(ch); 348 if (node >= 0) { 349 r = getRow(node); 350 } else { 351 node = rows.size(); 352 Row n; 353 rows.add(n = new Row()); 354 r.setRef(ch, node); 355 r = n; 356 } 357 } 358 r.setCmd(e.next(), id_cmd); 359 } 360 361 /** 362 * Remove empty rows from the given Trie and return the newly reduced Trie. 363 * 364 * @param by the Trie to reduce 365 * @return the newly reduced Trie 366 */ reduce(Reduce by)367 public Trie reduce(Reduce by) { 368 return by.optimize(this); 369 } 370 371 /** writes debugging info to the printstream */ printInfo(PrintStream out, CharSequence prefix)372 public void printInfo(PrintStream out, CharSequence prefix) { 373 out.println( 374 prefix 375 + "nds " 376 + rows.size() 377 + " cmds " 378 + cmds.size() 379 + " cells " 380 + getCells() 381 + " valcells " 382 + getCellsVal() 383 + " pntcells " 384 + getCellsPnt()); 385 } 386 387 /** This class is part of the Egothor Project */ 388 class StrEnum { 389 CharSequence s; 390 int from; 391 int by; 392 393 /** 394 * Constructor for the StrEnum object 395 * 396 * @param s Description of the Parameter 397 * @param up Description of the Parameter 398 */ StrEnum(CharSequence s, boolean up)399 StrEnum(CharSequence s, boolean up) { 400 this.s = s; 401 if (up) { 402 from = 0; 403 by = 1; 404 } else { 405 from = s.length() - 1; 406 by = -1; 407 } 408 } 409 length()410 int length() { 411 return s.length(); 412 } 413 next()414 char next() { 415 char ch = s.charAt(from); 416 from += by; 417 return ch; 418 } 419 } 420 } 421