xref: /Lucene/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java (revision 8ac26737913d0c1555019e93bc6bf7db1ab9047e)
1 /*
2                    Egothor Software License version 1.00
3                    Copyright (C) 1997-2004 Leo Galambos.
4                 Copyright (C) 2002-2004 "Egothor developers"
5                      on behalf of the Egothor Project.
6                             All rights reserved.
7 
8   This  software  is  copyrighted  by  the "Egothor developers". If this
9   license applies to a single file or document, the "Egothor developers"
10   are the people or entities mentioned as copyright holders in that file
11   or  document.  If  this  license  applies  to the Egothor project as a
12   whole,  the  copyright holders are the people or entities mentioned in
13   the  file CREDITS. This file can be found in the same location as this
14   license in the distribution.
15 
16   Redistribution  and  use  in  source and binary forms, with or without
17   modification, are permitted provided that the following conditions are
18   met:
19    1. Redistributions  of  source  code  must retain the above copyright
20       notice, the list of contributors, this list of conditions, and the
21       following disclaimer.
22    2. Redistributions  in binary form must reproduce the above copyright
23       notice, the list of contributors, this list of conditions, and the
24       disclaimer  that  follows  these  conditions  in the documentation
25       and/or other materials provided with the distribution.
26    3. The name "Egothor" must not be used to endorse or promote products
27       derived  from  this software without prior written permission. For
28       written permission, please contact Leo.G@seznam.cz
29    4. Products  derived  from this software may not be called "Egothor",
30       nor  may  "Egothor"  appear  in  their name, without prior written
31       permission from Leo.G@seznam.cz.
32 
33   In addition, we request that you include in the end-user documentation
34   provided  with  the  redistribution  and/or  in the software itself an
35   acknowledgement equivalent to the following:
36   "This product includes software developed by the Egothor Project.
37    http://egothor.sf.net/"
38 
39   THIS  SOFTWARE  IS  PROVIDED  ``AS  IS''  AND ANY EXPRESSED OR IMPLIED
40   WARRANTIES,  INCLUDING,  BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
41   MERCHANTABILITY  AND  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
42   IN  NO  EVENT  SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
43   FOR   ANY   DIRECT,   INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
44   CONSEQUENTIAL  DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
45   SUBSTITUTE  GOODS  OR  SERVICES;  LOSS  OF  USE,  DATA, OR PROFITS; OR
46   BUSINESS  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
47   WHETHER  IN  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
48   OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
49   IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 
51   This  software  consists  of  voluntary  contributions  made  by  many
52   individuals  on  behalf  of  the  Egothor  Project  and was originally
53   created by Leo Galambos (Leo.G@seznam.cz).
54 */
55 package org.egothor.stemmer;
56 
57 import java.io.DataInput;
58 import java.io.DataOutput;
59 import java.io.IOException;
60 import java.io.PrintStream;
61 import java.util.ArrayList;
62 import java.util.List;
63 
64 /**
65  * A Trie is used to store a dictionary of words and their stems.
66  *
67  * <p>Actually, what is stored are words with their respective patch commands. A trie can be termed
68  * forward (keys read from left to right) or backward (keys read from right to left). This property
69  * will vary depending on the language for which a Trie is constructed.
70  */
71 public class Trie {
72   List<Row> rows = new ArrayList<>();
73   List<CharSequence> cmds = new ArrayList<>();
74   int root;
75 
76   boolean forward = false;
77 
78   /**
79    * Constructor for the Trie object.
80    *
81    * @param is the input stream
82    * @exception IOException if an I/O error occurs
83    */
Trie(DataInput is)84   public Trie(DataInput is) throws IOException {
85     forward = is.readBoolean();
86     root = is.readInt();
87     for (int i = is.readInt(); i > 0; i--) {
88       cmds.add(is.readUTF());
89     }
90     for (int i = is.readInt(); i > 0; i--) {
91       rows.add(new Row(is));
92     }
93   }
94 
95   /**
96    * Constructor for the Trie object.
97    *
98    * @param forward set to <code>true</code>
99    */
Trie(boolean forward)100   public Trie(boolean forward) {
101     rows.add(new Row());
102     root = 0;
103     this.forward = forward;
104   }
105 
106   /**
107    * Constructor for the Trie object.
108    *
109    * @param forward <code>true</code> if read left to right, <code>false</code> if read right to
110    *     left
111    * @param root index of the row that is the root node
112    * @param cmds the patch commands to store
113    * @param rows a Vector of Vectors. Each inner Vector is a node of this Trie
114    */
Trie(boolean forward, int root, List<CharSequence> cmds, List<Row> rows)115   public Trie(boolean forward, int root, List<CharSequence> cmds, List<Row> rows) {
116     this.rows = rows;
117     this.cmds = cmds;
118     this.root = root;
119     this.forward = forward;
120   }
121 
122   /**
123    * Gets the all attribute of the Trie object
124    *
125    * @param key Description of the Parameter
126    * @return The all value
127    */
getAll(CharSequence key)128   public CharSequence[] getAll(CharSequence key) {
129     int[] res = new int[key.length()];
130     int resc = 0;
131     Row now = getRow(root);
132     int w;
133     StrEnum e = new StrEnum(key, forward);
134     boolean br = false;
135 
136     for (int i = 0; i < key.length() - 1; i++) {
137       Character ch = e.next();
138       w = now.getCmd(ch);
139       if (w >= 0) {
140         int n = w;
141         for (int j = 0; j < resc; j++) {
142           if (n == res[j]) {
143             n = -1;
144             break;
145           }
146         }
147         if (n >= 0) {
148           res[resc++] = n;
149         }
150       }
151       w = now.getRef(ch);
152       if (w >= 0) {
153         now = getRow(w);
154       } else {
155         br = true;
156         break;
157       }
158     }
159     if (br == false) {
160       w = now.getCmd(e.next());
161       if (w >= 0) {
162         int n = w;
163         for (int j = 0; j < resc; j++) {
164           if (n == res[j]) {
165             n = -1;
166             break;
167           }
168         }
169         if (n >= 0) {
170           res[resc++] = n;
171         }
172       }
173     }
174 
175     if (resc < 1) {
176       return null;
177     }
178     CharSequence[] R = new CharSequence[resc];
179     for (int j = 0; j < resc; j++) {
180       R[j] = cmds.get(res[j]);
181     }
182     return R;
183   }
184 
185   /**
186    * Return the number of cells in this Trie object.
187    *
188    * @return the number of cells
189    */
getCells()190   public int getCells() {
191     int size = 0;
192     for (Row row : rows) size += row.getCells();
193     return size;
194   }
195 
196   /**
197    * Gets the cellsPnt attribute of the Trie object
198    *
199    * @return The cellsPnt value
200    */
getCellsPnt()201   public int getCellsPnt() {
202     int size = 0;
203     for (Row row : rows) size += row.getCellsPnt();
204     return size;
205   }
206 
207   /**
208    * Gets the cellsVal attribute of the Trie object
209    *
210    * @return The cellsVal value
211    */
getCellsVal()212   public int getCellsVal() {
213     int size = 0;
214     for (Row row : rows) size += row.getCellsVal();
215     return size;
216   }
217 
218   /**
219    * Return the element that is stored in a cell associated with the given key.
220    *
221    * @param key the key
222    * @return the associated element
223    */
getFully(CharSequence key)224   public CharSequence getFully(CharSequence key) {
225     Row now = getRow(root);
226     int w;
227     Cell c;
228     int cmd = -1;
229     StrEnum e = new StrEnum(key, forward);
230     Character ch = null;
231 
232     for (int i = 0; i < key.length(); ) {
233       ch = e.next();
234       i++;
235 
236       c = now.at(ch);
237       if (c == null) {
238         return null;
239       }
240 
241       cmd = c.cmd;
242 
243       for (int skip = c.skip; skip > 0; skip--) {
244         if (i < key.length()) {
245           e.next();
246         } else {
247           return null;
248         }
249         i++;
250       }
251 
252       w = now.getRef(ch);
253       if (w >= 0) {
254         now = getRow(w);
255       } else if (i < key.length()) {
256         return null;
257       }
258     }
259     return (cmd == -1) ? null : cmds.get(cmd);
260   }
261 
262   /**
263    * Return the element that is stored as last on a path associated with the given key.
264    *
265    * @param key the key associated with the desired element
266    * @return the last on path element
267    */
getLastOnPath(CharSequence key)268   public CharSequence getLastOnPath(CharSequence key) {
269     Row now = getRow(root);
270     int w;
271     CharSequence last = null;
272     StrEnum e = new StrEnum(key, forward);
273 
274     for (int i = 0; i < key.length() - 1; i++) {
275       Character ch = e.next();
276       w = now.getCmd(ch);
277       if (w >= 0) {
278         last = cmds.get(w);
279       }
280       w = now.getRef(ch);
281       if (w >= 0) {
282         now = getRow(w);
283       } else {
284         return last;
285       }
286     }
287     w = now.getCmd(e.next());
288     return (w >= 0) ? cmds.get(w) : last;
289   }
290 
291   /**
292    * Return the Row at the given index.
293    *
294    * @param index the index containing the desired Row
295    * @return the Row
296    */
getRow(int index)297   private Row getRow(int index) {
298     if (index < 0 || index >= rows.size()) {
299       return null;
300     }
301     return rows.get(index);
302   }
303 
304   /**
305    * Write this Trie to the given output stream.
306    *
307    * @param os the output stream
308    * @exception IOException if an I/O error occurs
309    */
store(DataOutput os)310   public void store(DataOutput os) throws IOException {
311     os.writeBoolean(forward);
312     os.writeInt(root);
313     os.writeInt(cmds.size());
314     for (CharSequence cmd : cmds) os.writeUTF(cmd.toString());
315 
316     os.writeInt(rows.size());
317     for (Row row : rows) row.store(os);
318   }
319 
320   /**
321    * Add the given key associated with the given patch command. If either parameter is null this
322    * method will return without executing.
323    *
324    * @param key the key
325    * @param cmd the patch command
326    */
add(CharSequence key, CharSequence cmd)327   void add(CharSequence key, CharSequence cmd) {
328     if (key == null || cmd == null) {
329       return;
330     }
331     if (cmd.length() == 0) {
332       return;
333     }
334     int id_cmd = cmds.indexOf(cmd);
335     if (id_cmd == -1) {
336       id_cmd = cmds.size();
337       cmds.add(cmd);
338     }
339 
340     int node = root;
341     Row r = getRow(node);
342 
343     StrEnum e = new StrEnum(key, forward);
344 
345     for (int i = 0; i < e.length() - 1; i++) {
346       Character ch = e.next();
347       node = r.getRef(ch);
348       if (node >= 0) {
349         r = getRow(node);
350       } else {
351         node = rows.size();
352         Row n;
353         rows.add(n = new Row());
354         r.setRef(ch, node);
355         r = n;
356       }
357     }
358     r.setCmd(e.next(), id_cmd);
359   }
360 
361   /**
362    * Remove empty rows from the given Trie and return the newly reduced Trie.
363    *
364    * @param by the Trie to reduce
365    * @return the newly reduced Trie
366    */
reduce(Reduce by)367   public Trie reduce(Reduce by) {
368     return by.optimize(this);
369   }
370 
371   /** writes debugging info to the printstream */
printInfo(PrintStream out, CharSequence prefix)372   public void printInfo(PrintStream out, CharSequence prefix) {
373     out.println(
374         prefix
375             + "nds "
376             + rows.size()
377             + " cmds "
378             + cmds.size()
379             + " cells "
380             + getCells()
381             + " valcells "
382             + getCellsVal()
383             + " pntcells "
384             + getCellsPnt());
385   }
386 
387   /** This class is part of the Egothor Project */
388   class StrEnum {
389     CharSequence s;
390     int from;
391     int by;
392 
393     /**
394      * Constructor for the StrEnum object
395      *
396      * @param s Description of the Parameter
397      * @param up Description of the Parameter
398      */
StrEnum(CharSequence s, boolean up)399     StrEnum(CharSequence s, boolean up) {
400       this.s = s;
401       if (up) {
402         from = 0;
403         by = 1;
404       } else {
405         from = s.length() - 1;
406         by = -1;
407       }
408     }
409 
length()410     int length() {
411       return s.length();
412     }
413 
next()414     char next() {
415       char ch = s.charAt(from);
416       from += by;
417       return ch;
418     }
419   }
420 }
421