xref: /Lucene/lucene/analysis/stempel/src/java/org/egothor/stemmer/MultiTrie2.java (revision 8ac26737913d0c1555019e93bc6bf7db1ab9047e)
1 /*
2                    Egothor Software License version 1.00
3                    Copyright (C) 1997-2004 Leo Galambos.
4                 Copyright (C) 2002-2004 "Egothor developers"
5                      on behalf of the Egothor Project.
6                             All rights reserved.
7 
8   This  software  is  copyrighted  by  the "Egothor developers". If this
9   license applies to a single file or document, the "Egothor developers"
10   are the people or entities mentioned as copyright holders in that file
11   or  document.  If  this  license  applies  to the Egothor project as a
12   whole,  the  copyright holders are the people or entities mentioned in
13   the  file CREDITS. This file can be found in the same location as this
14   license in the distribution.
15 
16   Redistribution  and  use  in  source and binary forms, with or without
17   modification, are permitted provided that the following conditions are
18   met:
19    1. Redistributions  of  source  code  must retain the above copyright
20       notice, the list of contributors, this list of conditions, and the
21       following disclaimer.
22    2. Redistributions  in binary form must reproduce the above copyright
23       notice, the list of contributors, this list of conditions, and the
24       disclaimer  that  follows  these  conditions  in the documentation
25       and/or other materials provided with the distribution.
26    3. The name "Egothor" must not be used to endorse or promote products
27       derived  from  this software without prior written permission. For
28       written permission, please contact Leo.G@seznam.cz
29    4. Products  derived  from this software may not be called "Egothor",
30       nor  may  "Egothor"  appear  in  their name, without prior written
31       permission from Leo.G@seznam.cz.
32 
33   In addition, we request that you include in the end-user documentation
34   provided  with  the  redistribution  and/or  in the software itself an
35   acknowledgement equivalent to the following:
36   "This product includes software developed by the Egothor Project.
37    http://egothor.sf.net/"
38 
39   THIS  SOFTWARE  IS  PROVIDED  ``AS  IS''  AND ANY EXPRESSED OR IMPLIED
40   WARRANTIES,  INCLUDING,  BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
41   MERCHANTABILITY  AND  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
42   IN  NO  EVENT  SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
43   FOR   ANY   DIRECT,   INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
44   CONSEQUENTIAL  DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
45   SUBSTITUTE  GOODS  OR  SERVICES;  LOSS  OF  USE,  DATA, OR PROFITS; OR
46   BUSINESS  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
47   WHETHER  IN  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
48   OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
49   IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 
51   This  software  consists  of  voluntary  contributions  made  by  many
52   individuals  on  behalf  of  the  Egothor  Project  and was originally
53   created by Leo Galambos (Leo.G@seznam.cz).
54 */
55 package org.egothor.stemmer;
56 
57 import java.io.DataInput;
58 import java.io.DataOutput;
59 import java.io.IOException;
60 import java.util.ArrayList;
61 import java.util.List;
62 
63 /**
64  * The MultiTrie is a Trie of Tries.
65  *
66  * <p>It stores words and their associated patch commands. The MultiTrie handles patch commands
67  * broken into their constituent parts, as a MultiTrie does, but the commands are delimited by the
68  * skip command.
69  */
70 public class MultiTrie2 extends MultiTrie {
71   /**
72    * Constructor for the MultiTrie object.
73    *
74    * @param is the input stream
75    * @exception IOException if an I/O error occurs
76    */
MultiTrie2(DataInput is)77   public MultiTrie2(DataInput is) throws IOException {
78     super(is);
79   }
80 
81   /**
82    * Constructor for the MultiTrie2 object
83    *
84    * @param forward set to <code>true</code> if the elements should be read left to right
85    */
MultiTrie2(boolean forward)86   public MultiTrie2(boolean forward) {
87     super(forward);
88   }
89 
90   /**
91    * Return the element that is stored in a cell associated with the given key.
92    *
93    * @param key the key to the cell holding the desired element
94    * @return the element
95    */
96   @Override
getFully(CharSequence key)97   public CharSequence getFully(CharSequence key) {
98     StringBuilder result = new StringBuilder(tries.size() * 2);
99     try {
100       CharSequence lastkey = key;
101       CharSequence[] p = new CharSequence[tries.size()];
102       char lastch = ' ';
103       for (int i = 0; i < tries.size(); i++) {
104         CharSequence r = tries.get(i).getFully(lastkey);
105         if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) {
106           return result;
107         }
108         if (cannotFollow(lastch, r.charAt(0))) {
109           return result;
110         } else {
111           lastch = r.charAt(r.length() - 2);
112         }
113         // key=key.substring(lengthPP(r));
114         p[i] = r;
115         if (p[i].charAt(0) == '-') {
116           if (i > 0) {
117             key = skip(key, lengthPP(p[i - 1]));
118           }
119           key = skip(key, lengthPP(p[i]));
120         }
121         // key = skip(key, lengthPP(r));
122         result.append(r);
123         if (key.length() != 0) {
124           lastkey = key;
125         }
126       }
127     } catch (
128         @SuppressWarnings("unused")
129         IndexOutOfBoundsException x) {
130     }
131     return result;
132   }
133 
134   /**
135    * Return the element that is stored as last on a path belonging to the given key.
136    *
137    * @param key the key associated with the desired element
138    * @return the element that is stored as last on a path
139    */
140   @Override
getLastOnPath(CharSequence key)141   public CharSequence getLastOnPath(CharSequence key) {
142     StringBuilder result = new StringBuilder(tries.size() * 2);
143     try {
144       CharSequence lastkey = key;
145       CharSequence[] p = new CharSequence[tries.size()];
146       char lastch = ' ';
147       for (int i = 0; i < tries.size(); i++) {
148         CharSequence r = tries.get(i).getLastOnPath(lastkey);
149         if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) {
150           return result;
151         }
152         // System.err.println("LP:"+key+" last:"+lastch+" new:"+r);
153         if (cannotFollow(lastch, r.charAt(0))) {
154           return result;
155         } else {
156           lastch = r.charAt(r.length() - 2);
157         }
158         // key=key.substring(lengthPP(r));
159         p[i] = r;
160         if (p[i].charAt(0) == '-') {
161           if (i > 0) {
162             key = skip(key, lengthPP(p[i - 1]));
163           }
164           key = skip(key, lengthPP(p[i]));
165         }
166         // key = skip(key, lengthPP(r));
167         result.append(r);
168         if (key.length() != 0) {
169           lastkey = key;
170         }
171       }
172     } catch (
173         @SuppressWarnings("unused")
174         IndexOutOfBoundsException x) {
175     }
176     return result;
177   }
178 
179   /**
180    * Write this data structure to the given output stream.
181    *
182    * @param os the output stream
183    * @exception IOException if an I/O error occurs
184    */
185   @Override
store(DataOutput os)186   public void store(DataOutput os) throws IOException {
187     super.store(os);
188   }
189 
190   /**
191    * Add an element to this structure consisting of the given key and patch command.
192    *
193    * <p>This method will return without executing if the <code>cmd</code> parameter's length is 0.
194    *
195    * @param key the key
196    * @param cmd the patch command
197    */
198   @Override
add(CharSequence key, CharSequence cmd)199   public void add(CharSequence key, CharSequence cmd) {
200     if (cmd.length() == 0) {
201       return;
202     }
203     // System.err.println( cmd );
204     CharSequence[] p = decompose(cmd);
205     int levels = p.length;
206     // System.err.println("levels "+key+" cmd "+cmd+"|"+levels);
207     while (levels >= tries.size()) {
208       tries.add(new Trie(forward));
209     }
210     CharSequence lastkey = key;
211     for (int i = 0; i < levels; i++) {
212       if (key.length() > 0) {
213         tries.get(i).add(key, p[i]);
214         lastkey = key;
215       } else {
216         tries.get(i).add(lastkey, p[i]);
217       }
218       // System.err.println("-"+key+" "+p[i]+"|"+key.length());
219       /*
220        * key=key.substring(lengthPP(p[i]));
221        */
222       if (p[i].length() > 0 && p[i].charAt(0) == '-') {
223         if (i > 0) {
224           key = skip(key, lengthPP(p[i - 1]));
225         }
226         key = skip(key, lengthPP(p[i]));
227       }
228       // System.err.println("--->"+key);
229     }
230     if (key.length() > 0) {
231       tries.get(levels).add(key, EOM_NODE);
232     } else {
233       tries.get(levels).add(lastkey, EOM_NODE);
234     }
235   }
236 
237   /**
238    * Break the given patch command into its constituent pieces. The pieces are delimited by NOOP
239    * commands.
240    *
241    * @param cmd the patch command
242    * @return an array containing the pieces of the command
243    */
decompose(CharSequence cmd)244   public CharSequence[] decompose(CharSequence cmd) {
245     int parts = 0;
246 
247     for (int i = 0; 0 <= i && i < cmd.length(); ) {
248       int next = dashEven(cmd, i);
249       if (i == next) {
250         parts++;
251         i = next + 2;
252       } else {
253         parts++;
254         i = next;
255       }
256     }
257 
258     CharSequence[] part = new CharSequence[parts];
259     int x = 0;
260 
261     for (int i = 0; 0 <= i && i < cmd.length(); ) {
262       int next = dashEven(cmd, i);
263       if (i == next) {
264         part[x++] = cmd.subSequence(i, i + 2);
265         i = next + 2;
266       } else {
267         part[x++] = (next < 0) ? cmd.subSequence(i, cmd.length()) : cmd.subSequence(i, next);
268         i = next;
269       }
270     }
271     return part;
272   }
273 
274   /**
275    * Remove empty rows from the given Trie and return the newly reduced Trie.
276    *
277    * @param by the Trie to reduce
278    * @return the newly reduced Trie
279    */
280   @Override
reduce(Reduce by)281   public Trie reduce(Reduce by) {
282     List<Trie> h = new ArrayList<>();
283     for (Trie trie : tries) h.add(trie.reduce(by));
284 
285     MultiTrie2 m = new MultiTrie2(forward);
286     m.tries = h;
287     return m;
288   }
289 
cannotFollow(char after, char goes)290   private boolean cannotFollow(char after, char goes) {
291     switch (after) {
292       case '-':
293       case 'D':
294         return after == goes;
295     }
296     return false;
297   }
298 
skip(CharSequence in, int count)299   private CharSequence skip(CharSequence in, int count) {
300     if (forward) {
301       return in.subSequence(count, in.length());
302     } else {
303       return in.subSequence(0, in.length() - count);
304     }
305   }
306 
dashEven(CharSequence in, int from)307   private int dashEven(CharSequence in, int from) {
308     while (from < in.length()) {
309       if (in.charAt(from) == '-') {
310         return from;
311       } else {
312         from += 2;
313       }
314     }
315     return -1;
316   }
317 
318   @SuppressWarnings("fallthrough")
lengthPP(CharSequence cmd)319   private int lengthPP(CharSequence cmd) {
320     int len = 0;
321     for (int i = 0; i < cmd.length(); i++) {
322       switch (cmd.charAt(i++)) {
323         case '-':
324         case 'D':
325           len += cmd.charAt(i) - 'a' + 1;
326           break;
327         case 'R':
328           len++; /* intentional fallthrough */
329         case 'I':
330           break;
331       }
332     }
333     return len;
334   }
335 }
336