xref: /Lucene/lucene/analysis/common/src/java/org/tartarus/snowball/SnowballProgram.java (revision 8ac26737913d0c1555019e93bc6bf7db1ab9047e)
1 /*
2 Copyright (c) 2001, Dr Martin Porter
3 Copyright (c) 2004,2005, Richard Boulton
4 Copyright (c) 2013, Yoshiki Shibukawa
5 Copyright (c) 2006,2007,2009,2010,2011,2014-2019, Olly Betts
6 All rights reserved.
7 
8 Redistribution and use in source and binary forms, with or without
9 modification, are permitted provided that the following conditions
10 are met:
11 
12   1. Redistributions of source code must retain the above copyright notice,
13      this list of conditions and the following disclaimer.
14   2. Redistributions in binary form must reproduce the above copyright notice,
15      this list of conditions and the following disclaimer in the documentation
16      and/or other materials provided with the distribution.
17   3. Neither the name of the Snowball project nor the names of its contributors
18      may be used to endorse or promote products derived from this software
19      without specific prior written permission.
20 
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
25 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
28 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32 
33 package org.tartarus.snowball;
34 
35 import java.io.Serializable;
36 import java.lang.reflect.UndeclaredThrowableException;
37 
38 /** Base class for a snowball stemmer */
39 public class SnowballProgram implements Serializable {
SnowballProgram()40   protected SnowballProgram() {
41     current = new char[8];
42     setCurrent("");
43   }
44 
45   static final long serialVersionUID = 2016072500L;
46 
47   /** Set the current string. */
setCurrent(String value)48   public void setCurrent(String value) {
49     current = value.toCharArray();
50     cursor = 0;
51     limit = value.length();
52     limit_backward = 0;
53     bra = cursor;
54     ket = limit;
55   }
56 
57   /** Get the current string. */
getCurrent()58   public String getCurrent() {
59     return new String(current, 0, limit);
60   }
61 
62   /**
63    * Set the current string.
64    *
65    * @param text character array containing input
66    * @param length valid length of text.
67    */
setCurrent(char[] text, int length)68   public void setCurrent(char[] text, int length) {
69     current = text;
70     cursor = 0;
71     limit = length;
72     limit_backward = 0;
73     bra = cursor;
74     ket = limit;
75   }
76 
77   /**
78    * Get the current buffer containing the stem.
79    *
80    * <p>NOTE: this may be a reference to a different character array than the one originally
81    * provided with setCurrent, in the exceptional case that stemming produced a longer intermediate
82    * or result string.
83    *
84    * <p>It is necessary to use {@link #getCurrentBufferLength()} to determine the valid length of
85    * the returned buffer. For example, many words are stemmed simply by subtracting from the length
86    * to remove suffixes.
87    *
88    * @see #getCurrentBufferLength()
89    */
getCurrentBuffer()90   public char[] getCurrentBuffer() {
91     return current;
92   }
93 
94   /**
95    * Get the valid length of the character array in {@link #getCurrentBuffer()}.
96    *
97    * @return valid length of the array.
98    */
getCurrentBufferLength()99   public int getCurrentBufferLength() {
100     return limit;
101   }
102 
103   // current string
104   private char[] current;
105 
106   protected int cursor;
107   protected int limit;
108   protected int limit_backward;
109   protected int bra;
110   protected int ket;
111 
SnowballProgram(SnowballProgram other)112   public SnowballProgram(SnowballProgram other) {
113     current = other.current;
114     cursor = other.cursor;
115     limit = other.limit;
116     limit_backward = other.limit_backward;
117     bra = other.bra;
118     ket = other.ket;
119   }
120 
copy_from(SnowballProgram other)121   protected void copy_from(SnowballProgram other) {
122     current = other.current;
123     cursor = other.cursor;
124     limit = other.limit;
125     limit_backward = other.limit_backward;
126     bra = other.bra;
127     ket = other.ket;
128   }
129 
in_grouping(char[] s, int min, int max)130   protected boolean in_grouping(char[] s, int min, int max) {
131     if (cursor >= limit) return false;
132     char ch = current[cursor];
133     if (ch > max || ch < min) return false;
134     ch -= min;
135     if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
136     cursor++;
137     return true;
138   }
139 
in_grouping_b(char[] s, int min, int max)140   protected boolean in_grouping_b(char[] s, int min, int max) {
141     if (cursor <= limit_backward) return false;
142     char ch = current[cursor - 1];
143     if (ch > max || ch < min) return false;
144     ch -= min;
145     if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
146     cursor--;
147     return true;
148   }
149 
out_grouping(char[] s, int min, int max)150   protected boolean out_grouping(char[] s, int min, int max) {
151     if (cursor >= limit) return false;
152     char ch = current[cursor];
153     if (ch > max || ch < min) {
154       cursor++;
155       return true;
156     }
157     ch -= min;
158     if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
159       cursor++;
160       return true;
161     }
162     return false;
163   }
164 
out_grouping_b(char[] s, int min, int max)165   protected boolean out_grouping_b(char[] s, int min, int max) {
166     if (cursor <= limit_backward) return false;
167     char ch = current[cursor - 1];
168     if (ch > max || ch < min) {
169       cursor--;
170       return true;
171     }
172     ch -= min;
173     if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
174       cursor--;
175       return true;
176     }
177     return false;
178   }
179 
eq_s(CharSequence s)180   protected boolean eq_s(CharSequence s) {
181     if (limit - cursor < s.length()) return false;
182     int i;
183     for (i = 0; i != s.length(); i++) {
184       if (current[cursor + i] != s.charAt(i)) return false;
185     }
186     cursor += s.length();
187     return true;
188   }
189 
eq_s_b(CharSequence s)190   protected boolean eq_s_b(CharSequence s) {
191     if (cursor - limit_backward < s.length()) return false;
192     int i;
193     for (i = 0; i != s.length(); i++) {
194       if (current[cursor - s.length() + i] != s.charAt(i)) return false;
195     }
196     cursor -= s.length();
197     return true;
198   }
199 
find_among(Among v[])200   protected int find_among(Among v[]) {
201     int i = 0;
202     int j = v.length;
203 
204     int c = cursor;
205     int l = limit;
206 
207     int common_i = 0;
208     int common_j = 0;
209 
210     boolean first_key_inspected = false;
211 
212     while (true) {
213       int k = i + ((j - i) >> 1);
214       int diff = 0;
215       int common = common_i < common_j ? common_i : common_j; // smaller
216       Among w = v[k];
217       int i2;
218       for (i2 = common; i2 < w.s.length; i2++) {
219         if (c + common == l) {
220           diff = -1;
221           break;
222         }
223         diff = current[c + common] - w.s[i2];
224         if (diff != 0) break;
225         common++;
226       }
227       if (diff < 0) {
228         j = k;
229         common_j = common;
230       } else {
231         i = k;
232         common_i = common;
233       }
234       if (j - i <= 1) {
235         if (i > 0) break; // v->s has been inspected
236         if (j == i) break; // only one item in v
237 
238         // - but now we need to go round once more to get
239         // v->s inspected. This looks messy, but is actually
240         // the optimal approach.
241 
242         if (first_key_inspected) break;
243         first_key_inspected = true;
244       }
245     }
246     while (true) {
247       Among w = v[i];
248       if (common_i >= w.s.length) {
249         cursor = c + w.s.length;
250         if (w.method == null) return w.result;
251         boolean res = false;
252         try {
253           res = (boolean) w.method.invokeExact(this);
254         } catch (Error | RuntimeException e) {
255           throw e;
256         } catch (Throwable e) {
257           throw new UndeclaredThrowableException(e);
258         }
259         cursor = c + w.s.length;
260         if (res) return w.result;
261       }
262       i = w.substring_i;
263       if (i < 0) return 0;
264     }
265   }
266 
267   // find_among_b is for backwards processing. Same comments apply
268   protected int find_among_b(Among v[]) {
269     int i = 0;
270     int j = v.length;
271 
272     int c = cursor;
273     int lb = limit_backward;
274 
275     int common_i = 0;
276     int common_j = 0;
277 
278     boolean first_key_inspected = false;
279 
280     while (true) {
281       int k = i + ((j - i) >> 1);
282       int diff = 0;
283       int common = common_i < common_j ? common_i : common_j;
284       Among w = v[k];
285       int i2;
286       for (i2 = w.s.length - 1 - common; i2 >= 0; i2--) {
287         if (c - common == lb) {
288           diff = -1;
289           break;
290         }
291         diff = current[c - 1 - common] - w.s[i2];
292         if (diff != 0) break;
293         common++;
294       }
295       if (diff < 0) {
296         j = k;
297         common_j = common;
298       } else {
299         i = k;
300         common_i = common;
301       }
302       if (j - i <= 1) {
303         if (i > 0) break;
304         if (j == i) break;
305         if (first_key_inspected) break;
306         first_key_inspected = true;
307       }
308     }
309     while (true) {
310       Among w = v[i];
311       if (common_i >= w.s.length) {
312         cursor = c - w.s.length;
313         if (w.method == null) return w.result;
314 
315         boolean res = false;
316         try {
317           res = (boolean) w.method.invokeExact(this);
318         } catch (Error | RuntimeException e) {
319           throw e;
320         } catch (Throwable e) {
321           throw new UndeclaredThrowableException(e);
322         }
323         cursor = c - w.s.length;
324         if (res) return w.result;
325       }
326       i = w.substring_i;
327       if (i < 0) return 0;
328     }
329   }
330 
331   // mini version of ArrayUtil.oversize from lucene, specialized to chars
332   static int oversize(int minTargetSize) {
333     int extra = minTargetSize >> 3;
334     if (extra < 3) {
335       extra = 3;
336     }
337     int newSize = minTargetSize + extra;
338     return (newSize + 3) & 0x7ffffffc;
339   }
340 
341   /* to replace chars between c_bra and c_ket in current by the
342    * chars in s.
343    */
replace_s(int c_bra, int c_ket, CharSequence s)344   protected int replace_s(int c_bra, int c_ket, CharSequence s) {
345     final int adjustment = s.length() - (c_ket - c_bra);
346     final int newLength = limit + adjustment;
347     // resize if necessary
348     if (newLength > current.length) {
349       char[] newBuffer = new char[oversize(newLength)];
350       System.arraycopy(current, 0, newBuffer, 0, limit);
351       current = newBuffer;
352     }
353     // if the substring being replaced is longer or shorter than the
354     // replacement, need to shift things around
355     if (adjustment != 0 && c_ket < limit) {
356       System.arraycopy(current, c_ket, current, c_bra + s.length(), limit - c_ket);
357     }
358     // insert the replacement text
359     // Note, faster is s.getChars(0, s.length(), current, c_bra);
360     // but would have to duplicate this method for both String and StringBuilder
361     for (int i = 0; i < s.length(); i++) current[c_bra + i] = s.charAt(i);
362 
363     limit += adjustment;
364     if (cursor >= c_ket) cursor += adjustment;
365     else if (cursor > c_bra) cursor = c_bra;
366     return adjustment;
367   }
368 
slice_check()369   protected void slice_check() {
370     if (bra < 0 || bra > ket || ket > limit) {
371       throw new IllegalArgumentException(
372           "faulty slice operation: bra=" + bra + ",ket=" + ket + ",limit=" + limit);
373     }
374   }
375 
slice_from(CharSequence s)376   protected void slice_from(CharSequence s) {
377     slice_check();
378     replace_s(bra, ket, s);
379   }
380 
slice_del()381   protected void slice_del() {
382     slice_from("");
383   }
384 
insert(int c_bra, int c_ket, CharSequence s)385   protected void insert(int c_bra, int c_ket, CharSequence s) {
386     int adjustment = replace_s(c_bra, c_ket, s);
387     if (c_bra <= bra) bra += adjustment;
388     if (c_bra <= ket) ket += adjustment;
389   }
390 
391   /* Copy the slice into the supplied StringBuilder */
slice_to(StringBuilder s)392   protected void slice_to(StringBuilder s) {
393     slice_check();
394     int len = ket - bra;
395     s.setLength(0);
396     s.append(current, bra, len);
397   }
398 
assign_to(StringBuilder s)399   protected void assign_to(StringBuilder s) {
400     s.setLength(0);
401     s.append(current, 0, limit);
402   }
403 
404   /*
405   extern void debug(struct SN_env * z, int number, int line_count)
406   {   int i;
407       int limit = SIZE(z->p);
408       //if (number >= 0) printf("%3d (line %4d): '", number, line_count);
409       if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
410       for (i = 0; i <= limit; i++)
411       {   if (z->lb == i) printf("{");
412           if (z->bra == i) printf("[");
413           if (z->c == i) printf("|");
414           if (z->ket == i) printf("]");
415           if (z->l == i) printf("}");
416           if (i < limit)
417           {   int ch = z->p[i];
418               if (ch == 0) ch = '#';
419               printf("%c", ch);
420           }
421       }
422       printf("'\n");
423   }
424   */
425 
426 }
427 ;
428