1 /* 2 Copyright (c) 2001, Dr Martin Porter 3 Copyright (c) 2004,2005, Richard Boulton 4 Copyright (c) 2013, Yoshiki Shibukawa 5 Copyright (c) 2006,2007,2009,2010,2011,2014-2019, Olly Betts 6 All rights reserved. 7 8 Redistribution and use in source and binary forms, with or without 9 modification, are permitted provided that the following conditions 10 are met: 11 12 1. Redistributions of source code must retain the above copyright notice, 13 this list of conditions and the following disclaimer. 14 2. Redistributions in binary form must reproduce the above copyright notice, 15 this list of conditions and the following disclaimer in the documentation 16 and/or other materials provided with the distribution. 17 3. Neither the name of the Snowball project nor the names of its contributors 18 may be used to endorse or promote products derived from this software 19 without specific prior written permission. 20 21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 25 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 package org.tartarus.snowball; 34 35 import java.io.Serializable; 36 import java.lang.reflect.UndeclaredThrowableException; 37 38 /** Base class for a snowball stemmer */ 39 public class SnowballProgram implements Serializable { SnowballProgram()40 protected SnowballProgram() { 41 current = new char[8]; 42 setCurrent(""); 43 } 44 45 static final long serialVersionUID = 2016072500L; 46 47 /** Set the current string. */ setCurrent(String value)48 public void setCurrent(String value) { 49 current = value.toCharArray(); 50 cursor = 0; 51 limit = value.length(); 52 limit_backward = 0; 53 bra = cursor; 54 ket = limit; 55 } 56 57 /** Get the current string. */ getCurrent()58 public String getCurrent() { 59 return new String(current, 0, limit); 60 } 61 62 /** 63 * Set the current string. 64 * 65 * @param text character array containing input 66 * @param length valid length of text. 67 */ setCurrent(char[] text, int length)68 public void setCurrent(char[] text, int length) { 69 current = text; 70 cursor = 0; 71 limit = length; 72 limit_backward = 0; 73 bra = cursor; 74 ket = limit; 75 } 76 77 /** 78 * Get the current buffer containing the stem. 79 * 80 * <p>NOTE: this may be a reference to a different character array than the one originally 81 * provided with setCurrent, in the exceptional case that stemming produced a longer intermediate 82 * or result string. 83 * 84 * <p>It is necessary to use {@link #getCurrentBufferLength()} to determine the valid length of 85 * the returned buffer. For example, many words are stemmed simply by subtracting from the length 86 * to remove suffixes. 87 * 88 * @see #getCurrentBufferLength() 89 */ getCurrentBuffer()90 public char[] getCurrentBuffer() { 91 return current; 92 } 93 94 /** 95 * Get the valid length of the character array in {@link #getCurrentBuffer()}. 96 * 97 * @return valid length of the array. 98 */ getCurrentBufferLength()99 public int getCurrentBufferLength() { 100 return limit; 101 } 102 103 // current string 104 private char[] current; 105 106 protected int cursor; 107 protected int limit; 108 protected int limit_backward; 109 protected int bra; 110 protected int ket; 111 SnowballProgram(SnowballProgram other)112 public SnowballProgram(SnowballProgram other) { 113 current = other.current; 114 cursor = other.cursor; 115 limit = other.limit; 116 limit_backward = other.limit_backward; 117 bra = other.bra; 118 ket = other.ket; 119 } 120 copy_from(SnowballProgram other)121 protected void copy_from(SnowballProgram other) { 122 current = other.current; 123 cursor = other.cursor; 124 limit = other.limit; 125 limit_backward = other.limit_backward; 126 bra = other.bra; 127 ket = other.ket; 128 } 129 in_grouping(char[] s, int min, int max)130 protected boolean in_grouping(char[] s, int min, int max) { 131 if (cursor >= limit) return false; 132 char ch = current[cursor]; 133 if (ch > max || ch < min) return false; 134 ch -= min; 135 if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; 136 cursor++; 137 return true; 138 } 139 in_grouping_b(char[] s, int min, int max)140 protected boolean in_grouping_b(char[] s, int min, int max) { 141 if (cursor <= limit_backward) return false; 142 char ch = current[cursor - 1]; 143 if (ch > max || ch < min) return false; 144 ch -= min; 145 if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; 146 cursor--; 147 return true; 148 } 149 out_grouping(char[] s, int min, int max)150 protected boolean out_grouping(char[] s, int min, int max) { 151 if (cursor >= limit) return false; 152 char ch = current[cursor]; 153 if (ch > max || ch < min) { 154 cursor++; 155 return true; 156 } 157 ch -= min; 158 if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { 159 cursor++; 160 return true; 161 } 162 return false; 163 } 164 out_grouping_b(char[] s, int min, int max)165 protected boolean out_grouping_b(char[] s, int min, int max) { 166 if (cursor <= limit_backward) return false; 167 char ch = current[cursor - 1]; 168 if (ch > max || ch < min) { 169 cursor--; 170 return true; 171 } 172 ch -= min; 173 if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { 174 cursor--; 175 return true; 176 } 177 return false; 178 } 179 eq_s(CharSequence s)180 protected boolean eq_s(CharSequence s) { 181 if (limit - cursor < s.length()) return false; 182 int i; 183 for (i = 0; i != s.length(); i++) { 184 if (current[cursor + i] != s.charAt(i)) return false; 185 } 186 cursor += s.length(); 187 return true; 188 } 189 eq_s_b(CharSequence s)190 protected boolean eq_s_b(CharSequence s) { 191 if (cursor - limit_backward < s.length()) return false; 192 int i; 193 for (i = 0; i != s.length(); i++) { 194 if (current[cursor - s.length() + i] != s.charAt(i)) return false; 195 } 196 cursor -= s.length(); 197 return true; 198 } 199 find_among(Among v[])200 protected int find_among(Among v[]) { 201 int i = 0; 202 int j = v.length; 203 204 int c = cursor; 205 int l = limit; 206 207 int common_i = 0; 208 int common_j = 0; 209 210 boolean first_key_inspected = false; 211 212 while (true) { 213 int k = i + ((j - i) >> 1); 214 int diff = 0; 215 int common = common_i < common_j ? common_i : common_j; // smaller 216 Among w = v[k]; 217 int i2; 218 for (i2 = common; i2 < w.s.length; i2++) { 219 if (c + common == l) { 220 diff = -1; 221 break; 222 } 223 diff = current[c + common] - w.s[i2]; 224 if (diff != 0) break; 225 common++; 226 } 227 if (diff < 0) { 228 j = k; 229 common_j = common; 230 } else { 231 i = k; 232 common_i = common; 233 } 234 if (j - i <= 1) { 235 if (i > 0) break; // v->s has been inspected 236 if (j == i) break; // only one item in v 237 238 // - but now we need to go round once more to get 239 // v->s inspected. This looks messy, but is actually 240 // the optimal approach. 241 242 if (first_key_inspected) break; 243 first_key_inspected = true; 244 } 245 } 246 while (true) { 247 Among w = v[i]; 248 if (common_i >= w.s.length) { 249 cursor = c + w.s.length; 250 if (w.method == null) return w.result; 251 boolean res = false; 252 try { 253 res = (boolean) w.method.invokeExact(this); 254 } catch (Error | RuntimeException e) { 255 throw e; 256 } catch (Throwable e) { 257 throw new UndeclaredThrowableException(e); 258 } 259 cursor = c + w.s.length; 260 if (res) return w.result; 261 } 262 i = w.substring_i; 263 if (i < 0) return 0; 264 } 265 } 266 267 // find_among_b is for backwards processing. Same comments apply 268 protected int find_among_b(Among v[]) { 269 int i = 0; 270 int j = v.length; 271 272 int c = cursor; 273 int lb = limit_backward; 274 275 int common_i = 0; 276 int common_j = 0; 277 278 boolean first_key_inspected = false; 279 280 while (true) { 281 int k = i + ((j - i) >> 1); 282 int diff = 0; 283 int common = common_i < common_j ? common_i : common_j; 284 Among w = v[k]; 285 int i2; 286 for (i2 = w.s.length - 1 - common; i2 >= 0; i2--) { 287 if (c - common == lb) { 288 diff = -1; 289 break; 290 } 291 diff = current[c - 1 - common] - w.s[i2]; 292 if (diff != 0) break; 293 common++; 294 } 295 if (diff < 0) { 296 j = k; 297 common_j = common; 298 } else { 299 i = k; 300 common_i = common; 301 } 302 if (j - i <= 1) { 303 if (i > 0) break; 304 if (j == i) break; 305 if (first_key_inspected) break; 306 first_key_inspected = true; 307 } 308 } 309 while (true) { 310 Among w = v[i]; 311 if (common_i >= w.s.length) { 312 cursor = c - w.s.length; 313 if (w.method == null) return w.result; 314 315 boolean res = false; 316 try { 317 res = (boolean) w.method.invokeExact(this); 318 } catch (Error | RuntimeException e) { 319 throw e; 320 } catch (Throwable e) { 321 throw new UndeclaredThrowableException(e); 322 } 323 cursor = c - w.s.length; 324 if (res) return w.result; 325 } 326 i = w.substring_i; 327 if (i < 0) return 0; 328 } 329 } 330 331 // mini version of ArrayUtil.oversize from lucene, specialized to chars 332 static int oversize(int minTargetSize) { 333 int extra = minTargetSize >> 3; 334 if (extra < 3) { 335 extra = 3; 336 } 337 int newSize = minTargetSize + extra; 338 return (newSize + 3) & 0x7ffffffc; 339 } 340 341 /* to replace chars between c_bra and c_ket in current by the 342 * chars in s. 343 */ replace_s(int c_bra, int c_ket, CharSequence s)344 protected int replace_s(int c_bra, int c_ket, CharSequence s) { 345 final int adjustment = s.length() - (c_ket - c_bra); 346 final int newLength = limit + adjustment; 347 // resize if necessary 348 if (newLength > current.length) { 349 char[] newBuffer = new char[oversize(newLength)]; 350 System.arraycopy(current, 0, newBuffer, 0, limit); 351 current = newBuffer; 352 } 353 // if the substring being replaced is longer or shorter than the 354 // replacement, need to shift things around 355 if (adjustment != 0 && c_ket < limit) { 356 System.arraycopy(current, c_ket, current, c_bra + s.length(), limit - c_ket); 357 } 358 // insert the replacement text 359 // Note, faster is s.getChars(0, s.length(), current, c_bra); 360 // but would have to duplicate this method for both String and StringBuilder 361 for (int i = 0; i < s.length(); i++) current[c_bra + i] = s.charAt(i); 362 363 limit += adjustment; 364 if (cursor >= c_ket) cursor += adjustment; 365 else if (cursor > c_bra) cursor = c_bra; 366 return adjustment; 367 } 368 slice_check()369 protected void slice_check() { 370 if (bra < 0 || bra > ket || ket > limit) { 371 throw new IllegalArgumentException( 372 "faulty slice operation: bra=" + bra + ",ket=" + ket + ",limit=" + limit); 373 } 374 } 375 slice_from(CharSequence s)376 protected void slice_from(CharSequence s) { 377 slice_check(); 378 replace_s(bra, ket, s); 379 } 380 slice_del()381 protected void slice_del() { 382 slice_from(""); 383 } 384 insert(int c_bra, int c_ket, CharSequence s)385 protected void insert(int c_bra, int c_ket, CharSequence s) { 386 int adjustment = replace_s(c_bra, c_ket, s); 387 if (c_bra <= bra) bra += adjustment; 388 if (c_bra <= ket) ket += adjustment; 389 } 390 391 /* Copy the slice into the supplied StringBuilder */ slice_to(StringBuilder s)392 protected void slice_to(StringBuilder s) { 393 slice_check(); 394 int len = ket - bra; 395 s.setLength(0); 396 s.append(current, bra, len); 397 } 398 assign_to(StringBuilder s)399 protected void assign_to(StringBuilder s) { 400 s.setLength(0); 401 s.append(current, 0, limit); 402 } 403 404 /* 405 extern void debug(struct SN_env * z, int number, int line_count) 406 { int i; 407 int limit = SIZE(z->p); 408 //if (number >= 0) printf("%3d (line %4d): '", number, line_count); 409 if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit); 410 for (i = 0; i <= limit; i++) 411 { if (z->lb == i) printf("{"); 412 if (z->bra == i) printf("["); 413 if (z->c == i) printf("|"); 414 if (z->ket == i) printf("]"); 415 if (z->l == i) printf("}"); 416 if (i < limit) 417 { int ch = z->p[i]; 418 if (ch == 0) ch = '#'; 419 printf("%c", ch); 420 } 421 } 422 printf("'\n"); 423 } 424 */ 425 426 } 427 ; 428