1diff --git a/algorithms/armenian.sbl b/algorithms/armenian.sbl 2new file mode 100644 3index 0000000..3a9a926 4--- /dev/null 5+++ b/algorithms/armenian.sbl 6@@ -0,0 +1,301 @@ 7+stringescapes {} 8+ 9+stringdef a '{U+0561}' // 531 10+stringdef b '{U+0562}' // 532 11+stringdef g '{U+0563}' // 533 12+stringdef d '{U+0564}' // 534 13+stringdef ye '{U+0565}' // 535 14+stringdef z '{U+0566}' // 536 15+stringdef e '{U+0567}' // 537 16+stringdef y '{U+0568}' // 538 17+stringdef dt '{U+0569}' // 539 18+stringdef zh '{U+056A}' // 53A 19+stringdef i '{U+056B}' // 53B 20+stringdef l '{U+056C}' // 53C 21+stringdef kh '{U+056D}' // 53D 22+stringdef ts '{U+056E}' // 53E 23+stringdef k '{U+056F}' // 53F 24+stringdef h '{U+0570}' // 540 25+stringdef dz '{U+0571}' // 541 26+stringdef gh '{U+0572}' // 542 27+stringdef djch '{U+0573}' // 543 28+stringdef m '{U+0574}' // 544 29+stringdef j '{U+0575}' // 545 30+stringdef n '{U+0576}' // 546 31+stringdef sh '{U+0577}' // 547 32+stringdef vo '{U+0578}' // 548 33+stringdef ch '{U+0579}' // 549 34+stringdef p '{U+057A}' // 54A 35+stringdef dj '{U+057B}' // 54B 36+stringdef r '{U+057C}' // 54C 37+stringdef s '{U+057D}' // 54D 38+stringdef v '{U+057E}' // 54E 39+stringdef t '{U+057F}' // 54F 40+stringdef r' '{U+0580}' // 550 41+stringdef c '{U+0581}' // 551 42+stringdef u '{U+0582}' // 552 //vjun 43+stringdef bp '{U+0583}' // 553 44+stringdef q '{U+0584}' // 554 45+stringdef ev '{U+0587}' 46+stringdef o '{U+0585}' // 555 47+stringdef f '{U+0586}' // 556 48+ 49+routines ( mark_regions R2 50+ adjective 51+ verb 52+ noun 53+ ending 54+) 55+ 56+externals ( stem ) 57+ 58+integers ( pV p2 ) 59+ 60+groupings ( v ) 61+ 62+define v '{a}{e}{i}{o}{u}{ye}{vo}{y}' 63+ 64+define mark_regions as ( 65+ 66+ $pV = limit 67+ $p2 = limit 68+ do ( 69+ gopast v setmark pV gopast non-v 70+ gopast v gopast non-v setmark p2 71+ ) 72+) 73+ 74+backwardmode ( 75+ 76+ define R2 as $p2 <= cursor 77+ 78+ define adjective as ( 79+ [substring] among ( 80+ '{b}{a}{r'}' 81+ 
'{p}{ye}{s}' 82+ '{vo}{r'}{e}{n}' 83+ '{vo}{v}{i}{n}' 84+ '{a}{k}{i}' 85+ '{l}{a}{j}{n}' 86+ '{r'}{vo}{r'}{d}' 87+ '{ye}{r'}{vo}{r'}{d}' 88+ '{a}{k}{a}{n}' 89+ '{a}{l}{i}' 90+ '{k}{vo}{t}' 91+ '{ye}{k}{ye}{n}' 92+ '{vo}{r'}{a}{k}' 93+ '{ye}{gh}' 94+ '{v}{vo}{u}{n}' 95+ '{ye}{r'}{ye}{n}' 96+ '{a}{r'}{a}{n}' 97+ '{ye}{n}' 98+ '{a}{v}{ye}{t}' 99+ '{g}{i}{n}' 100+ '{i}{v}' 101+ '{a}{t}' 102+ '{i}{n}' 103+ 104+ (delete) 105+ ) 106+ ) 107+ 108+ define verb as ( 109+ [substring] among ( 110+ '{vo}{u}{m}' 111+ '{v}{vo}{u}{m}' 112+ '{a}{l}{vo}{u}' 113+ '{ye}{l}{vo}{u}' 114+ '{v}{ye}{l}' 115+ '{a}{n}{a}{l}' 116+ '{ye}{l}{vo}{u}{c}' 117+ '{a}{l}{vo}{u}{c}' 118+ '{y}{a}{l}' 119+ '{y}{ye}{l}' 120+ '{a}{l}{vo}{v}' 121+ '{ye}{l}{vo}{v}' 122+ '{a}{l}{i}{s}' 123+ '{ye}{l}{i}{s}' 124+ '{ye}{n}{a}{l}' 125+ '{a}{c}{n}{a}{l}' 126+ '{ye}{c}{n}{ye}{l}' 127+ '{c}{n}{ye}{l}' 128+ '{n}{ye}{l}' 129+ '{a}{t}{ye}{l}' 130+ '{vo}{t}{ye}{l}' 131+ '{k}{vo}{t}{ye}{l}' 132+ '{t}{ye}{l}' 133+ '{v}{a}{ts}' 134+ '{ye}{c}{v}{ye}{l}' 135+ '{a}{c}{v}{ye}{l}' 136+ '{ye}{c}{i}{r'}' 137+ '{a}{c}{i}{r'}' 138+ '{ye}{c}{i}{n}{q}' 139+ '{a}{c}{i}{n}{q}' 140+ '{v}{ye}{c}{i}{r'}' 141+ '{v}{ye}{c}{i}{n}{q}' 142+ '{v}{ye}{c}{i}{q}' 143+ '{v}{ye}{c}{i}{n}' 144+ '{a}{c}{r'}{i}{r'}' 145+ '{a}{c}{r'}{ye}{c}' 146+ '{a}{c}{r'}{i}{n}{q}' 147+ '{a}{c}{r'}{i}{q}' 148+ '{a}{c}{r'}{i}{n}' 149+ '{ye}{c}{i}{q}' 150+ '{a}{c}{i}{q}' 151+ '{ye}{c}{i}{n}' 152+ '{a}{c}{i}{n}' 153+ '{a}{c}{a}{r'}' 154+ '{a}{c}{a}{v}' 155+ '{a}{c}{a}{n}{q}' 156+ '{a}{c}{a}{q}' 157+ '{a}{c}{a}{n}' 158+ '{v}{ye}{c}{i}' 159+ '{a}{c}{r'}{i}' 160+ '{ye}{c}{a}{r'}' 161+ '{ye}{c}{a}{v}' 162+ '{c}{a}{n}{q}' 163+ '{c}{a}{q}' 164+ '{c}{a}{n}' 165+ '{a}{c}{a}' 166+ '{a}{c}{i}' 167+ '{ye}{c}{a}' 168+ '{ch}{ye}{l}' 169+ '{ye}{c}{i}' 170+ '{a}{r'}' 171+ '{a}{v}' 172+ '{a}{n}{q}' 173+ '{a}{q}' 174+ '{a}{n}' 175+ '{a}{l}' 176+ '{ye}{l}' 177+ '{ye}{c}' 178+ '{a}{c}' 179+ '{v}{ye}' 180+ '{a}' 181+ 182+ (delete) 183+ ) 184+ ) 185+ 186+ define noun as ( 187+ [substring] 
among ( 188+ '{a}{ts}{vo}' 189+ '{a}{n}{a}{k}' 190+ '{a}{n}{o}{c}' 191+ '{a}{r'}{a}{n}' 192+ '{a}{r'}{q}' 193+ '{p}{a}{n}' 194+ '{s}{t}{a}{n}' 195+ '{ye}{gh}{e}{n}' 196+ '{ye}{n}{q}' 197+ '{i}{k}' 198+ '{i}{ch}' 199+ '{i}{q}' 200+ '{m}{vo}{u}{n}{q}' 201+ '{j}{a}{k}' 202+ '{j}{vo}{u}{n}' 203+ '{vo}{n}{q}' 204+ '{vo}{r'}{d}' 205+ '{vo}{c}' 206+ '{ch}{ye}{q}' 207+ '{v}{a}{ts}{q}' 208+ '{v}{vo}{r'}' 209+ '{a}{v}{vo}{r'}' 210+ '{vo}{u}{dt}{j}{vo}{u}{n}' 211+ '{vo}{u}{k}' 212+ '{vo}{u}{h}{i}' 213+ '{vo}{u}{j}{dt}' 214+ '{vo}{u}{j}{q}' 215+ '{vo}{u}{s}{t}' 216+ '{vo}{u}{s}' 217+ '{c}{i}' 218+ '{a}{l}{i}{q}' 219+ '{a}{n}{i}{q}' 220+ '{i}{l}' 221+ '{i}{ch}{q}' 222+ '{vo}{u}{n}{q}' 223+ '{g}{a}{r'}' 224+ '{vo}{u}' 225+ '{a}{k}' 226+ '{a}{n}' 227+ '{q}' 228+ 229+ (delete) 230+ ) 231+ ) 232+ 233+ define ending as ( 234+ [substring] R2 among ( 235+ '{n}{ye}{r'}{y}' 236+ '{n}{ye}{r'}{n}' 237+ '{n}{ye}{r'}{i}' 238+ '{n}{ye}{r'}{d}' 239+ '{ye}{r'}{i}{c}' 240+ '{n}{ye}{r'}{i}{c}' 241+ '{ye}{r'}{i}' 242+ '{ye}{r'}{d}' 243+ '{ye}{r'}{n}' 244+ '{ye}{r'}{y}' 245+ '{n}{ye}{r'}{i}{n}' 246+ '{vo}{u}{dt}{j}{a}{n}{n}' 247+ '{vo}{u}{dt}{j}{a}{n}{y}' 248+ '{vo}{u}{dt}{j}{a}{n}{s}' 249+ '{vo}{u}{dt}{j}{a}{n}{d}' 250+ '{vo}{u}{dt}{j}{a}{n}' 251+ '{ye}{r'}{i}{n}' 252+ '{i}{n}' 253+ '{s}{a}' 254+ '{vo}{dj}' 255+ '{i}{c}' 256+ '{ye}{r'}{vo}{v}' 257+ '{n}{ye}{r'}{vo}{v}' 258+ '{ye}{r'}{vo}{u}{m}' 259+ '{n}{ye}{r'}{vo}{u}{m}' 260+ '{vo}{u}{n}' 261+ '{vo}{u}{d}' 262+ '{v}{a}{n}{s}' 263+ '{v}{a}{n}{y}' 264+ '{v}{a}{n}{d}' 265+ '{a}{n}{y}' 266+ '{a}{n}{d}' 267+ '{v}{a}{n}' 268+ '{vo}{dj}{y}' 269+ '{vo}{dj}{s}' 270+ '{vo}{dj}{d}' 271+ '{vo}{c}' 272+ '{vo}{u}{c}' 273+ '{vo}{dj}{i}{c}' 274+ '{c}{i}{c}' 275+ '{v}{i}{c}' 276+ '{v}{i}' 277+ '{v}{vo}{v}' 278+ '{vo}{v}' 279+ '{a}{n}{vo}{v}' 280+ '{a}{n}{vo}{u}{m}' 281+ '{v}{a}{n}{i}{c}' 282+ '{a}{m}{b}' 283+ '{a}{n}' 284+ '{n}{ye}{r'}' 285+ '{ye}{r'}' 286+ '{v}{a}' 287+ '{y}' 288+ '{n}' 289+ '{d}' 290+ '{c}' 291+ '{i}' 292+ 293+ (delete) 294+ ) 295+ ) 296+) 
297+ 298+define stem as ( 299+ 300+ do mark_regions 301+ backwards setlimit tomark pV for ( 302+ do ending 303+ do verb 304+ do adjective 305+ do noun 306+ ) 307+) 308diff --git a/algorithms/estonian.sbl b/algorithms/estonian.sbl 309new file mode 100644 310index 0000000..0cc2b60 311--- /dev/null 312+++ b/algorithms/estonian.sbl 313@@ -0,0 +1,258 @@ 314+/* Estonian stemmer version 1.3 315+ 316+Made by Linda Freienthal in January 2019. 317+ 318+*/ 319+ 320+routines ( 321+ mark_regions 322+ LONGV 323+ special_noun_endings 324+ case_ending 325+ emphasis 326+ plural_three_first_cases 327+ remove_double_kpt 328+ double 329+ undouble 330+ i_plural 331+ degrees 332+ substantive 333+ verb_exceptions 334+ verb 335+ nu 336+) 337+ 338+stringescapes {} 339+stringdef a" '{U+00E4}' //a-umlaut ä 340+stringdef o" '{U+00F6}' //o-umlaut ö 341+stringdef o' '{U+00F5}' //o with tilde õ 342+stringdef u" '{U+00FC}' //u-umlaut ü 343+stringdef s" '{U+0161}' //s with caron š 344+stringdef z" '{U+017E}' //z with caron ž 345+ 346+externals ( stem ) 347+booleans ( is_verb ) 348+integers ( p1 ) 349+groupings ( V1 RV KI GI) 350+ 351+define V1 'aeiou{o'}{a"}{o"}{u"}' 352+define RV 'aeiuo' 353+define KI 'kptgbdshf{s"}z{z"}' 354+define GI 'cjlmnqrvwxaeiou{o'}{a"}{o"}{u"}' 355+define mark_regions as ( 356+ 357+ $p1 = limit 358+ 359+ goto V1 gopast non-V1 setmark p1 360+) 361+ 362+ 363+backwardmode ( 364+ 365+ define emphasis as ( 366+ setlimit tomark p1 for ([substring]) 367+ test hop 4 //kingi -> kingi 368+ among( 369+ 'gi' ((GI and not LONGV) delete) //jooksemegi -> jookseme, bioloogi -> bioloogi 370+ 'ki' (KI delete) //kookki -> kook 371+ ) 372+ 373+ ) 374+ 375+ define verb as ( 376+ setlimit tomark p1 for ([substring]) 377+ among( 378+ 'nuksin' 'nuksime' 'nuksid' 'nuksite' (delete) //seleta-nuksite 379+ 'ksin' 'ksid' 'ksime' 'ksite' (delete) //personal conditional: rõõmusta-ksin 380+ 'mata' (delete) 381+ 'takse' 'dakse' (delete) //impersonal: laul-dakse, luba-takse 382+ 'taks' 'daks' (delete) 
//impersonal conditional: laul-daks, saade-taks 383+ 'akse' (<-'a') //impersonal: tulla-kse, süüa-kse, teha-kse, püüt-akse, leita-kse 384+ 'sime' (delete) //pl1pst: saat-sime 385+ 'site' (delete) //pl2pst: saat-site 386+ 'sin' (delete) //sg1pst: laul-sin, saat-sin 387+ 'me' (V1 delete) //pl1prs: laula-me, tule-me 388+ 'da' (V1 delete) //da-infinitive: luba-da 389+ 'n' (V1 delete) //sg1prs: kirjuta-n 390+ 'b' (V1 delete) //sg3prs: laula-b 391+ ) 392+ set is_verb 393+ ) 394+ 395+ define LONGV as 396+ among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}' '{u"}{u"}' '{o'}{o'}') 397+ 398+ define i_plural as ( 399+ setlimit tomark p1 for ([substring]) 400+ among( 401+ 'i' (RV) //raamatui -> raamatu, lapsikui -> lapsiku 402+ ) 403+ delete 404+ ) 405+ 406+ define special_noun_endings as ( 407+ setlimit tomark p1 for ([substring]) 408+ among( 409+ 'lasse' (<- 'lase') //teadlasse -> teadlase 410+ 'last' (<- 'lase') //teadlast -> teadlase 411+ 'lane' (<- 'lase') //teadlane -> teadlase 412+ 'lasi'(<- 'lase') //teadlasi -> teadlase 413+ 'misse' (<- 'mise') //tegemisse -> tegemise 414+ 'mist' (<- 'mise') //kasutamist -> kasutamise 415+ 'mine' (<- 'mise') //tegemine -> tegemise 416+ 'misi' (<- 'mise') //kasutamisi -> kasutamise 417+ 'lisse' (<- 'lise') //rohelisse -> rohelise 418+ 'list' (<- 'lise') //tavalist -> tavalise 419+ 'line' (<- 'lise') //roheline -> rohelise 420+ 'lisi' (<- 'lise') //tavalisi -> tavalise 421+ ) 422+ 423+ ) 424+ define case_ending as ( 425+ setlimit tomark p1 for ([substring]) 426+ among( 427+ 'sse' (RV or LONGV) //illative: saapa-sse, tegemisse -> tegemisse 428+ 'st' (RV or LONGV) //elative: saapa-st, rohelist -> rohelist 429+ 'le' (RV or LONGV) //allative: raamatu-le 430+ 'lt' (RV or LONGV) //ablative: raamatu-lt 431+ 'ga' (RV or LONGV) //comitative: õpetaja-ga 432+ 'ks' (RV or LONGV) //translative: õpetaja-ks 433+ 'ta' (RV or LONGV) //abessive and da-infinitive: õpetaja-ta and hüpa-ta 434+ 't' //partitive: raamatu-t and kapsas-t 435+ 's' (RV or
LONGV)//inessive and sg3pst: raamatu-s and sõiti-s 436+ 'l' (RV or LONGV) //adessive: raamatu-l and kapsa-l. 437+ ) 438+ delete 439+ ) 440+ 441+ 442+ define plural_three_first_cases as ( 443+ setlimit tomark p1 for ([substring]) 444+ among( 445+ 'ikkude' (<-'iku') //plural genitive: õnnelikkude -> õnneliku 446+ 'ikke' (<-'iku') //plural partitive: rahulikke -> rahuliku 447+ 'ike' (<-'iku') //plural genitive: ohtlike -> ohtliku 448+ 'sid' (not LONGV delete) //plural partitive and sg2pst and pl3pst: auto-sid and laul-sid (excludes plural nominative with words like gaasid, roosid) 449+ 'te' ((test hop 4 (('mis' <- 'e') or ('las' <- 'e') or ('lis' <- 'e') or (not 't' delete))) or (not 't' <-'t')) //plural genitive and pl2: ministri-te, olulis-te and saada-te, laula-te; also torte -> tort (if not in compound word) and kokkuvõtte -> kokkuvõte and roheliste -> rohelise, tegemiste -> tegemise, teadlaste -> teadlase 450+ 'de' ((RV or LONGV) delete) //plural genitive: lauda-de 451+ 'd' ((RV or LONGV) delete) //plural nominative: voodid -> voodi, rattaid -> rattai, lapsikuid -> lapsiku 452+ ) 453+ ) 454+ 455+ define double as ( 456+ test among('kk' 'tt' 'pp') 457+ ) 458+ 459+ define undouble as ( 460+ next [hop 1] delete 461+ ) 462+ 463+ define nu as ( 464+ setlimit tomark p1 for ([substring]) 465+ among( 466+ 'nu' //haka-nu(-te-ga) 467+ 'tu' //luba-tu(-d) 468+ 'du' //laul-du(-te-st) 469+ 'va' //laul-va(-te-le) 470+ ) 471+ delete 472+ ) 473+ 474+ define remove_double_kpt as (// undouble kpt consonant if 'C1C1V': mõtte(-le) -> mõte, hakka(-n) -> haka, haka(-nu-d) -> haka 475+ (V1) (double) 476+ and undouble 477+ ) 478+ 479+ define degrees as ( 480+ setlimit tomark p1 for ([substring]) 481+ among( 482+ 'mai' (RV delete) //heleda-mai(-le) 483+ 'ma' (delete) //tugeva-ma(-le) and ma-infinitive: sõit-ma 484+ 'm' (RV delete) //kauge-i-m, rõõmsa-m 485+ ) 486+ ) 487+ 488+ define substantive as ( 489+ do special_noun_endings 490+ do case_ending 491+ do plural_three_first_cases 492+ do
degrees 493+ do i_plural 494+ do nu 495+ ) 496+) 497+ 498+ 499+define verb_exceptions as ( 500+ [substring] atlimit 501+ among( 502+ 'joon' 'jood' 'joob' 'joote' 'joome' 'joovad' (<-'joo') 503+ 'j{o'}in' 'j{o'}id' 'j{o'}i' 'j{o'}ime' 'j{o'}ite' (<-'joo') 504+ 'joomata' 'juuakse' 'joodakse' 'juua' 'jooma' (<- 'joo') 505+ 'saan' 'saad' 'saab' 'saate' 'saame' 'saavad' (<-'saa') 506+ 'saaksin' 'saaksid' 'saaks' 'saaksite' 'saaksime' (<-'saa') 507+ 'sain' 'said' 'sai' 'saite' 'saime' (<-'saa') 508+ 'saamata' 'saadakse' 'saadi' 'saama' 'saada' (<-'saa') 509+ 'viin' 'viid' 'viib' 'viite' 'viime' 'viivad' (<-'viima') 510+ 'viiksin' 'viiksid' 'viiks' 'viiksite' 'viiksime' (<-'viima') 511+ 'viisin' 'viisite' 'viisime' (<-'viima') 512+ 'viimata' 'viiakse' 'viidi' 'viima' 'viia' (<-'viima') 513+ 'keen' 'keeb' 'keed' 'kees' 'keeme' 'keete' 'keevad' (<-'keesi') 514+ 'keeksin' 'keeks' 'keeksid' 'keeksime' 'keeksite' (<-'keesi') 515+ 'keemata' 'keema' 'keeta' 'keedakse' (<-'keesi') 516+ 'l{o"}{o"}n' 'l{o"}{o"}d' 'l{o"}{o"}b' 'l{o"}{o"}me' 'l{o"}{o"}te' 'l{o"}{o"}vad' (<-'l{o"}{o"}') 517+ 'l{o"}{o"}ksin' 'l{o"}{o"}ksid' 'l{o"}{o"}ks' 'l{o"}{o"}ksime' 'l{o"}{o"}ksite' (<-'l{o"}{o"}') 518+ 'l{o"}{o"}mata' 'l{u"}{u"}akse' 'l{o"}{o"}dakse' 'l{o"}{o"}di' 'l{o"}{o"}ma' 'l{u"}{u"}a' (<-'l{o"}{o"}') 519+ 'l{o'}in' 'l{o'}id' 'l{o'}i' 'l{o'}ime' 'l{o'}ite' (<-'l{o"}i') //looma-lõi, lööma-lõi 520+ 'loon' 'lood' 'loob' 'loome' 'loote' 'loovad' (<-'loo') 521+ 'looksin' 'looksid' 'looks' 'looksime' 'looksite' (<-'loo') 522+ 'loomata' 'luuakse' 'loodi' 'luua' 'looma' (<-'loo') 523+ 'k{a"}in' 'k{a"}ib' 'k{a"}id' 'k{a"}is' 'k{a"}ime' 'k{a"}ite' 'k{a"}ivad' (<-'k{a"}isi') 524+ 'k{a"}iksin' 'k{a"}iks' 'k{a"}iksid' 'k{a"}iksime' 'k{a"}iksite' (<-'k{a"}isi') 525+ 'k{a"}imata' 'k{a"}iakse' 'k{a"}idi' 'k{a"}ia' 'k{a"}ima' (<-'k{a"}isi') 526+ 's{o"}{o"}n' 's{o"}{o"}b' 's{o"}{o"}d' 's{o"}{o"}me' 's{o"}{o"}te' 's{o"}{o"}vad' (<-'s{o"}{o"}') 527+ 's{o"}{o"}ksin' 's{o"}{o"}ks' 's{o"}{o"}ksid' 's{o"}{o"}ksime' 
's{o"}{o"}ksite' (<-'s{o"}{o"}') 528+ 's{o'}in' 's{o'}i' 's{o'}id' 's{o'}ime' 's{o'}ite' (<-'s{o"}{o"}') 529+ 's{o"}{o"}mata' 's{u"}{u"}akse' 's{o"}{o"}dakse' 's{o"}{o"}di' 's{o"}{o"}ma' 's{u"}{u"}a' (<-'s{o"}{o"}') 530+ 'toon' 'tood' 'toob' 'toote' 'toome' 'toovad' (<-'too') 531+ 'tooksin' 'tooksid' 'tooks' 'tooksite' 'tooksime' (<-'too') 532+ 't{o'}in' 't{o'}id' 't{o'}i' 't{o'}ime' 't{o'}ite' (<-'too') 533+ 'toomata' 'tuuakse' 'toodi' 'tooma' 'tuua' (<-'too') 534+ 'v{o'}in' 'v{o'}id' 'v{o'}ib' 'v{o'}ime' 'v{o'}is' 'v{o'}ite' 'v{o'}ivad' (<-'v{o'}isi') 535+ 'v{o'}iksin' 'v{o'}iksid' 'v{o'}iks' 'v{o'}iksime' 'v{o'}iksite' (<-'v{o'}isi') 536+ 'v{o'}imata' 'v{o'}idakse' 'v{o'}idi' 'v{o'}ida' 'v{o'}ima' (<-'v{o'}isi') 537+ 'j{a"}{a"}n' 'j{a"}{a"}d' 'j{a"}{a"}b' 'j{a"}{a"}me' 'j{a"}{a"}te' 'j{a"}{a"}vad' (<-'j{a"}{a"}ma') 538+ 'j{a"}{a"}ksin' 'j{a"}{a"}ksid' 'j{a"}{a"}ks' 'j{a"}{a"}ksime' 'j{a"}{a"}ksite' (<-'j{a"}{a"}ma') 539+ 'j{a"}ime' 'j{a"}ite' 'j{a"}in' 'j{a"}id' 'j{a"}i' (<-'j{a"}{a"}ma') 540+ 'j{a"}{a"}mata' 'j{a"}{a"}dakse' 'j{a"}{a"}da' 'j{a"}{a"}ma' 'j{a"}{a"}di' (<-'j{a"}{a"}ma') 541+ 'm{u"}{u"}n' 'm{u"}{u"}d' 'm{u"}{u"}b' 'm{u"}{u"}s' 'm{u"}{u"}me' 'm{u"}{u"}te' 'm{u"}{u"}vad' (<-'m{u"}{u"}si') 542+ 'm{u"}{u"}ksin' 'm{u"}{u"}ksid' 'm{u"}{u"}ks' 'm{u"}{u"}ksime' 'm{u"}{u"}ksite' (<-'m{u"}{u"}si') 543+ 'm{u"}{u"}mata' 'm{u"}{u"}akse' 'm{u"}{u"}di' 'm{u"}{u"}a' 'm{u"}{u"}ma' (<-'m{u"}{u"}si') 544+ 'loeb' 'loen' 'loed' 'loeme' 'loete' 'loevad' (<- 'luge') 545+ 'loeks' 'loeksin' 'loeksid' 'loeksime' 'loeksite' (<- 'luge') 546+ 'p{o'}en' 'p{o'}eb' 'p{o'}ed' 'p{o'}eme' 'p{o'}ete' 'p{o'}evad' (<- 'p{o'}de') 547+ 'p{o'}eksin' 'p{o'}eks' 'p{o'}eksid' 'p{o'}eksime' 'p{o'}eksite' (<- 'p{o'}de') 548+ 'laon' 'laob' 'laod' 'laome' 'laote' 'laovad' (<- 'ladu') 549+ 'laoksin' 'laoks' 'laoksid' 'laoksime' 'laoksite' (<- 'ladu') 550+ 'teeksin' 'teeks' 'teeksid' 'teeksime' 'teeksite' (<- 'tegi') 551+ 'teen' 'teeb' 'teed' 'teeme' 'teete' 'teevad' (<- 'tegi') 552+ 'tegemata' 
'tehakse' 'tehti' 'tegema' 'teha' (<-'tegi') 553+ 'n{a"}en' 'n{a"}eb' 'n{a"}ed' 'n{a"}eme' 'n{a"}ete' 'n{a"}evad' (<-'n{a"}gi') 554+ 'n{a"}eksin' 'n{a"}eks' 'n{a"}eksid' 'n{a"}eksime' 'n{a"}eksite' (<-'n{a"}gi') 555+ 'n{a"}gemata' 'n{a"}hakse' 'n{a"}hti' 'n{a"}ha' 'n{a"}gema' (<-'n{a"}gi') 556+ ) 557+) 558+ 559+ 560+define stem as ( 561+ do mark_regions 562+ not verb_exceptions 563+ unset is_verb 564+ backwards ( 565+ do emphasis 566+ do verb 567+ try (not is_verb do substantive) 568+ do remove_double_kpt 569+ 570+ ) 571+) 572diff --git a/compiler/generator_java.c b/compiler/generator_java.c 573index 2958452..c254dbe 100644 574--- a/compiler/generator_java.c 575+++ b/compiler/generator_java.c 576@@ -272,7 +272,7 @@ static void generate_AE(struct generator * g, struct node * p) { 577 break; 578 case c_len: /* Same as size() for Java. */ 579 case c_size: 580- w(g, "current.length()"); 581+ w(g, "limit"); 582 break; 583 } 584 } 585@@ -941,9 +941,12 @@ static void generate_define(struct generator * g, struct node * p) { 586 * be required to allow the SnowballProgram base class to invoke them. 587 * FIXME: Is this avoidable? 
588 */ 589- if (q->type == t_routine && !q->used_in_among) { 590+ if (q->used_in_among) { 591+ g->S[0] = "public"; 592+ } else if (q->type == t_routine) { 593 g->S[0] = "private"; 594 } else { 595+ w(g, "~N~M@Override"); 596 g->S[0] = "public"; 597 } 598 g->V[0] = q; 599@@ -1140,6 +1143,7 @@ static void generate_class_begin(struct generator * g) { 600 w(g, " {~+~N" 601 "~N" 602 "~Mprivate static final long serialVersionUID = 1L;~N" 603+ "~Mprivate static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();~N" 604 "~N"); 605 } 606 607@@ -1186,7 +1190,7 @@ static void generate_among_table(struct generator * g, struct among * x) { 608 if (v->function != 0) { 609 w(g, ", \""); 610 write_varname(g, v->function); 611- w(g, "\", ~n.class"); 612+ w(g, "\", methodObject"); 613 } 614 w(g, ")~S0~N"); 615 v++; 616diff --git a/java/org/tartarus/snowball/Among.java b/java/org/tartarus/snowball/Among.java 617index 8261503..abb8685 100644 618--- a/java/org/tartarus/snowball/Among.java 619+++ b/java/org/tartarus/snowball/Among.java 620@@ -1,7 +1,13 @@ 621 package org.tartarus.snowball; 622 623-import java.lang.reflect.Method; 624+import java.lang.invoke.MethodHandle; 625+import java.lang.invoke.MethodHandles; 626+import java.lang.invoke.MethodType; 627+import java.util.Locale; 628 629+/** 630+ * Internal class used by Snowball stemmers 631+ */ 632 public class Among { 633 public Among (String s, int substring_i, int result) { 634 this.s = s.toCharArray(); 635@@ -11,19 +17,30 @@ public class Among { 636 } 637 638 public Among (String s, int substring_i, int result, String methodname, 639- Class<? 
extends SnowballProgram> programclass) { 640+ MethodHandles.Lookup methodobject) { 641 this.s = s.toCharArray(); 642 this.substring_i = substring_i; 643 this.result = result; 644- try { 645- this.method = programclass.getDeclaredMethod(methodname); 646- } catch (NoSuchMethodException e) { 647- throw new RuntimeException(e); 648- } 649+ final Class<? extends SnowballProgram> clazz = methodobject.lookupClass().asSubclass(SnowballProgram.class); 650+ if (methodname.length() > 0) { 651+ try { 652+ this.method = methodobject.findVirtual(clazz, methodname, MethodType.methodType(boolean.class)) 653+ .asType(MethodType.methodType(boolean.class, SnowballProgram.class)); 654+ } catch (NoSuchMethodException | IllegalAccessException e) { 655+ throw new RuntimeException(String.format(Locale.ENGLISH, 656+ "Snowball program '%s' is broken, cannot access method: boolean %s()", 657+ clazz.getSimpleName(), methodname 658+ ), e); 659+ } 660+ } else { 661+ this.method = null; 662+ } 663 } 664 665- public final char[] s; /* search string */ 666- public final int substring_i; /* index to longest matching substring */ 667- public final int result; /* result of the lookup */ 668- public final Method method; /* method to use if substring matches */ 669+ final char[] s; /* search string */ 670+ final int substring_i; /* index to longest matching substring */ 671+ final int result; /* result of the lookup */ 672+ 673+ // Make sure this is not accessible outside package for Java security reasons! 
674+ final MethodHandle method; /* method to use if substring matches */ 675 }; 676diff --git a/java/org/tartarus/snowball/SnowballProgram.java b/java/org/tartarus/snowball/SnowballProgram.java 677index 1b27b96..94f2d4b 100644 678--- a/java/org/tartarus/snowball/SnowballProgram.java 679+++ b/java/org/tartarus/snowball/SnowballProgram.java 680@@ -1,50 +1,84 @@ 681 682 package org.tartarus.snowball; 683-import java.lang.reflect.InvocationTargetException; 684+import java.lang.reflect.UndeclaredThrowableException; 685 import java.io.Serializable; 686 687+/** 688+ * Base class for a snowball stemmer 689+ */ 690 public class SnowballProgram implements Serializable { 691 protected SnowballProgram() 692 { 693- current = new StringBuilder(); 694- init(); 695+ current = new char[8]; 696+ setCurrent(""); 697 } 698 699 static final long serialVersionUID = 2016072500L; 700 701- private void init() { 702+ /** 703+ * Set the current string. 704+ */ 705+ public void setCurrent(String value) 706+ { 707+ current = value.toCharArray(); 708 cursor = 0; 709- limit = current.length(); 710+ limit = value.length(); 711 limit_backward = 0; 712 bra = cursor; 713 ket = limit; 714 } 715 716 /** 717- * Set the current string. 718+ * Get the current string. 719 */ 720- public void setCurrent(String value) 721+ public String getCurrent() 722 { 723- // Make a new StringBuilder. If we reuse the old one, and a user of 724- // the library keeps a reference to the buffer returned (for example, 725- // by converting it to a String in a way which doesn't force a copy), 726- // the buffer size will not decrease, and we will risk wasting a large 727- // amount of memory. 728- // Thanks to Wolfram Esser for spotting this problem. 729- current = new StringBuilder(value); 730- init(); 731+ return new String(current, 0, limit); 732 } 733 734 /** 735- * Get the current string. 736+ * Set the current string. 737+ * @param text character array containing input 738+ * @param length valid length of text. 
739 */ 740- public String getCurrent() 741- { 742- return current.toString(); 743+ public void setCurrent(char[] text, int length) { 744+ current = text; 745+ cursor = 0; 746+ limit = length; 747+ limit_backward = 0; 748+ bra = cursor; 749+ ket = limit; 750+ } 751+ 752+ /** 753+ * Get the current buffer containing the stem. 754+ * <p> 755+ * NOTE: this may be a reference to a different character array than the 756+ * one originally provided with setCurrent, in the exceptional case that 757+ * stemming produced a longer intermediate or result string. 758+ * </p> 759+ * <p> 760+ * It is necessary to use {@link #getCurrentBufferLength()} to determine 761+ * the valid length of the returned buffer. For example, many words are 762+ * stemmed simply by subtracting from the length to remove suffixes. 763+ * </p> 764+ * @see #getCurrentBufferLength() 765+ */ 766+ public char[] getCurrentBuffer() { 767+ return current; 768+ } 769+ 770+ /** 771+ * Get the valid length of the character array in 772+ * {@link #getCurrentBuffer()}. 773+ * @return valid length of the array. 
774+ */ 775+ public int getCurrentBufferLength() { 776+ return limit; 777 } 778 779 // current string 780- protected StringBuilder current; 781+ private char[] current; 782 783 protected int cursor; 784 protected int limit; 785@@ -74,7 +108,7 @@ public class SnowballProgram implements Serializable { 786 protected boolean in_grouping(char [] s, int min, int max) 787 { 788 if (cursor >= limit) return false; 789- char ch = current.charAt(cursor); 790+ char ch = current[cursor]; 791 if (ch > max || ch < min) return false; 792 ch -= min; 793 if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; 794@@ -85,7 +119,7 @@ public class SnowballProgram implements Serializable { 795 protected boolean in_grouping_b(char [] s, int min, int max) 796 { 797 if (cursor <= limit_backward) return false; 798- char ch = current.charAt(cursor - 1); 799+ char ch = current[cursor - 1]; 800 if (ch > max || ch < min) return false; 801 ch -= min; 802 if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; 803@@ -96,7 +130,7 @@ public class SnowballProgram implements Serializable { 804 protected boolean out_grouping(char [] s, int min, int max) 805 { 806 if (cursor >= limit) return false; 807- char ch = current.charAt(cursor); 808+ char ch = current[cursor]; 809 if (ch > max || ch < min) { 810 cursor++; 811 return true; 812@@ -112,7 +146,7 @@ public class SnowballProgram implements Serializable { 813 protected boolean out_grouping_b(char [] s, int min, int max) 814 { 815 if (cursor <= limit_backward) return false; 816- char ch = current.charAt(cursor - 1); 817+ char ch = current[cursor - 1]; 818 if (ch > max || ch < min) { 819 cursor--; 820 return true; 821@@ -130,7 +164,7 @@ public class SnowballProgram implements Serializable { 822 if (limit - cursor < s.length()) return false; 823 int i; 824 for (i = 0; i != s.length(); i++) { 825- if (current.charAt(cursor + i) != s.charAt(i)) return false; 826+ if (current[cursor + i] != s.charAt(i)) return false; 827 } 828 cursor += s.length(); 829 
return true; 830@@ -141,7 +175,7 @@ public class SnowballProgram implements Serializable { 831 if (cursor - limit_backward < s.length()) return false; 832 int i; 833 for (i = 0; i != s.length(); i++) { 834- if (current.charAt(cursor - s.length() + i) != s.charAt(i)) return false; 835+ if (current[cursor - s.length() + i] != s.charAt(i)) return false; 836 } 837 cursor -= s.length(); 838 return true; 839@@ -171,7 +205,7 @@ public class SnowballProgram implements Serializable { 840 diff = -1; 841 break; 842 } 843- diff = current.charAt(c + common) - w.s[i2]; 844+ diff = current[c + common] - w.s[i2]; 845 if (diff != 0) break; 846 common++; 847 } 848@@ -199,16 +233,13 @@ public class SnowballProgram implements Serializable { 849 if (common_i >= w.s.length) { 850 cursor = c + w.s.length; 851 if (w.method == null) return w.result; 852- boolean res; 853+ boolean res = false; 854 try { 855- Object resobj = w.method.invoke(this); 856- res = resobj.toString().equals("true"); 857- } catch (InvocationTargetException e) { 858- res = false; 859- // FIXME - debug message 860- } catch (IllegalAccessException e) { 861- res = false; 862- // FIXME - debug message 863+ res = (boolean) w.method.invokeExact(this); 864+ } catch (Error | RuntimeException e) { 865+ throw e; 866+ } catch (Throwable e) { 867+ throw new UndeclaredThrowableException(e); 868 } 869 cursor = c + w.s.length; 870 if (res) return w.result; 871@@ -243,7 +274,7 @@ public class SnowballProgram implements Serializable { 872 diff = -1; 873 break; 874 } 875- diff = current.charAt(c - 1 - common) - w.s[i2]; 876+ diff = current[c - 1 - common] - w.s[i2]; 877 if (diff != 0) break; 878 common++; 879 } 880@@ -267,16 +298,13 @@ public class SnowballProgram implements Serializable { 881 cursor = c - w.s.length; 882 if (w.method == null) return w.result; 883 884- boolean res; 885+ boolean res = false; 886 try { 887- Object resobj = w.method.invoke(this); 888- res = resobj.toString().equals("true"); 889- } catch 
(InvocationTargetException e) { 890- res = false; 891- // FIXME - debug message 892- } catch (IllegalAccessException e) { 893- res = false; 894- // FIXME - debug message 895+ res = (boolean) w.method.invokeExact(this); 896+ } catch (Error | RuntimeException e) { 897+ throw e; 898+ } catch (Throwable e) { 899+ throw new UndeclaredThrowableException(e); 900 } 901 cursor = c - w.s.length; 902 if (res) return w.result; 903@@ -286,13 +314,41 @@ public class SnowballProgram implements Serializable { 904 } 905 } 906 907+ // mini version of ArrayUtil.oversize from lucene, specialized to chars 908+ static int oversize(int minTargetSize) { 909+ int extra = minTargetSize >> 3; 910+ if (extra < 3) { 911+ extra = 3; 912+ } 913+ int newSize = minTargetSize + extra; 914+ return (newSize + 3) & 0x7ffffffc; 915+ } 916+ 917 /* to replace chars between c_bra and c_ket in current by the 918 * chars in s. 919 */ 920- protected int replace_s(int c_bra, int c_ket, String s) 921+ protected int replace_s(int c_bra, int c_ket, CharSequence s) 922 { 923- int adjustment = s.length() - (c_ket - c_bra); 924- current.replace(c_bra, c_ket, s); 925+ final int adjustment = s.length() - (c_ket - c_bra); 926+ final int newLength = limit + adjustment; 927+ //resize if necessary 928+ if (newLength > current.length) { 929+ char[] newBuffer = new char[oversize(newLength)]; 930+ System.arraycopy(current, 0, newBuffer, 0, limit); 931+ current = newBuffer; 932+ } 933+ // if the substring being replaced is longer or shorter than the 934+ // replacement, need to shift things around 935+ if (adjustment != 0 && c_ket < limit) { 936+ System.arraycopy(current, c_ket, current, c_bra + s.length(), 937+ limit - c_ket); 938+ } 939+ // insert the replacement text 940+ // Note, faster is s.getChars(0, s.length(), current, c_bra); 941+ // but would have to duplicate this method for both String and StringBuilder 942+ for (int i = 0; i < s.length(); i++) 943+ current[c_bra + i] = s.charAt(i); 944+ 945 limit += adjustment; 
946 if (cursor >= c_ket) cursor += adjustment; 947 else if (cursor > c_bra) cursor = c_bra; 948@@ -303,57 +359,43 @@ public class SnowballProgram implements Serializable { 949 { 950 if (bra < 0 || 951 bra > ket || 952- ket > limit || 953- limit > current.length()) // this line could be removed 954+ ket > limit) 955 { 956- System.err.println("faulty slice operation"); 957- // FIXME: report error somehow. 958- /* 959- fprintf(stderr, "faulty slice operation:\n"); 960- debug(z, -1, 0); 961- exit(1); 962- */ 963+ throw new IllegalArgumentException("faulty slice operation: bra=" + bra + ",ket=" + ket + ",limit=" + limit); 964 } 965 } 966 967- protected void slice_from(String s) 968+ protected void slice_from(CharSequence s) 969 { 970 slice_check(); 971 replace_s(bra, ket, s); 972 } 973 974- protected void slice_from(CharSequence s) 975- { 976- slice_from(s.toString()); 977- } 978- 979 protected void slice_del() 980 { 981 slice_from(""); 982 } 983 984- protected void insert(int c_bra, int c_ket, String s) 985+ protected void insert(int c_bra, int c_ket, CharSequence s) 986 { 987 int adjustment = replace_s(c_bra, c_ket, s); 988 if (c_bra <= bra) bra += adjustment; 989 if (c_bra <= ket) ket += adjustment; 990 } 991 992- protected void insert(int c_bra, int c_ket, CharSequence s) 993- { 994- insert(c_bra, c_ket, s.toString()); 995- } 996- 997 /* Copy the slice into the supplied StringBuilder */ 998 protected void slice_to(StringBuilder s) 999 { 1000 slice_check(); 1001- s.replace(0, s.length(), current.substring(bra, ket)); 1002+ int len = ket - bra; 1003+ s.setLength(0); 1004+ s.append(current, bra, len); 1005 } 1006 1007 protected void assign_to(StringBuilder s) 1008 { 1009- s.replace(0, s.length(), current.substring(0, limit)); 1010+ s.setLength(0); 1011+ s.append(current, 0, limit); 1012 } 1013 1014 /* 1015diff --git a/java/org/tartarus/snowball/SnowballStemmer.java b/java/org/tartarus/snowball/SnowballStemmer.java 1016index 73a81a9..f7772d3 100644 1017--- 
a/java/org/tartarus/snowball/SnowballStemmer.java 1018+++ b/java/org/tartarus/snowball/SnowballStemmer.java 1019@@ -1,6 +1,9 @@ 1020 1021 package org.tartarus.snowball; 1022 1023+/** 1024+ * Parent class of all snowball stemmers, which must implement <code>stem</code> 1025+ */ 1026 public abstract class SnowballStemmer extends SnowballProgram { 1027 public abstract boolean stem(); 1028 1029diff --git a/libstemmer/modules.txt b/libstemmer/modules.txt 1030index b8ec17a..d2c8e61 100644 1031--- a/libstemmer/modules.txt 1032+++ b/libstemmer/modules.txt 1033@@ -10,11 +10,13 @@ 1034 # the most commonly used encoding. 1035 1036 arabic UTF_8 arabic,ar,ara 1037+armenian UTF_8 armenian,hy,arm,hye 1038 basque UTF_8,ISO_8859_1 basque,eu,eus,baq 1039 catalan UTF_8,ISO_8859_1 catalan,ca,cat 1040 danish UTF_8,ISO_8859_1 danish,da,dan 1041 dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld 1042 english UTF_8,ISO_8859_1 english,en,eng 1043+estonian UTF_8 estonian,et,est 1044 finnish UTF_8,ISO_8859_1 finnish,fi,fin 1045 french UTF_8,ISO_8859_1 french,fr,fre,fra 1046 german UTF_8,ISO_8859_1 german,de,ger,deu 1047@@ -51,12 +53,12 @@ porter UTF_8,ISO_8859_1 porter english 1048 # algorithms are: 1049 # 1050 # german2 - This is a slight modification of the german stemmer. 1051-#german2 UTF_8,ISO_8859_1 german2 german 1052+german2 UTF_8,ISO_8859_1 german2 german 1053 # 1054 # kraaij_pohlmann - This is a different dutch stemmer. 1055-#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch 1056+kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch 1057 # 1058 # lovins - This is an english stemmer, but fairly outdated, and 1059 # only really applicable to a restricted type of input text 1060 # (keywords in academic publications). 1061-#lovins UTF_8,ISO_8859_1 lovins english 1062+lovins UTF_8,ISO_8859_1 lovins english 1063