xref: /Lucene/lucene/analysis/icu/src/data/uax29/Default.rbbi (revision 2ea416ee3d2a6fd6d2a74701b79231be2ebb0b71)
1#
2# Licensed to the Apache Software Foundation (ASF) under one or more
3# contributor license agreements.  See the NOTICE file distributed with
4# this work for additional information regarding copyright ownership.
5# The ASF licenses this file to You under the Apache License, Version 2.0
6# (the "License"); you may not use this file except in compliance with
7# the License.  You may obtain a copy of the License at
8#
9#     http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# This file is from ICU (with some small modifications, to avoid CJK dictionary break,
18# and status code change related to that)
19#
20# Copyright (C) 2016 and later: Unicode, Inc. and others.
21# License & terms of use: http://www.unicode.org/copyright.html
22# Copyright (C) 2002-2016, International Business Machines Corporation
23# and others. All Rights Reserved.
24#
25# file:  word.txt
26#
27# ICU Word Break Rules
28#      See Unicode Standard Annex #29.
29#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
30#      with additions for Emoji Sequences from https://goo.gl/cluFCn
31#      Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
32#
33# Note:  Updates to word.txt will usually need to be merged into
34#        word_POSIX.txt also.
35
36##############################################################################
37#
38#  Character class definitions from TR 29
39#
40##############################################################################
41
42!!chain;                   # enable rule chaining; a leading ^ on a rule opts out (see rules 15 - 17)
43!!quoted_literals_only;    # per the ICU RBBI rule syntax: literal characters in rules must be quoted
44
45
46#
47#  Character Class Definitions.
48#      Most sets below are the Unicode Word_Break property value of the same name.
49#      $Numeric, $Extended_Pict, $Han and $Hiragana are built from other properties.
50$CR                 = [\p{Word_Break = CR}];
51$LF                 = [\p{Word_Break = LF}];
52$Newline            = [\p{Word_Break = Newline} ];
53$Extend             = [\p{Word_Break = Extend}];
54$ZWJ                = [\p{Word_Break = ZWJ}];
55$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
56$Format             = [\p{Word_Break = Format}];
57$Katakana           = [\p{Word_Break = Katakana}];
58$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
59$ALetter            = [\p{Word_Break = ALetter}];
60$Single_Quote       = [\p{Word_Break = Single_Quote}];
61$Double_Quote       = [\p{Word_Break = Double_Quote}];
62$MidNumLet          = [\p{Word_Break = MidNumLet}];
63$MidLetter          = [\p{Word_Break = MidLetter}];
64$MidNum             = [\p{Word_Break = MidNum}];
65$Numeric            = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
66#   ^ $Numeric also takes in decimal digits whose Decomposition_Type is Wide (the [..&..] intersection).
67$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
68$WSegSpace          = [\p{Word_Break = WSegSpace}];
69$Extended_Pict      = [:ExtPict:];
70#   $Han and $Hiragana feed the $KanaKanji and $dictionaryCJK sets defined further down.
71$Han                = [:Han:];
72$Hiragana           = [:Hiragana:];
73
74
75#   Dictionary character set, for triggering language-based break engines. Currently
76#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
77#   5.0 or later as the definition of Complex_Context was corrected to include all
78#   characters requiring dictionary break.
79#   In the rules shown in this file, $dictionaryCJK is only subtracted from $ALetter (see $ALetterPlus).
80$Control        = [\p{Grapheme_Cluster_Break = Control}];
81$HangulSyllable = [\uac00-\ud7a3];
82$ComplexContext = [:LineBreak = Complex_Context:];
83$KanaKanji      = [$Han $Hiragana $Katakana];    # NOTE(review): not referenced by any rule shown in this file
84$dictionaryCJK  = [$Han $Hiragana $HangulSyllable];
85$dictionary     = [$ComplexContext];
86
87# leave CJK scripts out of ALetterPlus: (ALetter minus $dictionaryCJK) plus (Complex_Context minus Extend and Control)
88$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
89
90
91#
92#  Rules 4    Ignore Format and Extend characters,
93#             except when they appear at the beginning of a region of text.
94#             Each ...Ex variable is its base class followed by zero or more Extend, Format or ZWJ characters.
95# TODO: check if handling of katakana in dictionary makes rules incorrect/void
96$KatakanaEx           = $Katakana           ($Extend |  $Format | $ZWJ)*;
97$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format | $ZWJ)*;
98$ALetterEx            = $ALetterPlus        ($Extend |  $Format | $ZWJ)*;    # built on $ALetterPlus, not plain $ALetter
99$Single_QuoteEx       = $Single_Quote       ($Extend |  $Format | $ZWJ)*;
100$Double_QuoteEx       = $Double_Quote       ($Extend |  $Format | $ZWJ)*;
101$MidNumLetEx          = $MidNumLet          ($Extend |  $Format | $ZWJ)*;
102$MidLetterEx          = $MidLetter          ($Extend |  $Format | $ZWJ)*;
103$MidNumEx             = $MidNum             ($Extend |  $Format | $ZWJ)*;
104$NumericEx            = $Numeric            ($Extend |  $Format | $ZWJ)*;
105$ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format | $ZWJ)*;
106$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format | $ZWJ)*;
107#  Hiragana / Ideographic variants; used by the {300} and {400} single-character rules below.
108$Ideographic    = [\p{Ideographic}];
109$HiraganaEx     = $Hiragana     ($Extend |  $Format | $ZWJ)*;
110$IdeographicEx  = $Ideographic  ($Extend |  $Format | $ZWJ)*;
111
112## -------------------------------------------------
113
114# Rule 3 - CR x LF
115#          Never break inside a CR LF pair.
116$CR $LF;
117
118# Rule 3c - ZWJ x Extended_Pict.  Precedes WB4, so no intervening Extend chars allowed.
119#          (The rule matches $Extended_Pict only; no separate EmojiNRK set is defined in this file.)
120$ZWJ $Extended_Pict;
121
122# Rule 3d - Keep horizontal whitespace together.
123#          The rule itself matches a pair of adjacent WSegSpace characters.
124$WSegSpace $WSegSpace;
125
126# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
127#          of a region of Text.   The rule here comes into play when the start of text
128#          begins with a group of Format chars, or with a "word" consisting of a single
129#          char that is not in any of the listed word break categories followed by
130#          format char(s), or is not a CJK dictionary character.
131[^$CR $LF $Newline]? ($Extend |  $Format | $ZWJ)+;
132# Single-character matches.  {nnn} is the rule status tag; values follow ICU's UWordBreak ranges (100 number, 200 letter, 300 kana, 400 ideograph).
133$NumericEx {100};
134$ALetterEx {200};
135$HangulSyllable {200};
136$Hebrew_LetterEx{200};
137$KatakanaEx {300};       # note:  these status values override those from rule 5
138$HiraganaEx {300};       #        by virtue of being numerically larger.
139$IdeographicEx {400};    #
140# An Extended_Pict character followed by any Extend/Format/ZWJ; this rule carries no status tag.
141$Extended_Pict ($Extend | $Format | $ZWJ)*;
142
143#
144# rule 5
145#    Do not break between most letters.
146#    "Letter" in the rules below means $ALetterEx or $Hebrew_LetterEx.
147($ALetterEx | $Hebrew_LetterEx)  ($ALetterEx | $Hebrew_LetterEx) {200};
148
149# rule 6 and 7 - letter, then one MidLetter / MidNumLet / Single_Quote, then letter
150($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
151
152# rule 7a - Hebrew letter followed by a single quote
153$Hebrew_LetterEx $Single_QuoteEx {200};
154
155# rule 7b and 7c - a double quote between two Hebrew letters
156$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
157
158# rule 8 - do not break between two Numeric characters
159
160$NumericEx $NumericEx {100};
161
162# rule 9 - letter followed by Numeric
163
164($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
165
166# rule 10 - Numeric followed by letter
167
168$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
169
170# rule 11 and 12 - Numeric, then one MidNum / MidNumLet / Single_Quote, then Numeric
171
172$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
173
174# rule 13 - do not break between two Katakana characters
175$KatakanaEx  $KatakanaEx {300};
176
177# rule 13a/b - do not break before (13a) or after (13b) an ExtendNumLet character
178#   Every operand uses its ...Ex form so trailing Extend/Format/ZWJ are absorbed by the rule itself (Rule 4).
179$ALetterEx       $ExtendNumLetEx {200};    #  (13a)
180$Hebrew_LetterEx $ExtendNumLetEx {200};    #  (13a)
181$NumericEx       $ExtendNumLetEx {100};    #  (13a)
182$KatakanaEx      $ExtendNumLetEx {300};    #  (13a)
183$ExtendNumLetEx  $ExtendNumLetEx {200};    #  (13a)
184
185$ExtendNumLetEx  $ALetterEx      {200};    #  (13b)
186$ExtendNumLetEx  $Hebrew_LetterEx {200};    #  (13b)
187$ExtendNumLetEx  $NumericEx      {100};    #  (13b)
188$ExtendNumLetEx  $KatakanaEx     {300};    #  (13b)
189
190# rules 15 - 17
191#    Pairs of Regional Indicators stay together.
192#    With rule chaining disabled by ^, this rule will match exactly two of them.
193#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
194#
195^$Regional_IndicatorEx $Regional_IndicatorEx;
196# NOTE(review): the comment below used to read "chain for later dictionary segmentation", but the file header says this copy avoids CJK dictionary break - confirm intent.
197# special handling for Hangul: keep two adjacent Hangul syllables together, tagged {200}
198$HangulSyllable $HangulSyllable {200};
199
200# Rule 999
201#     Match a single code point if no other rule applies.
202.;
203