xref: /OpenGrok/opengrok-indexer/src/main/jflex/analysis/ruby/RubyProductions.lexh (revision d219b4cea555a12b602d2d5518daa22134ad4879)
1*d219b4ceSAdam Hornacek/*
2*d219b4ceSAdam Hornacek * CDDL HEADER START
3*d219b4ceSAdam Hornacek *
4*d219b4ceSAdam Hornacek * The contents of this file are subject to the terms of the
5*d219b4ceSAdam Hornacek * Common Development and Distribution License (the "License").
6*d219b4ceSAdam Hornacek * You may not use this file except in compliance with the License.
7*d219b4ceSAdam Hornacek *
8*d219b4ceSAdam Hornacek * See LICENSE.txt included in this distribution for the specific
9*d219b4ceSAdam Hornacek * language governing permissions and limitations under the License.
10*d219b4ceSAdam Hornacek *
11*d219b4ceSAdam Hornacek * When distributing Covered Code, include this CDDL HEADER in each
12*d219b4ceSAdam Hornacek * file and include the License file at LICENSE.txt.
13*d219b4ceSAdam Hornacek * If applicable, add the following below this CDDL HEADER, with the
14*d219b4ceSAdam Hornacek * fields enclosed by brackets "[]" replaced with your own identifying
15*d219b4ceSAdam Hornacek * information: Portions Copyright [yyyy] [name of copyright owner]
16*d219b4ceSAdam Hornacek *
17*d219b4ceSAdam Hornacek * CDDL HEADER END
18*d219b4ceSAdam Hornacek */
19*d219b4ceSAdam Hornacek
20*d219b4ceSAdam Hornacek/*
21*d219b4ceSAdam Hornacek * Copyright (c) 2010, 2016, Oracle and/or its affiliates. All rights reserved.
22*d219b4ceSAdam Hornacek * Portions Copyright (c) 2017, 2019-2020, Chris Fraire <cfraire@me.com>.
23*d219b4ceSAdam Hornacek */
24*d219b4ceSAdam Hornacek
25*d219b4ceSAdam Hornacek/*
26*d219b4ceSAdam Hornacek * Regex productions shared between RubyXref and RubySymbolTokenizer
27*d219b4ceSAdam Hornacek */
28*d219b4ceSAdam Hornacek
/*
 * Zero or more {WhspChar}. {WhspChar} is not defined among these productions;
 * presumably it is supplied by the specification that includes this file.
 */
29*d219b4ceSAdam HornacekMaybeWhsp     = {WhspChar}*
30*d219b4ceSAdam Hornacek
31*d219b4ceSAdam Hornacek/*
32*d219b4ceSAdam Hornacek * globals_rdoc: Pre-defined variables
33*d219b4ceSAdam Hornacek * regexp_rdoc: Special global variables
 *
 * Matches a `$' followed by one of: a single character from the punctuation
 * class, one of the listed two-character `-x' names, or a run of decimal
 * digits. N.b. the third alternative, [~&`'+], is already covered by the
 * first character class.
 * NOTE(review): `\1' and `\0' inside the first class may be read by JFlex as
 * octal escapes rather than the digits 1 and 0 -- confirm; $0 and $1 are
 * matched by [0-9]+ in any case.
34*d219b4ceSAdam Hornacek */
35*d219b4ceSAdam HornacekSPIdentifier = \$ ( [\!\@\&\`\'\+\1\~\=\/\\\,\;\<\>\_\0\*\$\?\:\"] |
36*d219b4ceSAdam Hornacek    "-0" | "-a" | "-d" | "-F" | "-i" | "-I" | "-l" | "-p" | "-v" | "-w" |
37*d219b4ceSAdam Hornacek    [~&\`\'\+] | [0-9]+ )
38*d219b4ceSAdam Hornacek
/*
 * Any of the variable or method name forms defined below. Used as the
 * components of {Modules_nested} and as the name part of {Symbol}.
 */
39*d219b4ceSAdam HornacekAnyIdentifier = ({Local_var} | {Instance_var} | {Class_var} | {Global_var} |
40*d219b4ceSAdam Hornacek    {Method_name})
41*d219b4ceSAdam Hornacek
42*d219b4ceSAdam Hornacek/*
43*d219b4ceSAdam Hornacek * A local variable name must start with a lowercase US-ASCII letter or a
44*d219b4ceSAdam Hornacek * character with the eighth bit set. Typically local variables are US-ASCII
45*d219b4ceSAdam Hornacek * compatible since the keys to type them exist on all keyboards. (Ruby
46*d219b4ceSAdam Hornacek * programs must be written in a US-ASCII-compatible character set. In such
47*d219b4ceSAdam Hornacek * character sets if the eighth bit is set it indicates an extended character.
48*d219b4ceSAdam Hornacek * Ruby allows local variables to contain such characters.)
49*d219b4ceSAdam Hornacek * A local variable name may contain letters, numbers, an _ (underscore or low
50*d219b4ceSAdam Hornacek * line) or a character with the eighth bit set.
 *
 * NOTE(review): {Char8} as written covers \xA0-\xFF only, which is narrower
 * than "eighth bit set" (\x80-\xFF) -- confirm that the lower bound is
 * intentional.
51*d219b4ceSAdam Hornacek */
52*d219b4ceSAdam HornacekLocal_var = {Local_char1} {Local_nextchar}*
53*d219b4ceSAdam HornacekLocal_char1 = ([a-z] | {Char8})
54*d219b4ceSAdam HornacekLocal_nextchar = ([a-zA-Z0-9_] | {Char8})
55*d219b4ceSAdam HornacekChar8 = [\xA0-\xFF]
56*d219b4ceSAdam Hornacek/*
57*d219b4ceSAdam Hornacek * An instance variable must start with a @ (“at” sign or commercial at).
58*d219b4ceSAdam Hornacek * Otherwise instance variable names follow the same rules as local variable
59*d219b4ceSAdam Hornacek * names. Since the instance variable starts with an @ the second character
60*d219b4ceSAdam Hornacek * may be an upper-case letter.
 * N.b. as written, {Local_nextchar}+ also admits a digit or underscore
 * directly after the sigil; the same holds for {Class_var} and {Global_var}.
61*d219b4ceSAdam Hornacek */
62*d219b4ceSAdam HornacekInstance_var = [@]{Local_nextchar}+
63*d219b4ceSAdam Hornacek/*
64*d219b4ceSAdam Hornacek * A class variable must start with a @@ (two “at” signs). The rest of the name
65*d219b4ceSAdam Hornacek * follows the same rules as instance variables.
66*d219b4ceSAdam Hornacek */
67*d219b4ceSAdam HornacekClass_var = [@][@]{Local_nextchar}+
68*d219b4ceSAdam Hornacek/*
69*d219b4ceSAdam Hornacek * Global variables start with a $ (dollar sign). The rest of the name follows
70*d219b4ceSAdam Hornacek * the same rules as instance variables.
71*d219b4ceSAdam Hornacek */
72*d219b4ceSAdam HornacekGlobal_var = [$]{Local_nextchar}+
73*d219b4ceSAdam Hornacek
74*d219b4ceSAdam Hornacek/*
75*d219b4ceSAdam Hornacek * methods_rdoc: Method Names
76*d219b4ceSAdam Hornacek *
77*d219b4ceSAdam Hornacek * Method names may be one of the operators or must start with a letter or a
78*d219b4ceSAdam Hornacek * character with the eighth bit set.
79*d219b4ceSAdam Hornacek *
80*d219b4ceSAdam Hornacek * Method names may end with a ! (bang or exclamation mark), a ? (question
81*d219b4ceSAdam Hornacek * mark) or = equals sign.
82*d219b4ceSAdam Hornacek *
83*d219b4ceSAdam Hornacek * N.b. an '=' suffix is not included in {Method_name}, because that character
84*d219b4ceSAdam Hornacek * in a name is aligned with the operation (assignment) and not with the target
85*d219b4ceSAdam Hornacek * (variable). E.g., `def birthdate=` is for an assignment of `birthdate'.
 *
 * N.b. operator method names are not matched by {Method_name}; the production
 * only covers names that start with a letter or a {Char8} character.
86*d219b4ceSAdam Hornacek */
87*d219b4ceSAdam HornacekMethod_name_base = ([a-zA-Z] | {Char8}) {Local_nextchar}*
88*d219b4ceSAdam HornacekMethod_name = {Method_name_base} [\!\?]?
89*d219b4ceSAdam Hornacek
90*d219b4ceSAdam Hornacek/*
91*d219b4ceSAdam Hornacek * modules_and_classes_rdoc: Nesting
92*d219b4ceSAdam Hornacek *
93*d219b4ceSAdam Hornacek * You may also define inner modules using ::
 *
 * The production requires at least one `::' separator, i.e. two or more
 * {AnyIdentifier} components; a lone identifier is left to the other rules.
94*d219b4ceSAdam Hornacek */
95*d219b4ceSAdam HornacekModules_nested = {AnyIdentifier}("::"{AnyIdentifier})+
96*d219b4ceSAdam Hornacek
97*d219b4ceSAdam Hornacek/*
98*d219b4ceSAdam Hornacek * literals_rdoc: Numbers
 *
 * Union of the plain decimal form and the four prefixed forms defined below.
99*d219b4ceSAdam Hornacek */
100*d219b4ceSAdam HornacekNumeric_literal = ({Decimal_literal} | {Decimal_prefixed} | {Hex_prefixed} |
101*d219b4ceSAdam Hornacek    {Octal_prefixed} | {Binary_prefixed})
102*d219b4ceSAdam Hornacek/*
103*d219b4ceSAdam Hornacek * You can write integers of any size as follows: 1234 1_234
104*d219b4ceSAdam Hornacek * Floating point numbers may be written as follows: 12.34 1234e-2 1.234E1
 *
 * {Numeral} allows at most one underscore between two consecutive digits and
 * none at either end. The fraction and the exponent are each optional, and a
 * fraction needs digits on both sides of the dot.
105*d219b4ceSAdam Hornacek */
106*d219b4ceSAdam HornacekDecimal_literal = {Numeral} ([\.]{Numeral})? {Exponent}?
107*d219b4ceSAdam HornacekNumeral = {Digit} ([_]? {Digit})*
108*d219b4ceSAdam HornacekExponent = [Ee] [\+\-]? {Numeral}
109*d219b4ceSAdam HornacekDigit = [0-9]
110*d219b4ceSAdam Hornacek/*
111*d219b4ceSAdam Hornacek * You can use a special prefix to write numbers in decimal, hexadecimal, octal
112*d219b4ceSAdam Hornacek * or binary formats. For decimal numbers use a prefix of 0d, for hexadecimal
113*d219b4ceSAdam Hornacek * numbers use a prefix of 0x, for octal numbers use a prefix of 0 or 0o, for
114*d219b4ceSAdam Hornacek * binary numbers use a prefix of 0b. The alphabetic component of the number is
115*d219b4ceSAdam Hornacek * not case-sensitive.
116*d219b4ceSAdam Hornacek *
117*d219b4ceSAdam Hornacek * Like integers and floats you may use an underscore for readability.
118*d219b4ceSAdam Hornacek *
119*d219b4ceSAdam Hornacek * Examples: 0d170 0D170 0xaa 0xAa 0xAA 0Xaa 0XAa 0XaA 0252 0o252 0O252
120*d219b4ceSAdam Hornacek *     0b10101010 0B10101010
 *
 * N.b. a hexadecimal digit is 0-9, A-F or a-f only, so that e.g. `0xfg' is
 * matched as `0xf' followed by a separate `g'.
121*d219b4ceSAdam Hornacek */
122*d219b4ceSAdam HornacekDecimal_prefixed = [0][Dd] {Numeral}
123*d219b4ceSAdam HornacekHex_prefixed = [0][Xx] {Hex_numeral}
124*d219b4ceSAdam HornacekHex_numeral = {Hex_digit} ([_]? {Hex_digit})*
125*d219b4ceSAdam HornacekHex_digit = [0-9A-Fa-f]
126*d219b4ceSAdam Hornacek/*
127*d219b4ceSAdam Hornacek * The "0"-prefixed octal number as a regex will be captured by
128*d219b4ceSAdam Hornacek * {Decimal_literal} so it is not defined in {Octal_prefixed}.
129*d219b4ceSAdam Hornacek */
130*d219b4ceSAdam HornacekOctal_prefixed = [0][Oo]{Octal_numeral}
131*d219b4ceSAdam HornacekOctal_numeral = {Octal_digit} ([_]? {Octal_digit})*
132*d219b4ceSAdam HornacekOctal_digit = [0-7]
/*
 * Binary numbers: a 0b or 0B prefix followed by binary digits, with at most
 * one underscore between two consecutive digits.
 */
133*d219b4ceSAdam HornacekBinary_prefixed = [0][Bb]{Binary_numeral}
134*d219b4ceSAdam HornacekBinary_numeral = {Binary_digit} ([_]? {Binary_digit})*
135*d219b4ceSAdam HornacekBinary_digit = [01]
136*d219b4ceSAdam Hornacek
137*d219b4ceSAdam Hornacek/*
138*d219b4ceSAdam Hornacek * There is also a character literal notation to represent single character
139*d219b4ceSAdam Hornacek * strings, whose syntax is a question mark (?) followed by a single character
140*d219b4ceSAdam Hornacek * or escape sequence that corresponds to a single codepoint in the script
141*d219b4ceSAdam Hornacek * encoding: ?a #=> "a"  ?abc #=> SyntaxError  ?\n #=> "\n"  ?\s #=> " "
142*d219b4ceSAdam Hornacek *     ?\\ #=> "\\"  ?\u{41} #=> "A"  ?\C-a #=> "\x01"  ?\M-a #=> "\xE1"
143*d219b4ceSAdam Hornacek *     ?\M-\C-a #=> "\x81"  ?\C-\M-a #=> "\x81", same as above  ?あ #=> "あ"
144*d219b4ceSAdam Hornacek * N.b. the Ruby rule about ?abc is not enforced in this regex.
 *
 * N.b. the code point inside \u{...} is hexadecimal (e.g. ?\u{e9}), and the
 * combined meta/control forms carry a backslash before the second prefix
 * (\M-\C-x and \C-\M-x), so both are spelled out that way below. The
 * alternatives are unordered: the longest match wins.
145*d219b4ceSAdam Hornacek */
146*d219b4ceSAdam HornacekCharacter_literal = [?] ({Character_literal_esc} | [^\s])
147*d219b4ceSAdam HornacekCharacter_literal_esc = [\\] ([MC][\-][^\s] | "u{" [0-9A-Fa-f]+ "}" |
148*d219b4ceSAdam Hornacek    "M-\\C-" [^\s] | "C-\\M-" [^\s] | [^\s])
149*d219b4ceSAdam Hornacek
150*d219b4ceSAdam Hornacek/*
151*d219b4ceSAdam Hornacek * literals_rdoc: Strings
152*d219b4ceSAdam Hornacek *
153*d219b4ceSAdam Hornacek * The most common way of writing strings is using ". The string may be many
154*d219b4ceSAdam Hornacek * lines long. Any internal " must be escaped. Strings may allow interpolation
155*d219b4ceSAdam Hornacek * of other values using #{...}, or they may be cross-referenced as URLs or
156*d219b4ceSAdam Hornacek * files, so they are handled as separate yy states.
157*d219b4ceSAdam Hornacek */
158*d219b4ceSAdam Hornacek
/*
 * One non-word character, excluding the sigils $ and @, the quote characters
 * " ' and `, the comment character # and the line terminators. The rules
 * below use it as a one-character spacer in front of quote-like operators and
 * of the {Mwords} keywords.
 */
159*d219b4ceSAdam HornacekWxSigils = [[\W]--[\$\@\"\'\`\#\r\n]]
160*d219b4ceSAdam Hornacek
/*
 * File-like names that are offered as links inside comments, quotes and
 * here-docs: a letter, then any {FNameChar}s, then a dot and one of the
 * extensions rb, ruby, diff or patch (matched case-insensitively).
 * {FNameChar} is not defined among these productions.
 */
161*d219b4ceSAdam HornacekFileExt = ([Rr][Bb] | [Rr][Uu][Bb][Yy] | [Dd][Ii][Ff][Ff] |
162*d219b4ceSAdam Hornacek    [Pp][Aa][Tt][Cc][Hh])
163*d219b4ceSAdam HornacekFile = [a-zA-Z]{FNameChar}* "." {FileExt}
164*d219b4ceSAdam Hornacek
/*
 * Delimiters of embedded documentation (the POD state). The rules below only
 * match them when anchored at the beginning of a line.
 */
165*d219b4ceSAdam HornacekPOD_begin = "=begin"
166*d219b4ceSAdam HornacekPOD_end = "=end"
167*d219b4ceSAdam Hornacek
/*
 * Quote-like operators.
 * Quo0: one delimiter candidate -- a backtick, a bracket character, or any
 *     character with the Unicode property P or S.
 * QuoP: `%' directly followed by a delimiter.
 * QuoPC: `%', one of the upper-case letters I Q R S W X, then a delimiter.
 * QuoPC_xN: the same with the lower-case letters i q r s w x. The rules below
 *     pass `true' as the last qop() argument for the _xN forms (presumably
 *     "no interpolation" -- confirm against qop()).
 */
168*d219b4ceSAdam HornacekQuo0 =           [[\`\(\)\<\>\[\]\{\}\p{P}\p{S}]]
169*d219b4ceSAdam HornacekQuoP =     [%]{Quo0}
170*d219b4ceSAdam HornacekQuoPC =    [%][IQRSWX]{Quo0}
171*d219b4ceSAdam HornacekQuoPC_xN = [%][iqrswx]{Quo0}
172*d219b4ceSAdam Hornacek
/*
 * Symbol: a colon directly followed by any identifier form.
 * Symquo / Symquo_xN: a colon followed by a double or single quote, i.e. the
 * start of a quoted symbol; the rules below hand both to qop().
 */
173*d219b4ceSAdam HornacekSymbol =    [:]{AnyIdentifier}
174*d219b4ceSAdam HornacekSymquo =    [:][\"]
175*d219b4ceSAdam HornacekSymquo_xN = [:][\']
176*d219b4ceSAdam Hornacek
177*d219b4ceSAdam Hornacek//
178*d219b4ceSAdam Hornacek// Track some keywords that can be used to identify heuristically a possible
179*d219b4ceSAdam Hornacek// beginning of the shortcut syntax, //, for m//. Also include any perlfunc
180*d219b4ceSAdam Hornacek// that takes /PATTERN/. Heuristics using punctuation are defined inline later
181*d219b4ceSAdam Hornacek// in some rules.
// NOTE(review): the "m//" and "perlfunc" wording appears to be inherited from
// a Perl specification; the lists below contain Ruby keywords only.
182*d219b4ceSAdam Hornacek//
183*d219b4ceSAdam HornacekMwords_1 = ("and" | "or" | "not")
184*d219b4ceSAdam HornacekMwords_2 = ("begin" | "end" | "unless" | "until" | "when" | "while")
185*d219b4ceSAdam HornacekMwords = ({Mwords_1} | {Mwords_2})
186*d219b4ceSAdam Hornacek
/*
 * Punctuation after which a `/' is handed to hqopPunc() by the rules below:
 * {Mpunc1YYIN} is used in both YYINITIAL and INTRA, {Mpunc2IN} in INTRA only.
 */
187*d219b4ceSAdam HornacekMpunc1YYIN = [\(\!\[]
188*d219b4ceSAdam HornacekMpunc2IN = ([!=]"~" | [\:\?\=\+\-\<\>] | "=="[=]?|"!="|"<="|">="|"<=>"|"=>")
189*d219b4ceSAdam Hornacek
/*
 * Here-doc markers as they may follow `<<', `<<~' or `<<-': a bare marker, a
 * single-quoted marker or a backtick-quoted marker. {Here_marker} is also
 * what the HERE* states look for on a line of its own to end the here-doc.
 * NOTE(review): there is no production for a double-quoted marker (<<"EOF")
 * -- confirm whether that form is meant to be handled elsewhere.
 */
190*d219b4ceSAdam HornacekHere_marker = {Local_nextchar}+
191*d219b4ceSAdam HornacekHere_EOF1 = {Here_marker}
192*d219b4ceSAdam HornacekHere_EOF2 = [\'][^\r\n\']*[\']
193*d219b4ceSAdam HornacekHere_EOF3 = [\`][^\r\n\`]*[\`]
194*d219b4ceSAdam Hornacek
195*d219b4ceSAdam Hornacek/*
196*d219b4ceSAdam Hornacek * YYINITIAL : nothing yet parsed or just after a non-continuation EOL
197*d219b4ceSAdam Hornacek * INTRA : saw content from YYINITIAL but not yet other state or [{}] or non-
198*d219b4ceSAdam Hornacek *     continuation {EOL}
199*d219b4ceSAdam Hornacek * SCOMMENT : single-line comment
200*d219b4ceSAdam Hornacek * POD : embedded documentation
201*d219b4ceSAdam Hornacek * QUO : quote-like that is OK to match paths|files|URLs|e-mails
202*d219b4ceSAdam Hornacek * QUOxN : "" but with no interpolation
203*d219b4ceSAdam Hornacek * QUOxL : quote-like that is not OK to match paths|files|URLs|e-mails
204*d219b4ceSAdam Hornacek *     because a non-traditional character is used as the quote-like delimiter
205*d219b4ceSAdam Hornacek * QUOxLxN : "" but with no interpolation
206*d219b4ceSAdam Hornacek * QM : a quote-like has ended, and quote modifier chars are awaited
207*d219b4ceSAdam Hornacek * HERE : Here-docs
208*d219b4ceSAdam Hornacek * HERExN : Here-docs with no interpolation
209*d219b4ceSAdam Hornacek * HEREin : Indented Here-docs
210*d219b4ceSAdam Hornacek * HEREinxN : Indented Here-docs with no interpolation
211*d219b4ceSAdam Hornacek */
212*d219b4ceSAdam Hornacek%state INTRA SCOMMENT POD
213*d219b4ceSAdam Hornacek%state QUO QUOxN QUOxL QUOxLxN QM
214*d219b4ceSAdam Hornacek%state HERE HERExN HEREin HEREinxN
215*d219b4ceSAdam Hornacek
216*d219b4ceSAdam Hornacek%%
217*d219b4ceSAdam Hornacek<HERE, HERExN> {
    // A marker at the very start of a line, followed only by optional
    // whitespace up to the EOL (lookahead, not consumed), may terminate the
    // here-doc; maybeEndHere() makes the decision.
218*d219b4ceSAdam Hornacek    ^ {Here_marker} / {MaybeWhsp}{EOL}    {
219*d219b4ceSAdam Hornacek        chkLOC();
220*d219b4ceSAdam Hornacek        maybeEndHere(yytext());
221*d219b4ceSAdam Hornacek    }
222*d219b4ceSAdam Hornacek}
223*d219b4ceSAdam Hornacek
224*d219b4ceSAdam Hornacek<HEREin, HEREinxN> {
    // As for the non-indented states, except that the marker may be preceded
    // by whitespace. The leading whitespace is part of the text passed to
    // maybeEndHere().
225*d219b4ceSAdam Hornacek    ^ {MaybeWhsp} {Here_marker} / {MaybeWhsp}{EOL}    {
226*d219b4ceSAdam Hornacek        chkLOC();
227*d219b4ceSAdam Hornacek        maybeEndHere(yytext());
228*d219b4ceSAdam Hornacek    }
229*d219b4ceSAdam Hornacek}
230*d219b4ceSAdam Hornacek
231*d219b4ceSAdam Hornacek<INTRA> {
232*d219b4ceSAdam Hornacek    // Syntax that switches back to YYINITIAL but preserves otherwise the stack
233*d219b4ceSAdam Hornacek    [\{] |
234*d219b4ceSAdam Hornacek    "&&" |
235*d219b4ceSAdam Hornacek    "||"    {
236*d219b4ceSAdam Hornacek        chkLOC();
        // Nothing is consumed here: the whole match is pushed back so that it
        // is scanned again by the YYINITIAL rules.
237*d219b4ceSAdam Hornacek        yypushback(yytext().length());
238*d219b4ceSAdam Hornacek        yybegin(YYINITIAL);
239*d219b4ceSAdam Hornacek    }
240*d219b4ceSAdam Hornacek}
241*d219b4ceSAdam Hornacek
242*d219b4ceSAdam Hornacek<YYINITIAL, INTRA> {
    // Backslash line continuation: the backslash is offered, then the EOL is
    // reported; the state is not reset to YYINITIAL.
243*d219b4ceSAdam Hornacek    [\\] {EOL}    {
244*d219b4ceSAdam Hornacek        maybeIntraState();
245*d219b4ceSAdam Hornacek        offer("\\");
246*d219b4ceSAdam Hornacek        onEndOfLineMatched(yytext(), yychar);
247*d219b4ceSAdam Hornacek    }
248*d219b4ceSAdam Hornacek
    // Here-doc introducer, optionally `<<~' or `<<-', followed by a bare or
    // quoted marker; hop() receives the whole match.
249*d219b4ceSAdam Hornacek    "<<"[~\-]? ({Here_EOF1} | {Here_EOF2} | {Here_EOF3})    {
250*d219b4ceSAdam Hornacek        chkLOC();
251*d219b4ceSAdam Hornacek        maybeIntraState();
252*d219b4ceSAdam Hornacek        hop(yytext());
253*d219b4ceSAdam Hornacek    }
254*d219b4ceSAdam Hornacek
    // Sigil-prefixed names and symbols are offered with `true' as the third
    // offerSymbol() argument; plain names in the next rule pass `false'.
255*d219b4ceSAdam Hornacek    {Instance_var} | {Class_var} | {Global_var} | {Symbol}    {
256*d219b4ceSAdam Hornacek        chkLOC();
257*d219b4ceSAdam Hornacek        maybeIntraState();
258*d219b4ceSAdam Hornacek        String id = yytext();
259*d219b4ceSAdam Hornacek        if (offerSymbol(id, 0, true) && returnOnSymbol()) {
260*d219b4ceSAdam Hornacek            return yystate();
261*d219b4ceSAdam Hornacek        }
262*d219b4ceSAdam Hornacek    }
263*d219b4ceSAdam Hornacek
264*d219b4ceSAdam Hornacek    {Local_var} | {Method_name}    {
265*d219b4ceSAdam Hornacek        chkLOC();
266*d219b4ceSAdam Hornacek        maybeIntraState();
267*d219b4ceSAdam Hornacek        String id = yytext();
268*d219b4ceSAdam Hornacek        if (offerSymbol(id, 0, false) && returnOnSymbol()) {
269*d219b4ceSAdam Hornacek            return yystate();
270*d219b4ceSAdam Hornacek        }
271*d219b4ceSAdam Hornacek    }
272*d219b4ceSAdam Hornacek
    // Pre-defined `$' variables are offered as keywords, not as symbols.
273*d219b4ceSAdam Hornacek    {SPIdentifier}    {
274*d219b4ceSAdam Hornacek        chkLOC();
275*d219b4ceSAdam Hornacek        maybeIntraState();
276*d219b4ceSAdam Hornacek        offerKeyword(yytext());
277*d219b4ceSAdam Hornacek    }
278*d219b4ceSAdam Hornacek
279*d219b4ceSAdam Hornacek    {Modules_nested}    {
280*d219b4ceSAdam Hornacek        chkLOC();
281*d219b4ceSAdam Hornacek        maybeIntraState();
282*d219b4ceSAdam Hornacek        takeModules(yytext());
283*d219b4ceSAdam Hornacek    }
284*d219b4ceSAdam Hornacek
    // Character and numeric literals are wrapped in their own span, which is
    // closed again right after the text is offered.
285*d219b4ceSAdam Hornacek    {Character_literal}    {
286*d219b4ceSAdam Hornacek        chkLOC();
287*d219b4ceSAdam Hornacek        maybeIntraState();
288*d219b4ceSAdam Hornacek        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
289*d219b4ceSAdam Hornacek        offer(yytext());
290*d219b4ceSAdam Hornacek        onDisjointSpanChanged(null, yychar);
291*d219b4ceSAdam Hornacek    }
292*d219b4ceSAdam Hornacek
293*d219b4ceSAdam Hornacek    {Numeric_literal}    {
294*d219b4ceSAdam Hornacek        chkLOC();
295*d219b4ceSAdam Hornacek        maybeIntraState();
296*d219b4ceSAdam Hornacek        onDisjointSpanChanged(HtmlConsts.NUMBER_CLASS, yychar);
297*d219b4ceSAdam Hornacek        offer(yytext());
298*d219b4ceSAdam Hornacek        onDisjointSpanChanged(null, yychar);
299*d219b4ceSAdam Hornacek    }
300*d219b4ceSAdam Hornacek
301*d219b4ceSAdam Hornacek    \"    { chkLOC(); qop(yytext(), 0, false); }
302*d219b4ceSAdam Hornacek    \'    { chkLOC(); qop(yytext(), 0, true); }
    // Start of a single-line comment. chkLOC() is not called here; the
    // SCOMMENT state is left again at the next EOL.
303*d219b4ceSAdam Hornacek    \#    {
304*d219b4ceSAdam Hornacek        yypush(SCOMMENT);
305*d219b4ceSAdam Hornacek        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
306*d219b4ceSAdam Hornacek        offer(yytext());
307*d219b4ceSAdam Hornacek    }
308*d219b4ceSAdam Hornacek
309*d219b4ceSAdam Hornacek    // Quote with two character names plus possibly a {WxSigils} spacer
310*d219b4ceSAdam Hornacek    ^ {QuoPC}                   { chkLOC(); qop(yytext(), 2, false); }
311*d219b4ceSAdam Hornacek    {WxSigils}{QuoPC}           { chkLOC(); qop(yytext(), 3, false); }
312*d219b4ceSAdam Hornacek    ^ {QuoPC_xN}                { chkLOC(); qop(yytext(), 2, true); }
313*d219b4ceSAdam Hornacek    {WxSigils}{QuoPC_xN}        { chkLOC(); qop(yytext(), 3, true); }
314*d219b4ceSAdam Hornacek
315*d219b4ceSAdam Hornacek    // Quote with one character names plus possibly a {WxSigils} spacer
316*d219b4ceSAdam Hornacek    ^ {QuoP}                    { chkLOC(); qop(yytext(), 1, false); }
317*d219b4ceSAdam Hornacek    {WxSigils}{QuoP}            { chkLOC(); qop(yytext(), 2, false); }
318*d219b4ceSAdam Hornacek    ^ {Symquo}                  { chkLOC(); qop(yytext(), 1, false); }
319*d219b4ceSAdam Hornacek    {WxSigils}{Symquo}          { chkLOC(); qop(yytext(), 2, false); }
320*d219b4ceSAdam Hornacek    ^ {Symquo_xN}               { chkLOC(); qop(yytext(), 1, true); }
321*d219b4ceSAdam Hornacek    {WxSigils}{Symquo_xN}       { chkLOC(); qop(yytext(), 2, true); }
322*d219b4ceSAdam Hornacek
323*d219b4ceSAdam Hornacek    // POD-end without having seen POD-begin is akin to a one-line comment
324*d219b4ceSAdam Hornacek    ^ {POD_end} [^\n\r]*    {
325*d219b4ceSAdam Hornacek        offer(yytext());
326*d219b4ceSAdam Hornacek    }
327*d219b4ceSAdam Hornacek
328*d219b4ceSAdam Hornacek    // POD start
329*d219b4ceSAdam Hornacek    ^ {POD_begin}   {
330*d219b4ceSAdam Hornacek        yypush(POD);
331*d219b4ceSAdam Hornacek        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
332*d219b4ceSAdam Hornacek        offer(yytext());
333*d219b4ceSAdam Hornacek    }
334*d219b4ceSAdam Hornacek
    // A brace is first given to maybeEndInterpolation(); it is offered as
    // ordinary text only when that call returns false.
335*d219b4ceSAdam Hornacek    [\{\}]    {
336*d219b4ceSAdam Hornacek        chkLOC();
337*d219b4ceSAdam Hornacek        String capture = yytext();
338*d219b4ceSAdam Hornacek        if (!maybeEndInterpolation(capture)) {
339*d219b4ceSAdam Hornacek            offer(capture);
340*d219b4ceSAdam Hornacek        }
341*d219b4ceSAdam Hornacek    }
342*d219b4ceSAdam Hornacek}
343*d219b4ceSAdam Hornacek
344*d219b4ceSAdam Hornacek<YYINITIAL> {
    // A `/' seen in YYINITIAL (as opposed to INTRA) is taken as the start of
    // a pattern rather than as division.
345*d219b4ceSAdam Hornacek    "/"    {
346*d219b4ceSAdam Hornacek        chkLOC();
347*d219b4ceSAdam Hornacek        // OK to pass a fake "m/" with doWrite=false
348*d219b4ceSAdam Hornacek        qop(false, "m/", 1, false);
        // qop() was told not to write, so the span is opened and the actual
        // `/' is offered here.
349*d219b4ceSAdam Hornacek        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
350*d219b4ceSAdam Hornacek        offer(yytext());
351*d219b4ceSAdam Hornacek    }
352*d219b4ceSAdam Hornacek}
353*d219b4ceSAdam Hornacek
354*d219b4ceSAdam Hornacek<YYINITIAL, INTRA> {
355*d219b4ceSAdam Hornacek    // Use some heuristics to identify double-slash syntax for the m//
356*d219b4ceSAdam Hornacek    // operator. We can't handle all possible appearances of `//', because the
357*d219b4ceSAdam Hornacek    // first slash cannot always be distinguished from division (/) without
358*d219b4ceSAdam Hornacek    // true parsing.
359*d219b4ceSAdam Hornacek
    // An opening `(', `!' or `[', optional whitespace (which may include line
    // breaks, since \s is used) and then `/'; hqopPunc() gets the whole match.
360*d219b4ceSAdam Hornacek    {Mpunc1YYIN} \s* "/"    { chkLOC(); hqopPunc(yytext()); }
361*d219b4ceSAdam Hornacek}
362*d219b4ceSAdam Hornacek
363*d219b4ceSAdam Hornacek<INTRA> {
364*d219b4ceSAdam Hornacek    // Continue with more punctuation heuristics
    // These operators are only considered in INTRA, i.e. after other content
    // has already been seen on the statement.
365*d219b4ceSAdam Hornacek
366*d219b4ceSAdam Hornacek    {Mpunc2IN} \s* "/"      { chkLOC(); hqopPunc(yytext()); }
367*d219b4ceSAdam Hornacek}
368*d219b4ceSAdam Hornacek
369*d219b4ceSAdam Hornacek<YYINITIAL, INTRA> {
370*d219b4ceSAdam Hornacek    // Define keyword heuristics
371*d219b4ceSAdam Hornacek
    // Keyword at the start of a line: the whole match goes to hqopSymbol().
372*d219b4ceSAdam Hornacek    ^ {Mwords} \s* "/"    {
373*d219b4ceSAdam Hornacek        chkLOC();
374*d219b4ceSAdam Hornacek        hqopSymbol(yytext());
375*d219b4ceSAdam Hornacek    }
376*d219b4ceSAdam Hornacek
    // Keyword elsewhere: it must be preceded by one {WxSigils} character so
    // that it is not the tail of a longer word.
377*d219b4ceSAdam Hornacek    {WxSigils}{Mwords} \s* "/"    {
378*d219b4ceSAdam Hornacek        chkLOC();
379*d219b4ceSAdam Hornacek        String capture = yytext();
        // The leading boundary character is offered separately, and only when
        // takeAllContent() is true; the remainder always goes to hqopSymbol().
380*d219b4ceSAdam Hornacek        if (takeAllContent()) {
381*d219b4ceSAdam Hornacek            String boundary = capture.substring(0, 1);
382*d219b4ceSAdam Hornacek            offer(boundary);
383*d219b4ceSAdam Hornacek        }
384*d219b4ceSAdam Hornacek        hqopSymbol(capture.substring(1));
385*d219b4ceSAdam Hornacek    }
386*d219b4ceSAdam Hornacek}
387*d219b4ceSAdam Hornacek
388*d219b4ceSAdam Hornacek<QUO, QUOxN, QUOxL, QUOxLxN, HERE, HEREin> {
    // A backslash plus one non-whitespace character is offered as a unit, so
    // an escaped delimiter never reaches the quote-ending rule. N.b. HERExN
    // and HEREinxN are not in the state list.
389*d219b4ceSAdam Hornacek    \\ \S    {
390*d219b4ceSAdam Hornacek        chkLOC();
391*d219b4ceSAdam Hornacek        offer(yytext());
392*d219b4ceSAdam Hornacek    }
393*d219b4ceSAdam Hornacek}
394*d219b4ceSAdam Hornacek
395*d219b4ceSAdam Hornacek<QUO, QUOxL, HERE, HEREin> {
    // Start of a #{...} interpolation. It applies to every interpolating
    // state: the two interpolating quote states and the two interpolating
    // here-doc states (the *xN states are the non-interpolating ones).
396*d219b4ceSAdam Hornacek    "#{"    {
397*d219b4ceSAdam Hornacek        chkLOC();
398*d219b4ceSAdam Hornacek        offer(yytext());
        // Close the string span, then scan the embedded code from YYINITIAL
        // on top of the current state.
399*d219b4ceSAdam Hornacek        onDisjointSpanChanged(null, yychar);
400*d219b4ceSAdam Hornacek        yypush(YYINITIAL);
401*d219b4ceSAdam Hornacek        pushData();
402*d219b4ceSAdam Hornacek        interpop();
403*d219b4ceSAdam Hornacek    }
404*d219b4ceSAdam Hornacek}
405*d219b4ceSAdam Hornacek
406*d219b4ceSAdam Hornacek<QUO, QUOxN, QUOxL, QUOxLxN> {
    // Any delimiter candidate is offered first; maybeEndQuote() then decides
    // whether it actually closes the current quote.
407*d219b4ceSAdam Hornacek    {Quo0}    {
408*d219b4ceSAdam Hornacek        chkLOC();
409*d219b4ceSAdam Hornacek        String capture = yytext();
410*d219b4ceSAdam Hornacek        offer(capture);
411*d219b4ceSAdam Hornacek        if (maybeEndQuote(capture)) {
412*d219b4ceSAdam Hornacek            yypop();
            // QM is pushed on top of the restored state when trailing
            // modifier characters are acceptable.
413*d219b4ceSAdam Hornacek            if (areModifiersOK()) {
414*d219b4ceSAdam Hornacek                yypush(QM);
415*d219b4ceSAdam Hornacek            }
416*d219b4ceSAdam Hornacek            onDisjointSpanChanged(null, yychar);
417*d219b4ceSAdam Hornacek        }
418*d219b4ceSAdam Hornacek    }
419*d219b4ceSAdam Hornacek}
420*d219b4ceSAdam Hornacek
421*d219b4ceSAdam Hornacek<QUO, QUOxN, QUOxL, QUOxLxN, HERE, HERExN, HEREin, HEREinxN> {
    // Line break inside a quote or here-doc: the span is closed before the
    // EOL is reported and a string span is opened again afterwards.
422*d219b4ceSAdam Hornacek    {WhspChar}*{EOL}    {
423*d219b4ceSAdam Hornacek        onDisjointSpanChanged(null, yychar);
424*d219b4ceSAdam Hornacek        onEndOfLineMatched(yytext(), yychar);
425*d219b4ceSAdam Hornacek        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
426*d219b4ceSAdam Hornacek    }
427*d219b4ceSAdam Hornacek}
428*d219b4ceSAdam Hornacek
429*d219b4ceSAdam Hornacek<QM> {
430*d219b4ceSAdam Hornacek    // /PAT/imxouesn
    // Any lower-case ASCII letter is accepted here, not just those listed.
431*d219b4ceSAdam Hornacek    [a-z]    {
432*d219b4ceSAdam Hornacek        chkLOC();
433*d219b4ceSAdam Hornacek        offer(yytext());
434*d219b4ceSAdam Hornacek    }
435*d219b4ceSAdam Hornacek
436*d219b4ceSAdam Hornacek    // anything else ends the quote-modifiers state
437*d219b4ceSAdam Hornacek    [^]    {
        // Leave QM and push the character back so that it is scanned again
        // in the state that is restored by yypop().
438*d219b4ceSAdam Hornacek        yypop();
439*d219b4ceSAdam Hornacek        yypushback(yytext().length());
440*d219b4ceSAdam Hornacek    }
441*d219b4ceSAdam Hornacek}
442*d219b4ceSAdam Hornacek
443*d219b4ceSAdam Hornacek<POD> {
    // `=end' at the start of a line, together with the rest of that line,
    // closes the embedded documentation: the state is popped, the text is
    // offered and only then is the comment span closed.
444*d219b4ceSAdam Hornacek    ^ {POD_end} [^\n\r]*    {
445*d219b4ceSAdam Hornacek        yypop();
446*d219b4ceSAdam Hornacek        offer(yytext());
447*d219b4ceSAdam Hornacek        onDisjointSpanChanged(null, yychar);
448*d219b4ceSAdam Hornacek    }
449*d219b4ceSAdam Hornacek
    // Line breaks are reported without closing the comment span.
450*d219b4ceSAdam Hornacek    {WhspChar}*{EOL}    {
451*d219b4ceSAdam Hornacek        onEndOfLineMatched(yytext(), yychar);
452*d219b4ceSAdam Hornacek    }
453*d219b4ceSAdam Hornacek}
454*d219b4ceSAdam Hornacek
455*d219b4ceSAdam Hornacek<SCOMMENT> {
    // The end of the line ends the comment. The trailing whitespace and EOL
    // are pushed back in full so that the restored state's own EOL rule
    // handles them after the pop.
456*d219b4ceSAdam Hornacek    {WhspChar}*{EOL}    {
457*d219b4ceSAdam Hornacek        String capture = yytext();
458*d219b4ceSAdam Hornacek        yypushback(capture.length());
459*d219b4ceSAdam Hornacek        yypop();
460*d219b4ceSAdam Hornacek        onDisjointSpanChanged(null, yychar);
461*d219b4ceSAdam Hornacek    }
462*d219b4ceSAdam Hornacek}
463*d219b4ceSAdam Hornacek
464*d219b4ceSAdam Hornacek<YYINITIAL, INTRA> {
    // End of a code line. When maybeStartHere() returns true the EOL is
    // pushed back unread (presumably to be scanned again in a here-doc state
    // entered by maybeStartHere() -- confirm against the lexer class).
    // Otherwise the EOL is reported and the state goes back to YYINITIAL.
465*d219b4ceSAdam Hornacek    {WhspChar}*{EOL}    {
466*d219b4ceSAdam Hornacek        String capture = yytext();
467*d219b4ceSAdam Hornacek        if (maybeStartHere()) {
468*d219b4ceSAdam Hornacek            yypushback(capture.length());
469*d219b4ceSAdam Hornacek        } else {
470*d219b4ceSAdam Hornacek            onEndOfLineMatched(yytext(), yychar);
471*d219b4ceSAdam Hornacek            yybegin(YYINITIAL);
472*d219b4ceSAdam Hornacek        }
473*d219b4ceSAdam Hornacek    }
474*d219b4ceSAdam Hornacek}
475*d219b4ceSAdam Hornacek
476*d219b4ceSAdam Hornacek<YYINITIAL, INTRA, SCOMMENT, POD, QUO, QUOxN, QUOxL, QUOxLxN,
477*d219b4ceSAdam Hornacek    HERE, HERExN, HEREin, HEREinxN> {
478*d219b4ceSAdam Hornacek
    // Catch-all rules for every state listed above (QM is not among them).
    // Whitespace is offered as is, without chkLOC() or maybeIntraState().
479*d219b4ceSAdam Hornacek    // Only one whitespace char at a time or else {WxSigils} can be broken
480*d219b4ceSAdam Hornacek    {WhspChar} |
481*d219b4ceSAdam Hornacek    [[\s]--[\n\r]]    {
482*d219b4ceSAdam Hornacek        offer(yytext());
483*d219b4ceSAdam Hornacek    }
484*d219b4ceSAdam Hornacek    // Only one char at a time due to restriction on {WhspChar} above.
485*d219b4ceSAdam Hornacek    [^\n\r]    {
486*d219b4ceSAdam Hornacek        chkLOC();
487*d219b4ceSAdam Hornacek        maybeIntraState();
488*d219b4ceSAdam Hornacek        offer(yytext());
489*d219b4ceSAdam Hornacek    }
490*d219b4ceSAdam Hornacek}
491*d219b4ceSAdam Hornacek
492*d219b4ceSAdam Hornacek// "string links" and "comment links"
// Paths, file names and e-mail addresses are linked in comments, in the
// QUO/QUOxN quote states and in here-docs, but not in QUOxL/QUOxLxN. In each
// rule the handler is only invoked when takeAllContent() is true.
493*d219b4ceSAdam Hornacek<SCOMMENT, POD, QUO, QUOxN, HERE, HERExN, HEREin, HEREinxN> {
494*d219b4ceSAdam Hornacek    {FPath}    {
495*d219b4ceSAdam Hornacek        chkLOC();
496*d219b4ceSAdam Hornacek        if (takeAllContent()) {
497*d219b4ceSAdam Hornacek            onPathlikeMatched(yytext(), '/', false, yychar);
498*d219b4ceSAdam Hornacek        }
499*d219b4ceSAdam Hornacek    }
500*d219b4ceSAdam Hornacek
501*d219b4ceSAdam Hornacek    {File}    {
502*d219b4ceSAdam Hornacek        chkLOC();
503*d219b4ceSAdam Hornacek        if (takeAllContent()) {
504*d219b4ceSAdam Hornacek            String path = yytext();
505*d219b4ceSAdam Hornacek            onFilelikeMatched(path, yychar);
506*d219b4ceSAdam Hornacek        }
507*d219b4ceSAdam Hornacek    }
508*d219b4ceSAdam Hornacek
    // Something of the shape name@host.tld is treated as an e-mail address.
509*d219b4ceSAdam Hornacek    {FNameChar}+ "@" {FNameChar}+ "." {FNameChar}+    {
510*d219b4ceSAdam Hornacek        chkLOC();
511*d219b4ceSAdam Hornacek        if (takeAllContent()) {
512*d219b4ceSAdam Hornacek            onEmailAddressMatched(yytext(), yychar);
513*d219b4ceSAdam Hornacek        }
514*d219b4ceSAdam Hornacek    }
515*d219b4ceSAdam Hornacek}
516*d219b4ceSAdam Hornacek
517*d219b4ceSAdam Hornacek<SCOMMENT, POD, HERE, HERExN, HEREin, HEREinxN> {
    // URIs in comments and here-docs: no collateral-capture pattern is
    // supplied (null), unlike in the QUO/QUOxN rule for the same production.
518*d219b4ceSAdam Hornacek    {BrowseableURI}    {
519*d219b4ceSAdam Hornacek        chkLOC();
520*d219b4ceSAdam Hornacek        if (takeAllContent()) {
521*d219b4ceSAdam Hornacek            onUriMatched(yytext(), yychar, null);
522*d219b4ceSAdam Hornacek        }
523*d219b4ceSAdam Hornacek        // no skipLink() needed except in QUO* states
524*d219b4ceSAdam Hornacek    }
525*d219b4ceSAdam Hornacek}
526*d219b4ceSAdam Hornacek
527*d219b4ceSAdam Hornacek<QUO, QUOxN> {
    // URIs inside quotes: the pattern from getCollateralCapturePattern() is
    // passed along in both branches. When takeAllContent() is false the
    // match is handed to skipLink() instead of being linked.
528*d219b4ceSAdam Hornacek    {BrowseableURI}    {
529*d219b4ceSAdam Hornacek        chkLOC();
530*d219b4ceSAdam Hornacek        String capture = yytext();
531*d219b4ceSAdam Hornacek        Pattern collateralCapture = getCollateralCapturePattern();
532*d219b4ceSAdam Hornacek        if (takeAllContent()) {
533*d219b4ceSAdam Hornacek            onUriMatched(capture, yychar, collateralCapture);
534*d219b4ceSAdam Hornacek        } else {
535*d219b4ceSAdam Hornacek            skipLink(capture, collateralCapture);
536*d219b4ceSAdam Hornacek        }
537*d219b4ceSAdam Hornacek    }
538*d219b4ceSAdam Hornacek}
539