xref: /OpenGrok/opengrok-indexer/src/main/jflex/analysis/ruby/RubyProductions.lexh (revision d219b4cea555a12b602d2d5518daa22134ad4879)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * See LICENSE.txt included in this distribution for the specific
9 * language governing permissions and limitations under the License.
10 *
11 * When distributing Covered Code, include this CDDL HEADER in each
12 * file and include the License file at LICENSE.txt.
13 * If applicable, add the following below this CDDL HEADER, with the
14 * fields enclosed by brackets "[]" replaced with your own identifying
15 * information: Portions Copyright [yyyy] [name of copyright owner]
16 *
17 * CDDL HEADER END
18 */
19
20/*
21 * Copyright (c) 2010, 2016, Oracle and/or its affiliates. All rights reserved.
22 * Portions Copyright (c) 2017, 2019-2020, Chris Fraire <cfraire@me.com>.
23 */
24
25/*
26 * Regex productions shared between RubyXref and RubySymbolTokenizer
27 */
28
29MaybeWhsp     = {WhspChar}*
30
31/*
32 * globals_rdoc: Pre-defined variables
33 * regexp_rdoc: Special global variables
34 */
35SPIdentifier = \$ ( [\!\@\&\`\'\+\1\~\=\/\\\,\;\<\>\_\0\*\$\?\:\"] |
36    "-0" | "-a" | "-d" | "-F" | "-i" | "-I" | "-l" | "-p" | "-v" | "-w" |
37    [~&\`\'\+] | [0-9]+ )
38
39AnyIdentifier = ({Local_var} | {Instance_var} | {Class_var} | {Global_var} |
40    {Method_name})
41
42/*
43 * A local variable name must start with a lowercase US-ASCII letter or a
44 * character with the eighth bit set. Typically local variables are US-ASCII
45 * compatible since the keys to type them exist on all keyboards. (Ruby
46 * programs must be written in a US-ASCII-compatible character set. In such
47 * character sets if the eighth bit is set it indicates an extended character.
48 * Ruby allows local variables to contain such characters.)
49 *  A local variable name may contain letters, numbers, an _ (underscore or low
50 * line) or a character with the eighth bit set.
51 */
52Local_var = {Local_char1} {Local_nextchar}*
53Local_char1 = ([a-z] | {Char8})
54Local_nextchar = ([a-zA-Z0-9_] | {Char8})
55Char8 = [\xA0-\xFF]
56/*
57 * An instance variable must start with a @ (“at” sign or commercial at).
58 * Otherwise instance variable names follow the rules as local variable names.
59 * Since the instance variable starts with an @ the second character may be an
60 * upper-case letter.
61 */
62Instance_var = [@]{Local_nextchar}+
63/*
64 * A class variable must start with a @@ (two “at” signs). The rest of the name
65 * follows the same rules as instance variables.
66 */
67Class_var = [@][@]{Local_nextchar}+
68/*
69 * Global variables start with a $ (dollar sign). The rest of the name follows
70 * the same rules as instance variables.
71 */
72Global_var = [$]{Local_nextchar}+
73
74/*
75 * methods_rdoc: Method Names
76 *
77 * Method names may be one of the operators or must start with a letter or
78 * a character with the eighth bit set.
79 *
80 * Method names may end with a ! (bang or exclamation mark), a ? (question
81 * mark) or = equals sign.
82 *
83 * N.b. an '=' suffix is not included in {Method_name}, because that character
84 * in a name is aligned with the operation (assignment) and not with the target
85 * (variable). E.g., `def birthdate=` is for an assignment of `birthdate'.
86 */
87Method_name_base = ([a-zA-Z] | {Char8}) {Local_nextchar}*
88Method_name = {Method_name_base} [\!\?]?
89
90/*
91 * modules_and_classes_rdoc: Nesting
92 *
93 * You may also define inner modules using ::
94 */
95Modules_nested = {AnyIdentifier}("::"{AnyIdentifier})+
96
97/*
98 * literals_rdoc: Numbers
99 */
100Numeric_literal = ({Decimal_literal} | {Decimal_prefixed} | {Hex_prefixed} |
101    {Octal_prefixed} | {Binary_prefixed})
102/*
103 * You can write integers of any size as follows: 1234 1_234
104 * Floating point numbers may be written as follows: 12.34 1234e-2 1.234E1
105 */
106Decimal_literal = {Numeral} ([\.]{Numeral})? {Exponent}?
107Numeral = {Digit} ([_]? {Digit})*
108Exponent = [Ee] [\+\-]? {Numeral}
109Digit = [0-9]
110/*
111 * You can use a special prefix to write numbers in decimal, hexadecimal, octal
112 * or binary formats. For decimal numbers use a prefix of 0d, for hexadecimal
113 * numbers use a prefix of 0x, for octal numbers use a prefix of 0 or 0o, for
114 * binary numbers use a prefix of 0b. The alphabetic component of the number is
115 * not case-sensitive.
116 *
117 * Like integers and floats you may use an underscore for readability.
118 *
119 * Examples: 0d170 0D170 0xaa 0xAa 0xAA 0Xaa 0XAa 0XaA 0252 0o252 0O252
120 *     0b10101010 0B10101010
121 */
122Decimal_prefixed = [0][Dd] {Numeral}
123Hex_prefixed = [0][Xx] {Hex_numeral}
124Hex_numeral = {Hex_digit} ([_]? {Hex_digit})*
125Hex_digit = [0-9A-Fa-f]
126/*
127 * The "0"-prefixed octal number as a regex will be captured by
128 * {Decimal_literal} so it is not defined in {Octal_prefixed}.
129 */
130Octal_prefixed = [0][Oo]{Octal_numeral}
131Octal_numeral = {Octal_digit} ([_]? {Octal_digit})*
132Octal_digit = [0-7]
133Binary_prefixed = [0][Bb]{Binary_numeral}
134Binary_numeral = {Binary_digit} ([_]? {Binary_digit})*
135Binary_digit = [01]
136
137/*
138 * There is also a character literal notation to represent single character
139 * strings, which syntax is a question mark (?) followed by a single character
140 * or escape sequence that corresponds to a single codepoint in the script
141 * encoding: ?a #=> "a"  ?abc #=> SyntaxError  ?\n #=> "\n"  ?\s #=> " "
142 *     ?\\ #=> "\\"  ?\u{41} #=> "A"  ?\C-a #=> "\x01"  ?\M-a #=> "\xE1"
143 *     ?\M-\C-a #=> "\x81"  ?\C-\M-a #=> "\x81", same as above  ?あ #=> "あ"
144 * N.b. the Ruby rule about ?abc is not enforced in this regex.
145 */
146Character_literal = [?] ({Character_literal_esc} | [^\s])
147Character_literal_esc = [\\] ([MC][\-][^\s] | "u{" [0-9A-Fa-f]+ "}" |
148    "M-" [\\]? "C-" [^\s] | "C-" [\\]? "M-" [^\s] | [^\s])
149
150/*
151 * literals_rdoc: Strings
152 *
153 * The most common way of writing strings is using ". The string may be many
154 * lines long. Any internal " must be escaped. Strings may allow interpolation
155 * of other values using #{...}, or they may be cross-referenced as URLs or
156 * files, so they are handled as separate yy states.
157 */
158
159WxSigils = [[\W]--[\$\@\"\'\`\#\r\n]]
160
161FileExt = ([Rr][Bb] | [Rr][Uu][Bb][Yy] | [Dd][Ii][Ff][Ff] |
162    [Pp][Aa][Tt][Cc][Hh])
163File = [a-zA-Z]{FNameChar}* "." {FileExt}
164
165POD_begin = "=begin"
166POD_end = "=end"
167
168Quo0 =           [[\`\(\)\<\>\[\]\{\}\p{P}\p{S}]]
169QuoP =     [%]{Quo0}
170QuoPC =    [%][IQRSWX]{Quo0}
171QuoPC_xN = [%][iqrswx]{Quo0}
172
173Symbol =    [:]{AnyIdentifier}
174Symquo =    [:][\"]
175Symquo_xN = [:][\']
176
177//
178// Track some keywords that can be used to identify heuristically a possible
179// beginning of a regular-expression literal, /PATTERN/, where the first slash
180// cannot always be told apart from division. Heuristics using punctuation are
181// defined inline later in some rules.
182//
183Mwords_1 = ("and" | "or" | "not")
184Mwords_2 = ("begin" | "end" | "unless" | "until" | "when" | "while")
185Mwords = ({Mwords_1} | {Mwords_2})
186
187Mpunc1YYIN = [\(\!\[]
188Mpunc2IN = ([!=]"~" | [\:\?\=\+\-\<\>] | "=="[=]?|"!="|"<="|">="|"<=>"|"=>")
189
190Here_marker = {Local_nextchar}+
191Here_EOF1 = {Here_marker}
192Here_EOF2 = [\'][^\r\n\']*[\']
193Here_EOF3 = [\`][^\r\n\`]*[\`]
194
195/*
196 * YYINITIAL : nothing yet parsed or just after a non-continuation EOL
197 * INTRA : saw content from YYINITIAL but not yet other state or [{}] or non-
198 *     continuation {EOL}
199 * SCOMMENT : single-line comment
200 * POD : embedded documentation
201 * QUO : quote-like that is OK to match paths|files|URLs|e-mails
202 * QUOxN : "" but with no interpolation
203 * QUOxL : quote-like that is not OK to match paths|files|URLs|e-mails
204 *     because a non-traditional character is used as the quote-like delimiter
205 * QUOxLxN : "" but with no interpolation
206 * QM : a quote-like has ended, and quote modifier chars are awaited
207 * HERE : Here-docs
208 * HERExN : Here-docs with no interpolation
209 * HEREin : Indented Here-docs
210 * HEREinxN : Indented Here-docs with no interpolation
211 */
212%state INTRA SCOMMENT POD
213%state QUO QUOxN QUOxL QUOxLxN QM
214%state HERE HERExN HEREin HEREinxN
215
216%%
217<HERE, HERExN> {
218    ^ {Here_marker} / {MaybeWhsp}{EOL}    { // a bare word starting the line, with only whitespace after it
219        chkLOC();
220        maybeEndHere(yytext()); // helper not shown; presumably ends the here-doc when the word is the awaited marker
221    }
222}
223
224<HEREin, HEREinxN> {
225    ^ {MaybeWhsp} {Here_marker} / {MaybeWhsp}{EOL}    { // indented form: whitespace may precede the word
226        chkLOC();
227        maybeEndHere(yytext()); // yytext() includes the leading whitespace; the trailing part is lookahead only
228    }
229}
230
231<INTRA> {
232    // Syntax that switches back to YYINITIAL but preserves otherwise the stack
233    [\{] |
234    "&&" |
235    "||"    {
236        chkLOC();
237        yypushback(yytext().length()); // un-read the whole match so it is lexed again in YYINITIAL
238        yybegin(YYINITIAL); // yybegin rather than yypush or yypop, in line with the note on the stack above
239    }
240}
241
242<YYINITIAL, INTRA> {
243    [\\] {EOL}    { // backslash immediately followed by a line terminator
244        maybeIntraState();
245        offer("\\");
246        onEndOfLineMatched(yytext(), yychar); // unlike the plain end-of-line rule, no yybegin(YYINITIAL) follows
247    }
248
249    "<<"[~\-]? ({Here_EOF1} | {Here_EOF2} | {Here_EOF3})    { // here-doc operator with a bare, single-quoted or backquoted marker
250        chkLOC();
251        maybeIntraState();
252        hop(yytext()); // helper not shown; presumably queues the here-doc that maybeStartHere() later begins
253    }
254
255    {Instance_var} | {Class_var} | {Global_var} | {Symbol}    {
256        chkLOC();
257        maybeIntraState();
258        // Names carrying a sigil (@, @@, $, :) pass true as the last argument.
259        if (offerSymbol(yytext(), 0, true) && returnOnSymbol()) {
260            return yystate();
261        }
262    }
263
264    {Local_var} | {Method_name}    {
265        chkLOC();
266        maybeIntraState();
267        // Plain names pass false as the last argument.
268        if (offerSymbol(yytext(), 0, false) && returnOnSymbol()) {
269            return yystate();
270        }
271    }
272
273    {SPIdentifier}    {
274        chkLOC();
275        maybeIntraState();
276        offerKeyword(yytext()); // pre-defined $-variables go through offerKeyword(), not offerSymbol()
277    }
278
279    {Modules_nested}    {
280        chkLOC();
281        maybeIntraState();
282        takeModules(yytext()); // the whole A::B::C text is handed to one helper (not shown)
283    }
284
285    {Character_literal}    {
286        chkLOC();
287        maybeIntraState();
288        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar); // string-class span set for the literal only
289        offer(yytext());
290        onDisjointSpanChanged(null, yychar); // span reset to null right after the literal
291    }
292
293    {Numeric_literal}    {
294        chkLOC();
295        maybeIntraState();
296        onDisjointSpanChanged(HtmlConsts.NUMBER_CLASS, yychar); // number-class span set for the literal only
297        offer(yytext());
298        onDisjointSpanChanged(null, yychar); // span reset to null right after the literal
299    }
300
301    \"    { chkLOC(); qop(yytext(), 0, false); /* last argument false here; it is true for the xN (no interpolation) forms */ }
302    \'    { chkLOC(); qop(yytext(), 0, true); /* last argument true, as for the other xN (no interpolation) forms */ }
303    \#    { // start of a single-line comment
304        yypush(SCOMMENT); // popped again by the end-of-line rule in <SCOMMENT>
305        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
306        offer(yytext());
307    }
308
309    // Two-char quote names (% plus a letter), optionally after a {WxSigils} spacer that raises the qop() count by one
310    ^ {QuoPC}                   { chkLOC(); qop(yytext(), 2, false); }
311    {WxSigils}{QuoPC}           { chkLOC(); qop(yytext(), 3, false); }
312    ^ {QuoPC_xN}                { chkLOC(); qop(yytext(), 2, true); }
313    {WxSigils}{QuoPC_xN}        { chkLOC(); qop(yytext(), 3, true); }
314
315    // One-char quote names (% alone, or : before a quote mark), optionally after a {WxSigils} spacer (count again one higher)
316    ^ {QuoP}                    { chkLOC(); qop(yytext(), 1, false); }
317    {WxSigils}{QuoP}            { chkLOC(); qop(yytext(), 2, false); }
318    ^ {Symquo}                  { chkLOC(); qop(yytext(), 1, false); }
319    {WxSigils}{Symquo}          { chkLOC(); qop(yytext(), 2, false); }
320    ^ {Symquo_xN}               { chkLOC(); qop(yytext(), 1, true); }
321    {WxSigils}{Symquo_xN}       { chkLOC(); qop(yytext(), 2, true); }
322
323    // POD-end without having seen POD-begin is akin to a one-line comment
324    ^ {POD_end} [^\n\r]*    { // the rest of the line is taken together with the marker
325        offer(yytext()); // emitted as-is: no state is pushed and no span is changed in this action
326    }
327
328    // POD start
329    ^ {POD_begin}   { // only recognized at the start of a line
330        yypush(POD); // popped by the line-start =end rule in <POD>
331        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
332        offer(yytext());
333    }
334
335    [\{\}]    {
336        chkLOC();
337        String capture = yytext();
338        if (!maybeEndInterpolation(capture)) { // helper not shown; presumably true when this brace closes an interpolation
339            offer(capture); // otherwise the brace is written as ordinary content
340        }
341    }
342}
343
344<YYINITIAL> {
345    "/"    { // a slash met in YYINITIAL (this rule is not active in INTRA) is handled as the start of a pattern
346        chkLOC();
347        // OK to pass a fake "m/" with doWrite=false
348        qop(false, "m/", 1, false);
349        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
350        offer(yytext()); // the text actually written is just the slash
351    }
352}
353
354<YYINITIAL, INTRA> {
355    // Use some heuristics to identify a slash that opens a /PATTERN/ literal.
356    // We can't handle all possible appearances, because the first slash
357    // cannot always be distinguished from division (/) without true parsing.
358    // Here: a slash that follows one of ( ! [ with optional whitespace between.
359
360    {Mpunc1YYIN} \s* "/"    { chkLOC(); hqopPunc(yytext()); }
361}
362
363<INTRA> {
364    // Continue with more punctuation heuristics: in INTRA only, a slash after one of the operators in {Mpunc2IN}
365
366    {Mpunc2IN} \s* "/"      { chkLOC(); hqopPunc(yytext()); }
367}
368
369<YYINITIAL, INTRA> {
370    // Keyword-based heuristics for a slash that opens a pattern
371
372    ^ {Mwords} \s* "/"    {
373        chkLOC();
374        hqopSymbol(yytext());
375    }
376
377    {WxSigils}{Mwords} \s* "/"    {
378        chkLOC();
379        final String matched = yytext();
380        if (takeAllContent()) {
381            // The first char is the {WxSigils} boundary, not part of the keyword.
382            offer(matched.substring(0, 1));
383        }
384        hqopSymbol(matched.substring(1));
385    }
386}
387
388<QUO, QUOxN, QUOxL, QUOxLxN, HERE, HEREin> {
389    \\ \S    { // backslash plus one non-whitespace char
390        chkLOC();
391        offer(yytext()); // taken as a pair, so the escaped char is not seen by the single-char delimiter rule
392    }
393}
394
395<QUO, QUOxL, HERE, HEREin> {
396    "#{"    { // start of an embedded expression in an interpolating quote or here-doc
397        chkLOC();
398        offer(yytext());
399        onDisjointSpanChanged(null, yychar);
400        yypush(YYINITIAL); // the expression is lexed with the ordinary YYINITIAL rules
401        pushData();
402        interpop();
403    }
404}
405
406<QUO, QUOxN, QUOxL, QUOxLxN> {
407    {Quo0}    { // any char that could serve as a quote delimiter
408        chkLOC();
409        String capture = yytext();
410        offer(capture);
411        if (maybeEndQuote(capture)) { // helper not shown; presumably true when this char closes the quote
412            yypop();
413            if (areModifiersOK()) {
414                yypush(QM); // await trailing modifier letters
415            }
416            onDisjointSpanChanged(null, yychar);
417        }
418    }
419}
420
421<QUO, QUOxN, QUOxL, QUOxLxN, HERE, HERExN, HEREin, HEREinxN> {
422    {WhspChar}*{EOL}    { // the span is reset around the line break and set to the string class again after it
423        onDisjointSpanChanged(null, yychar);
424        onEndOfLineMatched(yytext(), yychar);
425        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
426    }
427}
428
429<QM> {
430    // /PAT/imxouesn
431    [a-z]    {
432        chkLOC();
433        offer(yytext()); // any lowercase ASCII letter is accepted, not only the ones listed above
434    }
435
436    // anything else ends the quote-modifiers state
437    [^]    {
438        yypop();
439        yypushback(yytext().length()); // give the char back so the restored state lexes it
440    }
441}
442
443<POD> {
444    ^ {POD_end} [^\n\r]*    { // =end at line start, with the rest of that line
445        yypop();
446        offer(yytext()); // written before the span is reset on the next line
447        onDisjointSpanChanged(null, yychar);
448    }
449
450    {WhspChar}*{EOL}    {
451        onEndOfLineMatched(yytext(), yychar); // no span change here, unlike the quote and here-doc end-of-line rule
452    }
453}
454
455<SCOMMENT> {
456    {WhspChar}*{EOL}    {
457        // The line terminator closes the comment but is not consumed here.
458        yypushback(yytext().length());
459        yypop();
460        onDisjointSpanChanged(null, yychar);
461    }
462}
463
464<YYINITIAL, INTRA> {
465    {WhspChar}*{EOL}    {
466        final String eol = yytext();
467        if (!maybeStartHere()) {
468            onEndOfLineMatched(eol, yychar);
469            yybegin(YYINITIAL);
470        } else {
471            yypushback(eol.length()); // presumably a here-doc has begun: the line break is left for its state
472        }
473    }
474}
475
476<YYINITIAL, INTRA, SCOMMENT, POD, QUO, QUOxN, QUOxL, QUOxLxN,
477    HERE, HERExN, HEREin, HEREinxN> {
478
479    // Only one whitespace char at a time or else {WxSigils} can be broken
480    {WhspChar} |
481    [[\s]--[\n\r]]    {
482        offer(yytext()); // whitespace: written without chkLOC() or maybeIntraState()
483    }
484    // Only one char at a time due to restriction on {WhspChar} above.
485    [^\n\r]    {
486        chkLOC();
487        maybeIntraState();
488        offer(yytext()); // fallback for any single char no longer rule matched
489    }
490}
491
492// "string links" and "comment links"
493<SCOMMENT, POD, QUO, QUOxN, HERE, HERExN, HEREin, HEREinxN> {
494    {FPath}    {
495        chkLOC();
496        if (takeAllContent()) {
497            onPathlikeMatched(yytext(), '/', false, yychar);
498        }
499    }
500
501    {File}    {
502        chkLOC();
503        if (takeAllContent()) {
504            // File names go to their own callback, separate from paths.
505            onFilelikeMatched(yytext(), yychar);
506        }
507    }
508
509    {FNameChar}+ "@" {FNameChar}+ "." {FNameChar}+    {
510        chkLOC();
511        if (takeAllContent()) {
512            onEmailAddressMatched(yytext(), yychar);
513        }
514    }
515}
516
517<SCOMMENT, POD, HERE, HERExN, HEREin, HEREinxN> {
518    {BrowseableURI}    {
519        chkLOC();
520        if (takeAllContent()) {
521            onUriMatched(yytext(), yychar, null); // null where the QUO rule passes a collateral-capture pattern
522        }
523        // no skipLink() needed except in QUO* states
524    }
525}
526
527<QUO, QUOxN> {
528    {BrowseableURI}    {
529        chkLOC();
530        final String uri = yytext();
531        final Pattern collateral = getCollateralCapturePattern();
532        if (!takeAllContent()) {
533            skipLink(uri, collateral);
534        } else {
535            onUriMatched(uri, yychar, collateral);
536        }
537    }
538}
539