xref: /OpenGrok/opengrok-indexer/src/main/jflex/analysis/perl/PerlProductions.lexh (revision d219b4cea555a12b602d2d5518daa22134ad4879)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * See LICENSE.txt included in this distribution for the specific
9 * language governing permissions and limitations under the License.
10 *
11 * When distributing Covered Code, include this CDDL HEADER in each
12 * file and include the License file at LICENSE.txt.
13 * If applicable, add the following below this CDDL HEADER, with the
14 * fields enclosed by brackets "[]" replaced with your own identifying
15 * information: Portions Copyright [yyyy] [name of copyright owner]
16 *
17 * CDDL HEADER END
18 */
19
20/*
21 * Copyright (c) 2010, 2016, Oracle and/or its affiliates. All rights reserved.
22 * Portions Copyright (c) 2017, 2019, Chris Fraire <cfraire@me.com>.
23 */
24
25/*
26 * Regex productions shared between PerlXref and PerlSymbolTokenizer
27 */
28
// Zero or more {WhspChar} characters ({WhspChar} is declared outside this
// file).
MaybeWhsp  = {WhspChar}*
// A bareword name: an ASCII letter or underscore, followed by any number of
// ASCII letters, digits or underscores.
Identifier = [a-zA-Z_] [a-zA-Z0-9_]*
// The five characters that may prefix a name: $ @ % & *
Sigils     = [\$\@\%\&\*]
// One non-word character that is neither a sigil, a quote, a backtick, a
// hash, nor CR/LF. The rules place it in front of quote-like operator names
// as a one-character boundary.
WxSigils   = [[\W]--[\$\@\%\&\*\"\'\`\#\r\n]]
33
// Perl special identifiers. Four of the six forms described at
// https://perldoc.perl.org/perldata.html#Identifier-parsing are covered:
//
// Form 1: a sigil followed solely by digits matching \p{POSIX_Digit},
// e.g. $0, $1 or $10000.
SPIdentifier1 = \$ \d+

// Form 2(a): a sigil followed by a single character matching the
// \p{POSIX_Punct} property, e.g. $! or %+ -- but "{" is not accepted.
SPIdentifier2Xquo = [\$\%] [[\p{P}--{]]

// Form 2(b): as 2(a), except that neither "{" nor "\" is accepted.
SPIdentifier2Quo = [\$\%] [[\p{P}--[{\\]]]

// Form 3: a sigil, a caret, and any one of the characters [][A-Z^_?\\],
// e.g. $^V or $^] .
SPIdentifier3 = "$^" [\[\]A-Z\^_?\\]

// Form 4: similar to form 3 but written as bareword text in braces whose
// first character is a caret. The caret is followed by any one of the
// characters [][A-Z^_?\\] and then by ASCII word characters (ASCII \w is the
// 63 characters [a-zA-Z0-9_]). Example: ${^GLOBAL_PHASE} .
SPIdentifier4 = "${^" [\[\]A-Z\^_?\\] [a-zA-Z0-9_]* "}"
58
// A prototype attribute: parentheses around any mix of (optionally
// backslash-prefixed) sigils, semicolons and whitespace. It has to be
// recognized explicitly, because otherwise "($)" can be mistaken for an
// SPIdentifier2.
ProtoAttr = "(" ( \\? {Sigils} | ";" | {WhspChar}+ )* ")"
62
// File-name extensions treated as file-like; each is accepted in all-lower
// or all-upper case only.
FileExt = ("pl"   | "PL"   | "perl"  | "PERL"  | "pm"   | "PM"  |
           "conf" | "CONF" | "txt"   | "TXT"   | "htm"  | "HTM" |
           "html" | "HTML" | "xml"   | "XML"   | "ini"  | "INI" |
           "diff" | "DIFF" | "patch" | "PATCH")
// A file name: an ASCII letter, any number of {FNameChar} (declared outside
// this file), a dot, and one of the extensions above.
File = [a-zA-Z] {FNameChar}* "." {FileExt}
66
// Perl numeric literals:
//   - hexadecimal (0x / 0X) and binary (0b / 0B), with optional "_" digit
//     separators after the first digit, e.g. 0xdead_beef or 0b0110_1011;
//   - decimal integers and fractions with optional "_" separators, e.g.
//     4_294_967_296 or 3.14_15_92, optionally followed by an exponent.
// The exponent is attached to the decimal form only. Were it also allowed
// after a hexadecimal literal, the longest match for "0x1Fe+5" would swallow
// "e+5" as an exponent, although Perl reads that text as 0x1Fe plus 5.
Number = (0[xX][0-9a-fA-F][0-9a-fA-F_]* |
          0[bB][01][01_]* |
          [0-9][0-9_]* (\.[0-9][0-9_]*)? ([eE][+-]?[0-9]+)?)
68
// The directive that ends a POD section.
PodEND = "=cut"

// Characters usable as quote-like delimiters: backtick, the bracketing
// characters, and anything in the Unicode punctuation (\p{P}) or symbol
// (\p{S}) categories.
Quo0           = [\`\(\)\<\>\[\]\{\}\p{P}\p{S}]
// ... the same set without "#"
Quo0xHash      = [[\`\(\)\<\>\[\]\{\}\p{P}\p{S}]--\#]
// ... the same set without "#" and without the apostrophe
Quo0xHashxApos = [[\`\(\)\<\>\[\]\{\}\p{P}\p{S}]--[\#\']]
74
// Openers of the one-letter quote-like operators. m and s are grouped, as
// are q and y. Each opener ends with the first delimiter character:
//   *apos : an apostrophe, after optional whitespace
//   *hash : "#" immediately after the operator letter
//   *punc : one of the punctuation delimiters, after optional whitespace
//   *word : a word character, after mandatory whitespace
MSapos = [ms] {MaybeWhsp} "'"
MShash = [ms] "#"
MSpunc = [ms] {MaybeWhsp} {Quo0xHashxApos}
MSword = [ms] {WhspChar}+ \w
QYhash = [qy] "#"
QYpunc = [qy] {MaybeWhsp} {Quo0xHash}
QYword = [qy] {WhspChar}+ \w
82
// Openers of qq, qx and qr. The apostrophe form is defined for qx and qr
// only; the hash, punctuation and word-character forms cover all three.
QXRapos  = "q" [xr] {MaybeWhsp} "'"
QQXRhash = "q" [qxr] "#"
QQXRPunc = "q" [qxr] {MaybeWhsp} {Quo0xHash}
QQXRword = "q" [qxr] {WhspChar}+ \w
87
// Openers of qw and tr with a "#" delimiter, a punctuation delimiter, or a
// word-character delimiter (the last needs whitespace after the operator).
QWhash = "qw" "#"
QWpunc = "qw" {MaybeWhsp} {Quo0xHash}
QWword = "qw" {WhspChar}+ \w
TRhash = "tr" "#"
TRpunc = "tr" {MaybeWhsp} {Quo0xHash}
TRword = "tr" {WhspChar}+ \w
94
// Here-document terminator words as they appear after "<<":
//   1: double-quoted, 2: backticked, 3: single-quoted (none of these may
//   span lines or contain their own quote character),
//   4: a bare identifier, optionally preceded by a backslash.
HereEOF1 = \" [^\r\n\"]* \"
HereEOF2 = \` [^\r\n\`]* \`
HereEOF3 = \' [^\r\n\']* \'
HereEOF4 = \\? {Identifier}
99
//
// Keywords after which a "/" is heuristically taken as the start of the
// shortcut syntax, //, for m//. Any perlfunc that takes /PATTERN/ is included
// as well. The punctuation-based heuristics are the Mpunc* macros below and
// the rules that use them.
//
Mwords_1 = ("eq" | "ne" | "le" | "ge" | "lt" | "gt" | "cmp")
Mwords_2 = ("if" | "unless" | "or" | "and" | "not")
Mwords_3 = ("split" | "grep")
Mwords   = ({Mwords_1} | {Mwords_2} | {Mwords_3})

// Punctuation heuristic used in both YYINITIAL and INTRA.
Mpunc1YYIN = [\(\!]
// Punctuation heuristic used in INTRA only.
Mpunc2IN = ( [!=]"~" |
             [\:\?\=\+\-\<\>] |
             "==" | "!=" | "<=" | ">=" | "<=>" | "=>" )
113
//
// Quoting has two dimensions: "link"-or-not and "interpolate"-or-not.
// Because the %state values cannot be controlled, a cross-product of states
// has to be declared. (Technically, jflex does not guarantee that state
// values are unique, but states that do not have identical rules will have
// different values. The four "QUO" states below satisfy that criterion, as
// do the four "HERE" states.)
//
// YYINITIAL : nothing yet parsed or just after a non-quoted [;{}]
// INTRA     : saw content from YYINITIAL but not yet other state or [;{}]
// SCOMMENT  : single-line comment
// POD       : Perl Plain-Old-Documentation
// FMT       : an output record format
// QUO       : quote-like that is OK to match paths|files|URLs|e-mails
// QUOxN     : same as QUO but with no interpolation
// QUOxL     : quote-like that is not OK to match paths|files|URLs|e-mails
//             because a non-traditional character is used as the quote-like
//             delimiter
// QUOxLxN   : same as QUOxL but with no interpolation
// QM        : a quote-like has ended, and quote modifier chars are awaited
// HERE      : Here-docs
// HERExN    : Here-docs with no interpolation
// HEREin    : Indented Here-docs
// HEREinxN  : Indented Here-docs with no interpolation
//
// The declaration order below is kept as-is on purpose.
%state INTRA
%state SCOMMENT POD FMT
%state QUO QUOxN QUOxL QUOxLxN
%state QM
%state HERE HERExN HEREin HEREinxN
141
142%%
<HERE, HERExN> {
    // An identifier at the very start of a line, followed only by optional
    // whitespace up to the line end, is handed to maybeEndHere().
    ^ {Identifier} / {MaybeWhsp}{EOL}    {
        chkLOC();
        final String candidate = yytext();
        maybeEndHere(candidate);
    }
}
149
<HEREin, HEREinxN> {
    // Like the non-indented states, except that the identifier may be
    // preceded by whitespace; that leading whitespace is part of the text
    // handed to maybeEndHere().
    ^ {MaybeWhsp} {Identifier} / {MaybeWhsp}{EOL}    {
        chkLOC();
        final String candidate = yytext();
        maybeEndHere(candidate);
    }
}
156
<YYINITIAL, INTRA>{
    // Syntax that jumps back to YYINITIAL, part 1. The alternatives share a
    // single action through the OR-syntax ("|").
    {ProtoAttr} |
    "||" |
    "&&" |
    [;\{]    {
        chkLOC();
        yyjump(YYINITIAL);
        offer(yytext());
    }

    // Syntax that jumps back to YYINITIAL, part 2. This rule uses a
    // look-ahead, so it is kept apart from part 1: with the OR-syntax it
    // seems the look-ahead would apply to all cases.
    "}" / {MaybeWhsp} {EOL}    {
        chkLOC();
        yyjump(YYINITIAL);
        offer(yytext());
    }

    // Here-document operator, optionally followed by "~", with a quoted,
    // backticked or bare terminator word.
    "<<" "~"? {MaybeWhsp} ({HereEOF1} | {HereEOF2} | {HereEOF3} | {HereEOF4})    {
        chkLOC();
        hop(yytext());
    }

    // Bareword: offered as a symbol; the scanner returns to its caller only
    // when the symbol was accepted and returnOnSymbol() agrees.
    {Identifier}    {
        chkLOC();
        maybeIntraState();
        final boolean offered = offerSymbol(yytext(), 0, false);
        if (offered && returnOnSymbol()) {
            return yystate();
        }
    }

    // <file> or <path>: the angle brackets are written literally and only
    // the text between them is reported as file-like.
    "<" ({File} | {FPath}) ">"    {
        chkLOC();
        maybeIntraState();
        if (takeAllContent()) {
            final String bracketed = yytext();
            final String inner = bracketed.substring(1,
                bracketed.length() - 1);
            offer("<");
            onFilelikeMatched(inner, yychar);
            offer(">");
        }
    }

    // Numeric literal, wrapped in its own span.
    {Number}    {
        chkLOC();
        maybeIntraState();
        onDisjointSpanChanged(HtmlConsts.NUMBER_CLASS, yychar);
        offer(yytext());
        onDisjointSpanChanged(null, yychar);
    }

    // Plain quotes: double quote and backtick versus apostrophe.
    [\"\`]    {
        chkLOC();
        qop(yytext(), 0, false);
    }
    \'    {
        chkLOC();
        qop(yytext(), 0, true);
    }

    // Start of a single-line comment.
    \#    {
        yypush(SCOMMENT);
        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
        offer(yytext());
    }

    // qq//, qx//, qw//, qr/, tr/// and variants -- all with 2 character
    // names. Each operator is accepted either at the start of a line or
    // after a {WxSigils} boundary character.

    // qx'' qr''
    ^ {QXRapos} |
    {WxSigils}{QXRapos}    {
        chkLOC();
        qop(yytext(), 2, true);
    }

    // qq, qx, qr with other delimiters, and qw. In Perl, qw// does not
    // actually "interpolate," but "interpolate" for OpenGrok xref just means
    // to cross-reference, which is appropriate for qw//.
    ^ {QQXRhash} |
    {WxSigils}{QQXRhash} |
    ^ {QQXRPunc} |
    {WxSigils}{QQXRPunc} |
    ^ {QQXRword} |
    {WxSigils}{QQXRword} |
    ^ {QWhash} |
    {WxSigils}{QWhash} |
    ^ {QWpunc} |
    {WxSigils}{QWpunc} |
    ^ {QWword} |
    {WxSigils}{QWword}    {
        chkLOC();
        qop(yytext(), 2, false);
    }

    // tr
    ^ {TRhash} |
    {WxSigils}{TRhash} |
    ^ {TRpunc} |
    {WxSigils}{TRpunc} |
    ^ {TRword} |
    {WxSigils}{TRword}    {
        chkLOC();
        qop(yytext(), 2, true);
    }

    // Capture a hyphen plus [qmsy] so that the combination is not mistaken
    // for q//, m//, s//, y// and variants.
    ^ "-" [qmsy] |
    {WxSigils} "-" [qmsy]    {
        chkLOC();
        maybeIntraState();
        offer(yytext());
    }

    // q//, m//, s//, y// and variants -- all with 1 character names.

    // m'' s''
    ^ {MSapos} |
    {WxSigils}{MSapos}    {
        chkLOC();
        qop(yytext(), 1, true);
    }

    // m and s with other delimiters
    ^ {MShash} |
    {WxSigils}{MShash} |
    ^ {MSpunc} |
    {WxSigils}{MSpunc} |
    ^ {MSword} |
    {WxSigils}{MSword}    {
        chkLOC();
        qop(yytext(), 1, false);
    }

    // q and y
    ^ {QYhash} |
    {WxSigils}{QYhash} |
    ^ {QYpunc} |
    {WxSigils}{QYpunc} |
    ^ {QYword} |
    {WxSigils}{QYword}    {
        chkLOC();
        qop(yytext(), 1, true);
    }

    // Seeing POD-end without having seen POD-start is akin to a one-line
    // comment: the rest of the line is written as-is.
    ^ {PodEND} [^\n\r]*    {
        offer(yytext());
    }

    // POD start: "=" and a directive name at the start of a line.
    ^ "=" {Identifier}    {
        yypush(POD);
        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
        offer(yytext());
    }

    // FORMAT start: "format", an optional name, and "=" as the last thing on
    // the line.
    ^ {MaybeWhsp} "format" ({WhspChar}+ {Identifier})? {MaybeWhsp} "=" /
        {MaybeWhsp}{EOL}    {
        chkLOC();
        yypush(FMT);
        if (takeAllContent()) {
            // Split the match after "format" (keeping any leading
            // whitespace with it) so that the head can go through keyword
            // processing and the tail is written plainly.
            final String whole = yytext();
            final String trimmed = whole.replaceFirst("^\\s+", "");
            final String tail = trimmed.substring("format".length());
            final int headLength = whole.length() - tail.length();

            offerKeyword(whole.substring(0, headLength));
            offer(tail);
            // start a pseudo-"string"
            onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
        }
    }
}
300
<YYINITIAL> {
    // A bare slash in YYINITIAL is treated as the opening of a match
    // operator.
    "/"    {
        chkLOC();
        // Passing a fake "m/" is OK because doWrite is false; the real
        // slash is written just below.
        qop(false, "m/", 1, false);
        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
        final String slash = yytext();
        offer(slash);
    }
}
310
<YYINITIAL, INTRA> {
    // Heuristics for spotting the double-slash syntax of the m// operator.
    // Not every appearance of `//' can be handled, because without true
    // parsing the first slash cannot always be told apart from division (/).

    {Mpunc1YYIN} \s* "/"    {
        chkLOC();
        hqopPunc(yytext());
    }
}
319
<INTRA> {
    // Further punctuation heuristics, applied in INTRA only.

    {Mpunc2IN} \s* "/"    {
        chkLOC();
        hqopPunc(yytext());
    }
}
325
<YYINITIAL, INTRA> {
    // Keyword heuristics: one of {Mwords}, optional whitespace, and a slash
    // are taken as the start of a match operator.

    // Keyword at the start of a line: the whole match goes to hqopSymbol().
    ^ {Mwords} \s* "/"    {
        chkLOC();
        hqopSymbol(yytext());
    }

    // Keyword after a {WxSigils} boundary character: the boundary is written
    // on its own and only the remainder goes to hqopSymbol().
    {WxSigils}{Mwords} \s* "/"    {
        chkLOC();
        String capture = yytext();
        // {WxSigils} matches one character, but that character occupies two
        // Java chars when it is a supplementary code point (possible if the
        // scanner is generated in Unicode mode). Split on the code-point
        // boundary so that a surrogate pair is never cut in half.
        int boundaryLength = Character.charCount(capture.codePointAt(0));
        if (takeAllContent()) {
            String boundary = capture.substring(0, boundaryLength);
            offer(boundary);
        }
        hqopSymbol(capture.substring(boundaryLength));
    }
}
344
<YYINITIAL, INTRA> {
    // A sigil, optional whitespace, and an identifier.
    {Sigils} {MaybeWhsp} {Identifier}    {
        chkLOC();
        maybeIntraState();
        // Keywords are ignored when the identifier starts with a sigil, so
        // the text goes to sigilID() rather than through symbol offering.
        final String sigiled = yytext();
        sigilID(sigiled);
        if (returnOnSymbol()) {
            return yystate();
        }
    }
}
356
<YYINITIAL, INTRA, FMT, QUO, QUOxL, HERE, HEREin> {
    // A sigil followed by an identifier in braces, with optional whitespace
    // around each part.
    {Sigils} {MaybeWhsp} "{" {MaybeWhsp} {Identifier} {MaybeWhsp} "}"    {
        chkLOC();
        maybeIntraState();
        // Keywords are ignored when the identifier starts with a sigil.
        final String braced = yytext();
        bracedSigilID(braced);
        if (returnOnSymbol()) {
            return yystate();
        }
    }

    // Special identifiers of forms 1, 3 and 4. Form 2 is handled by separate
    // rules because its definition differs between quoting and non-quoting
    // states.
    {SPIdentifier4} |
    {SPIdentifier3} |
    {SPIdentifier1}    {
        chkLOC();
        maybeIntraState();
        specialID(yytext());
    }
}
376
<YYINITIAL, INTRA, FMT, HERE, HEREin> {
    // Form-2 special identifier for the states outside QUO and QUOxL: any
    // punctuation character except "{" may follow the sigil.
    {SPIdentifier2Xquo}    {
        chkLOC();
        maybeIntraState();
        final String special = yytext();
        specialID(special);
    }
}
384
<QUO, QUOxL> {
    // Form-2 special identifier inside QUO and QUOxL: besides "{", a
    // backslash is not accepted after the sigil either.
    {SPIdentifier2Quo}    {
        chkLOC();
        maybeIntraState();
        final String special = yytext();
        specialID(special);
    }
}
392
<FMT, QUO, QUOxL, HERE, HEREin> {
    // A sigil immediately followed by an identifier; unlike in YYINITIAL and
    // INTRA, no whitespace is allowed between the two here.
    {Sigils} {Identifier}    {
        chkLOC();
        // Keywords are ignored when the identifier starts with a sigil.
        final String sigiled = yytext();
        sigilID(sigiled);
        if (returnOnSymbol()) {
            return yystate();
        }
    }
}
403
<FMT, QUO, QUOxN, QUOxL, QUOxLxN, HERE, HEREin> {
    // A backslash and the non-whitespace character after it are consumed
    // together and written as-is, so the second character is not seen by any
    // other rule.
    \\ \S    {
        chkLOC();
        final String escaped = yytext();
        offer(escaped);
    }
}
410
<QUO, QUOxN, QUOxL, QUOxLxN> {
    // Every potential delimiter character -- a word character or one of
    // {Quo0} -- is written and then checked with maybeEndQuote().
    \w |
    {Quo0}    {
        chkLOC();
        final String delimiter = yytext();
        offer(delimiter);
        if (maybeEndQuote(delimiter)) {
            // Leave the quoting state; if areModifiersOK() allows it, enter
            // QM for the modifier characters. The span is closed either way.
            yypop();
            if (areModifiersOK()) {
                yypush(QM);
            }
            onDisjointSpanChanged(null, yychar);
        }
    }
}
426
<FMT, QUO, QUOxN, QUOxL, QUOxLxN, HERE, HERExN, HEREin, HEREinxN> {
    // End of line inside a string-like state: the current span is closed
    // before the line end is reported, and a string span is opened again
    // afterwards.
    {WhspChar}*{EOL}    {
        final String lineEnd = yytext();
        onDisjointSpanChanged(null, yychar);
        onEndOfLineMatched(lineEnd, yychar);
        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
    }
}
434
<QM> {
    // Modifier letters that may trail a quote-like operator:
    //   m/PATTERN/msixpodualngc and /PATTERN/msixpodualngc
    //   qr/STRING/msixpodualn
    //   s/PATTERN/REPLACEMENT/msixpodualngcer
    //   tr/SEARCHLIST/REPLACEMENTLIST/cdsr
    //   y/SEARCHLIST/REPLACEMENTLIST/cdsr
    // Any lower-case ASCII letter is accepted and written.
    [a-z]    {
        chkLOC();
        offer(yytext());
    }

    // Anything else ends the quote-modifiers state; the character is pushed
    // back so that it is scanned again in the state being returned to.
    [^]    {
        yypop();
        final int matchedLength = yytext().length();
        yypushback(matchedLength);
    }
}
452
<POD> {
    // "=cut" at the start of a line ends the POD section; the rest of that
    // line is written before the span is closed.
    ^ {PodEND} [^\n\r]*    {
        yypop();
        offer(yytext());
        onDisjointSpanChanged(null, yychar);
    }

    // Line ends, with any trailing whitespace, are reported as end-of-line.
    {WhspChar}*{EOL}    {
        final String lineEnd = yytext();
        onEndOfLineMatched(lineEnd, yychar);
    }
}
464
<FMT> {
    // A format is terminated by a line consisting of a single "." (trailing
    // whitespace allowed).
    ^ "." / {MaybeWhsp} {EOL}    {
        chkLOC();
        yypop();
        offer(yytext());
        onDisjointSpanChanged(null, yychar);
    }

    // "A comment, indicated by putting a '#' in the first column."
    ^ "#"    {
        yypush(SCOMMENT);
        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
        offer(yytext());
    }

    // A format FORMLIST has two other types of line -- "a 'picture' line
    // giving the format for one output line" and "an argument line supplying
    // values to plug into the previous picture line" -- and this lexer does
    // not handle either of them in a particular way.
}
486
<SCOMMENT> {
    // A line end terminates the comment. The matched text is pushed back so
    // that it is scanned again in the state being returned to, and the
    // comment span is closed.
    {WhspChar}*{EOL}    {
        yypushback(yytext().length());
        yypop();
        onDisjointSpanChanged(null, yychar);
    }
}
495
<YYINITIAL, INTRA> {
    // Line end in ordinary code. When maybeStartHere() returns true, the
    // line end is pushed back to be scanned again; otherwise it is reported
    // as a normal end-of-line.
    {WhspChar}*{EOL}    {
        final String lineEnd = yytext();
        if (!maybeStartHere()) {
            onEndOfLineMatched(lineEnd, yychar);
        } else {
            yypushback(lineEnd.length());
        }
    }
}
506
<YYINITIAL, INTRA, SCOMMENT, POD, FMT, QUO, QUOxN, QUOxL, QUOxLxN,
    HERE, HERExN, HEREin, HEREinxN> {

    // Whitespace other than CR/LF is written one character at a time; taking
    // more than one at once can break {WxSigils}.
    [[\s]--[\n\r]] |
    {WhspChar}    {
        offer(yytext());
    }

    // Any other character that is not CR/LF -- also one at a time, because
    // of the whitespace restriction above.
    [^\n\r]    {
        chkLOC();
        maybeIntraState();
        final String single = yytext();
        offer(single);
    }
}
522
// "string links" and "comment links": path-like text, file-like text and
// e-mail addresses are reported only when takeAllContent() is true.
<SCOMMENT, POD, FMT, QUO, QUOxN, HERE, HERExN, HEREin, HEREinxN> {
    {FPath}    {
        chkLOC();
        if (takeAllContent()) {
            final String pathlike = yytext();
            onPathlikeMatched(pathlike, '/', false, yychar);
        }
    }

    {File}    {
        chkLOC();
        if (takeAllContent()) {
            onFilelikeMatched(yytext(), yychar);
        }
    }

    // name@host.tld, all three parts being runs of {FNameChar}
    {FNameChar}+ "@" {FNameChar}+ "." {FNameChar}+    {
        chkLOC();
        if (takeAllContent()) {
            final String address = yytext();
            onEmailAddressMatched(address, yychar);
        }
    }
}
547
<SCOMMENT, POD, FMT, HERE, HERExN, HEREin, HEREinxN> {
    // URIs outside the QUO* states are reported without a collateral-capture
    // pattern. skipLink() is not needed here; it is only needed in the QUO*
    // states.
    {BrowseableURI}    {
        chkLOC();
        if (takeAllContent()) {
            final String uri = yytext();
            onUriMatched(uri, yychar, null);
        }
    }
}
557
<QUO, QUOxN> {
    // URIs inside quotes are matched together with the collateral-capture
    // pattern: reported through onUriMatched() when takeAllContent() is
    // true, and otherwise handed to skipLink().
    {BrowseableURI}    {
        chkLOC();
        final String uri = yytext();
        final Pattern collateral = getCollateralCapturePattern();
        if (!takeAllContent()) {
            skipLink(uri, collateral);
        } else {
            onUriMatched(uri, yychar, collateral);
        }
    }
}
570