/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, 2016, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2017, 2019, Chris Fraire <cfraire@me.com>.
 */

/*
 * Regex productions shared between PerlXref and PerlSymbolTokenizer
 */

// Basic building blocks. {WxSigils} is a single non-word character that is
// neither a sigil, a quote character, a hash, nor an end-of-line character;
// it serves as the "boundary" character in front of quote-like operators.
MaybeWhsp = {WhspChar}*
Identifier = [a-zA-Z_] [a-zA-Z0-9_]*
Sigils = ("$" | "@" | "%" | "&" | "*")
WxSigils = [[\W]--[\$\@\%\&\*\"\'\`\#\r\n]]

// Perl special identifiers (four of six from
// https://perldoc.perl.org/perldata.html#Identifier-parsing):
//
// 1. A sigil, followed solely by digits matching \p{POSIX_Digit} , like $0 ,
// $1 , or $10000 .
SPIdentifier1 = "$" \d+

// 2(a). A sigil followed by a single character matching the \p{POSIX_Punct}
// property, like $! or %+ , except the character "{" doesn't work.
SPIdentifier2Xquo = [\$\%] [[\p{P}--{]]

// 2(b). A sigil followed by a single character matching the \p{POSIX_Punct}
// property, like $! or %+ , except the characters "{" and "\" don't work.
SPIdentifier2Quo = [\$\%] [[\p{P}--[{\\]]]

// 3. A sigil, followed by a caret and any one of the characters [][A-Z^_?\\] ,
// like $^V or $^] .
SPIdentifier3 = "$^" ( "]" | "[" | [A-Z\^_?\\] )

// 4. Similar to the above, a sigil, followed by bareword text in braces, where
// the first character is a caret. The next character is any one of the
// characters [][A-Z^_?\\] , followed by ASCII word characters. An example is
// ${^GLOBAL_PHASE} . ASCII \w matches the 63 characters: [a-zA-Z0-9_].
SPIdentifier4 = "${^" ( "]" | "[" | [A-Z\^_?\\] ) [a-zA-Z0-9_]* "}"

// prototype attribute must be recognized explicitly or else "($)" can be
// mistaken for an SPIdentifier2
ProtoAttr = "(" ( [\\]? {Sigils} | ";" | {WhspChar}+ )* ")"

// File names: a letter, any number of {FNameChar}, a dot, and one of the
// listed extensions (all-lowercase or all-uppercase spellings only).
FileExt = ("pl"|"perl"|"pm"|"conf"|"txt"|"htm"|"html"|"xml"|"ini"|"diff"|"patch"|
    "PL"|"PERL"|"PM"|"CONF"|"TXT"|"HTM"|"HTML"|"XML"|"INI"|"DIFF"|"PATCH")
File = [a-zA-Z]{FNameChar}* "." {FileExt}

// Perl numeric literals (perldata, "Scalar value constructors"), each with an
// optional decimal exponent:
//   - hexadecimal, e.g. 0xff or 0xdead_beef
//   - binary, e.g. 0b011011
//   - decimal with a fraction, e.g. 12345.67 or 3.14_15_92
//   - decimal integer (also covers legacy octal), e.g. 4_294_967_296 or 0377
// "_" is accepted after the first digit of each digit run so that a literal
// such as 0xdead_beef is one number instead of a number followed by a bogus
// identifier "_beef".
Number = (0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[bB][01][01_]*|[0-9][0-9_]*\.[0-9][0-9_]*|[0-9][0-9_]*)([eE][+-]?[0-9]+)?

PodEND = "=cut"

// Candidate quote-like delimiters: backtick, brackets, and any punctuation or
// symbol character. The "xHash" / "xApos" variants exclude "#" / "'" because
// those delimiters are matched by dedicated productions below.
Quo0 = [[\`\(\)\<\>\[\]\{\}\p{P}\p{S}]]
Quo0xHash = [[\`\(\)\<\>\[\]\{\}\p{P}\p{S}]--\#]
Quo0xHashxApos = [[\`\(\)\<\>\[\]\{\}\p{P}\p{S}]--[\#\']]

// Openings of the one-letter quote-like operators (m, s, q, y), split by the
// kind of delimiter: apostrophe, hash (no intervening whitespace), other
// punctuation (optional whitespace), or a word character (whitespace
// required).
MSapos = [ms] {MaybeWhsp} \'
MShash = [ms]\#
MSpunc = [ms] {MaybeWhsp} {Quo0xHashxApos}
MSword = [ms] {WhspChar}+ \w
QYhash = [qy]\#
QYpunc = [qy] {MaybeWhsp} {Quo0xHash}
QYword = [qy] {WhspChar}+ \w

// Openings of qq, qx and qr, split by delimiter kind as above (the apostrophe
// production covers only qx and qr).
QXRapos = "q"[xr] {MaybeWhsp} \'
QQXRhash = "q"[qxr]\#
QQXRPunc = "q"[qxr] {MaybeWhsp} {Quo0xHash}
QQXRword = "q"[qxr] {WhspChar}+ \w

// Openings of qw and tr, split by delimiter kind as above.
QWhash = "qw"\#
QWpunc = "qw" {MaybeWhsp} {Quo0xHash}
QWword = "qw" {WhspChar}+ \w
TRhash = "tr"\#
TRpunc = "tr" {MaybeWhsp} {Quo0xHash}
TRword = "tr" {WhspChar}+ \w

// Here-document terminator specifications: double-quoted, backtick-quoted,
// single-quoted, or a bare identifier optionally preceded by a backslash.
HereEOF1 = [\"][^\r\n\"]*[\"]
HereEOF2 = [\`][^\r\n\`]*[\`]
HereEOF3 = [\'][^\r\n\']*[\']
HereEOF4 = [\\]?{Identifier}

//
// Track some keywords that can be used to identify heuristically a possible
// beginning of the shortcut syntax, //, for m//. Also include any perlfunc
// that takes /PATTERN/. Heuristics using punctuation are defined inline later
// in some rules.
//
// String comparison operators
Mwords_1 = ("eq" | "ne" | "le" | "ge" | "lt" | "gt" | "cmp")
// Conditionals and low-precedence logical operators
Mwords_2 = ("if" | "unless" | "or" | "and" | "not")
// Functions that accept a /PATTERN/ argument
Mwords_3 = ("split" | "grep")
Mwords = ({Mwords_1} | {Mwords_2} | {Mwords_3})

// Punctuation after which a "/" is treated as the start of m//:
// Mpunc1YYIN is used in both YYINITIAL and INTRA; Mpunc2IN only in INTRA.
Mpunc1YYIN = [\(\!]
Mpunc2IN = ([!=]"~" | [\:\?\=\+\-\<\>] | "=="|"!="|"<="|">="|"<=>"|"=>")

//
// There are two dimensions to quoting: "link"-or-not and "interpolate"-or-not.
// Unfortunately, we cannot control the %state values, so we have to declare
// a cross-product of states. (Technically, state values are not guaranteed to
// be unique by jflex, but states that do not have identical rules will have
// different values. The four "QUO" states below satisfy this difference
// criterion; as likewise do the four "HERE" states.)
//
// YYINITIAL : nothing yet parsed or just after a non-quoted [;{}]
// INTRA : saw content from YYINITIAL but not yet other state or [;{}]
// SCOMMENT : single-line comment
// POD : Perl Plain-Old-Documentation
// FMT : an output record format
// QUO : quote-like that is OK to match paths|files|URLs|e-mails
// QUOxN : "" but with no interpolation
// QUOxL : quote-like that is not OK to match paths|files|URLs|e-mails
//      because a non-traditional character is used as the quote-like delimiter
// QUOxLxN : "" but with no interpolation
// QM : a quote-like has ended, and quote modifier chars are awaited
// HERE : Here-docs
// HERExN : Here-docs with no interpolation
// HEREin : Indented Here-docs
// HEREinxN : Indented Here-docs with no interpolation
//
%state INTRA SCOMMENT POD FMT
%state QUO QUOxN QUOxL QUOxLxN QM
%state HERE HERExN HEREin HEREinxN

%%
// A line holding only an identifier (trailing whitespace allowed) is a
// candidate Here-doc terminator; maybeEndHere() makes the decision.
<HERE, HERExN> {
    ^ {Identifier} / {MaybeWhsp}{EOL}    {
        chkLOC();
        maybeEndHere(yytext());
    }
}

// Same as above, but for indented Here-docs the candidate terminator may be
// preceded by whitespace, which is part of the text given to maybeEndHere().
<HEREin, HEREinxN> {
    ^ {MaybeWhsp} {Identifier} / {MaybeWhsp}{EOL}    {
        chkLOC();
        maybeEndHere(yytext());
    }
}

<YYINITIAL, INTRA>{
    // Part 1 of syntax that jumps back to YYINITIAL
    [;\{] |
    "&&" |
    "||" |
    {ProtoAttr}    {
        chkLOC();
        yyjump(YYINITIAL);
        offer(yytext());
    }
    // Part 2 of syntax that jumps back to YYINITIAL. Since this does a
    // look-ahead, keep it apart from "part 1" which uses OR-syntax ("|") --
    // as it seems the look-ahead would apply to all cases.
    "}" / {MaybeWhsp} {EOL}    {
        chkLOC();
        yyjump(YYINITIAL);
        offer(yytext());
    }

    // Here-doc operator ("<<" or "<<~") followed by a terminator specification
    "<<"[~]? {MaybeWhsp} ({HereEOF1}|{HereEOF2}|{HereEOF3}|{HereEOF4})    {
        chkLOC();
        hop(yytext());
    }

    // Bare identifier: offered as a symbol; the scanner returns to its caller
    // when offerSymbol() accepts it and returnOnSymbol() is set.
    {Identifier}    {
        chkLOC();
        maybeIntraState();
        String id = yytext();
        if (offerSymbol(id, 0, false) && returnOnSymbol()) {
            return yystate();
        }
    }

    // A file or path in angle brackets: the brackets are offered verbatim and
    // only the text between them is passed on as file-like.
    "<" ({File} | {FPath}) ">"    {
        chkLOC();
        maybeIntraState();
        if (takeAllContent()) {
            offer("<");
            String path = yytext();
            path = path.substring(1, path.length() - 1);
            onFilelikeMatched(path, yychar);
            offer(">");
        }
    }

    // Numeric literal, wrapped in a NUMBER_CLASS span
    {Number}    {
        chkLOC();
        maybeIntraState();
        onDisjointSpanChanged(HtmlConsts.NUMBER_CLASS, yychar);
        offer(yytext());
        onDisjointSpanChanged(null, yychar);
    }

    // Plain quote characters (no operator name, hence length 0)
    [\"\`]    { chkLOC(); qop(yytext(), 0, false); }
    \'    { chkLOC(); qop(yytext(), 0, true); }
    // Start of a single-line comment (not counted by chkLOC())
    \#    {
        yypush(SCOMMENT);
        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
        offer(yytext());
    }

    // qq//, qx//, qw//, qr//, tr/// and variants -- all with 2 character names
    ^ {QXRapos} |
    {WxSigils}{QXRapos}    { chkLOC(); qop(yytext(), 2, true); } // qx'' qr''
    ^ {QQXRhash} |
    {WxSigils}{QQXRhash}    { chkLOC(); qop(yytext(), 2, false); }
    ^ {QQXRPunc} |
    {WxSigils}{QQXRPunc}    { chkLOC(); qop(yytext(), 2, false); }
    ^ {QQXRword} |
    {WxSigils}{QQXRword}    { chkLOC(); qop(yytext(), 2, false); }

    // In Perl these do not actually "interpolate," but "interpolate" for OpenGrok
    // xref just means to cross-reference, which is appropriate for qw//.
    ^ {QWhash} |
    {WxSigils}{QWhash}    { chkLOC(); qop(yytext(), 2, false); }
    ^ {QWpunc} |
    {WxSigils}{QWpunc}    { chkLOC(); qop(yytext(), 2, false); }
    ^ {QWword} |
    {WxSigils}{QWword}    { chkLOC(); qop(yytext(), 2, false); }

    ^ {TRhash} |
    {WxSigils}{TRhash}    { chkLOC(); qop(yytext(), 2, true); }
    ^ {TRpunc} |
    {WxSigils}{TRpunc}    { chkLOC(); qop(yytext(), 2, true); }
    ^ {TRword} |
    {WxSigils}{TRword}    { chkLOC(); qop(yytext(), 2, true); }

    // capture hyphen plus [qmsy] to prevent that combination from being mistaken
    // for q//, m//, s//, y// and variants
    ^ "-" [qmsy] |
    {WxSigils} "-" [qmsy]    {
        chkLOC();
        maybeIntraState();
        offer(yytext());
    }
    // q//, m//, s//, y// and variants -- all with 1 character names
    ^ {MSapos} |
    {WxSigils}{MSapos}    { chkLOC(); qop(yytext(), 1, true); } // m'' s''
    ^ {MShash} |
    {WxSigils}{MShash}    { chkLOC(); qop(yytext(), 1, false); }
    ^ {MSpunc} |
    {WxSigils}{MSpunc}    { chkLOC(); qop(yytext(), 1, false); }
    ^ {MSword} |
    {WxSigils}{MSword}    { chkLOC(); qop(yytext(), 1, false); }
    ^ {QYhash} |
    {WxSigils}{QYhash}    { chkLOC(); qop(yytext(), 1, true); }
    ^ {QYpunc} |
    {WxSigils}{QYpunc}    { chkLOC(); qop(yytext(), 1, true); }
    ^ {QYword} |
    {WxSigils}{QYword}    { chkLOC(); qop(yytext(), 1, true); }

    // seeing POD-end without having seen POD-start is akin to a one-line comment
    ^ {PodEND} [^\n\r]*    {
        offer(yytext());
    }

    // POD start
    ^ "=" [a-zA-Z_] [a-zA-Z0-9_]*    {
        yypush(POD);
        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
        offer(yytext());
    }

    // FORMAT start
    ^ {MaybeWhsp} "format" ({WhspChar}+ {Identifier})? {MaybeWhsp} "=" /
        {MaybeWhsp}{EOL}    {
        chkLOC();
        yypush(FMT);
        if (takeAllContent()) {
            // split off the " format" as `initial' for keyword processing
            String capture = yytext();
            String following = capture.replaceFirst("^\\s+", "").
                substring("format".length());
            String initial = capture.substring(0, capture.length() -
                following.length());

            offerKeyword(initial);
            offer(following);
            // start a pseudo-"string"
            onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
        }
    }
}

<YYINITIAL> {
    // A slash seen while in YYINITIAL is taken as the start of the m//
    // shortcut syntax.
    "/"    {
        chkLOC();
        // OK to pass a fake "m/" with doWrite=false
        qop(false, "m/", 1, false);
        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
        offer(yytext());
    }
}

<YYINITIAL, INTRA> {
    // Use some heuristics to identify double-slash syntax for the m//
    // operator. We can't handle all possible appearances of `//', because the
    // first slash cannot always be distinguished from division (/) without
    // true parsing.

    {Mpunc1YYIN} \s* "/"    { chkLOC(); hqopPunc(yytext()); }
}

<INTRA> {
    // Continue with more punctuation heuristics

    {Mpunc2IN} \s* "/"    { chkLOC(); hqopPunc(yytext()); }
}

<YYINITIAL, INTRA> {
    // Define keyword heuristics

    ^ {Mwords} \s* "/"    {
        chkLOC();
        hqopSymbol(yytext());
    }

    {WxSigils}{Mwords} \s* "/"    {
        chkLOC();
        String capture = yytext();
        // {WxSigils} matches exactly one character, but a character outside
        // the Basic Multilingual Plane occupies two UTF-16 code units in the
        // Java String. Measure the boundary by code point so that a surrogate
        // pair is never split between the boundary and the keyword text.
        int boundaryLength = Character.charCount(capture.codePointAt(0));
        if (takeAllContent()) {
            String boundary = capture.substring(0, boundaryLength);
            offer(boundary);
        }
        hqopSymbol(capture.substring(boundaryLength));
    }
}

<YYINITIAL, INTRA> {
    // Sigil, optional whitespace, identifier -- e.g. $name or @ list
    {Sigils} {MaybeWhsp} {Identifier}    {
        chkLOC();
        maybeIntraState();
        //we ignore keywords if the identifier starts with a sigil ...
        sigilID(yytext());
        if (returnOnSymbol()) {
            return yystate();
        }
    }
}

<YYINITIAL, INTRA, FMT, QUO, QUOxL, HERE, HEREin> {
    // Sigil followed by a braced identifier -- e.g. ${name} or @{ name }
    {Sigils} {MaybeWhsp} "{" {MaybeWhsp} {Identifier} {MaybeWhsp} "}"    {
        chkLOC();
        maybeIntraState();
        //we ignore keywords if the identifier starts with a sigil ...
        bracedSigilID(yytext());
        if (returnOnSymbol()) {
            return yystate();
        }
    }

    // Special identifiers that look the same inside and outside of quotes
    {SPIdentifier1} |
    {SPIdentifier3} |
    {SPIdentifier4}    {
        chkLOC();
        maybeIntraState();
        specialID(yytext());
    }
}

<YYINITIAL, INTRA, FMT, HERE, HEREin> {
    // Sigil + punctuation special identifier, outside of quote-likes
    {SPIdentifier2Xquo}    {
        chkLOC();
        maybeIntraState();
        specialID(yytext());
    }
}

<QUO, QUOxL> {
    // Sigil + punctuation special identifier inside a quote-like, where a
    // backslash after the sigil is additionally excluded
    {SPIdentifier2Quo}    {
        chkLOC();
        maybeIntraState();
        specialID(yytext());
    }
}

<FMT, QUO, QUOxL, HERE, HEREin> {
    // Sigil immediately followed by an identifier (no whitespace permitted
    // in these states)
    {Sigils} {Identifier}    {
        chkLOC();
        //we ignore keywords if the identifier starts with a sigil ...
        sigilID(yytext());
        if (returnOnSymbol()) {
            return yystate();
        }
    }
}

<FMT, QUO, QUOxN, QUOxL, QUOxLxN, HERE, HEREin> {
    // A backslash and the following non-whitespace character are offered
    // together, so the escaped character is not examined by other rules.
    \\ \S    {
        chkLOC();
        offer(yytext());
    }
}

<QUO, QUOxN, QUOxL, QUOxLxN> {
    // Any potential delimiter character is offered and then tested with
    // maybeEndQuote(); when the quote ends, the state is popped and, if
    // areModifiersOK(), QM is pushed to collect trailing modifier characters.
    {Quo0} |
    \w    {
        chkLOC();
        String capture = yytext();
        offer(capture);
        if (maybeEndQuote(capture)) {
            yypop();
            if (areModifiersOK()) {
                yypush(QM);
            }
            onDisjointSpanChanged(null, yychar);
        }
    }
}

<FMT, QUO, QUOxN, QUOxL, QUOxLxN, HERE, HERExN, HEREin, HEREinxN> {
    // End of line inside string-like content: close the span, report the
    // EOL, then re-open a STRING_CLASS span for the next line.
    {WhspChar}*{EOL}    {
        onDisjointSpanChanged(null, yychar);
        onEndOfLineMatched(yytext(), yychar);
        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
    }
}

<QM> {
    // m/PATTERN/msixpodualngc and /PATTERN/msixpodualngc
    // qr/STRING/msixpodualn
    // s/PATTERN/REPLACEMENT/msixpodualngcer
    // tr/SEARCHLIST/REPLACEMENTLIST/cdsr
    // y/SEARCHLIST/REPLACEMENTLIST/cdsr
    [a-z]    {
        chkLOC();
        offer(yytext());
    }

    // anything else ends the quote-modifiers state
    [^]    {
        yypop();
        yypushback(yytext().length());
    }
}

<POD> {
    // "=cut" at the start of a line ends the POD section; the rest of that
    // line is offered before the span is closed.
    ^ {PodEND} [^\n\r]*    {
        yypop();
        offer(yytext());
        onDisjointSpanChanged(null, yychar);
    }

    {WhspChar}*{EOL}    {
        onEndOfLineMatched(yytext(), yychar);
    }
}

<FMT> {
    // A lone "." at the start of a line closes the format: leave FMT, emit
    // the dot, and end the span.
    ^ "." / {MaybeWhsp} {EOL}    {
        chkLOC();
        yypop();
        offer(yytext());
        onDisjointSpanChanged(null, yychar);
    }

    // "A comment, indicated by putting a '#' in the first column."
    ^ "#"    {
        yypush(SCOMMENT);
        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
        offer(yytext());
    }

    // The other two types of line in a format FORMLIST -- "a 'picture' line
    // giving the format for one output line" and "an argument line supplying
    // values to plug into the previous picture line" -- are not handled
    // in a particular way by this lexer.
}

<SCOMMENT> {
    // The end of line finishes the comment. The matched text is pushed back
    // so that the state underneath handles the EOL itself.
    {WhspChar}*{EOL}    {
        yypushback(yytext().length());
        yypop();
        onDisjointSpanChanged(null, yychar);
    }
}

<YYINITIAL, INTRA> {
    // At end of line, maybeStartHere() is consulted first; when it returns
    // true the EOL is pushed back to be matched again, otherwise the EOL is
    // reported as usual.
    {WhspChar}*{EOL}    {
        String eol = yytext();
        if (!maybeStartHere()) {
            onEndOfLineMatched(yytext(), yychar);
        } else {
            yypushback(eol.length());
        }
    }
}

<YYINITIAL, INTRA, SCOMMENT, POD, FMT, QUO, QUOxN, QUOxL, QUOxLxN,
    HERE, HERExN, HEREin, HEREinxN> {

    // Whitespace is consumed a single character at a time; a longer match
    // could break {WxSigils}.
    {WhspChar} |
    [[\s]--[\n\r]]    {
        offer(yytext());
    }
    // Fallback for any other non-EOL character, likewise one at a time
    // because of the restriction on {WhspChar} above.
    [^\n\r]    {
        chkLOC();
        maybeIntraState();
        offer(yytext());
    }
}

// "string links" and "comment links"
<SCOMMENT, POD, FMT, QUO, QUOxN, HERE, HERExN, HEREin, HEREinxN> {
    // Path-like text, using '/' as the separator
    {FPath}    {
        chkLOC();
        if (takeAllContent()) {
            onPathlikeMatched(yytext(), '/', false, yychar);
        }
    }

    // File name with one of the recognized extensions
    {File}    {
        chkLOC();
        if (takeAllContent()) {
            onFilelikeMatched(yytext(), yychar);
        }
    }

    // E-mail address: name "@" host "." suffix
    {FNameChar}+ "@" {FNameChar}+ "." {FNameChar}+    {
        chkLOC();
        if (takeAllContent()) {
            onEmailAddressMatched(yytext(), yychar);
        }
    }
}

<SCOMMENT, POD, FMT, HERE, HERExN, HEREin, HEREinxN> {
    // URI outside of quote-likes
    {BrowseableURI}    {
        chkLOC();
        if (takeAllContent()) {
            onUriMatched(yytext(), yychar, null);
        }
        // no skipLink() needed except in QUO* states
    }
}

<QUO, QUOxN> {
    // URI inside a quote-like: the collateral-capture pattern is passed
    // along with the match, either to onUriMatched() or to skipLink().
    {BrowseableURI}    {
        chkLOC();
        String uri = yytext();
        Pattern collateral = getCollateralCapturePattern();
        if (!takeAllContent()) {
            skipLink(yytext(), collateral);
        } else {
            onUriMatched(uri, yychar, collateral);
        }
    }
}