/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, 2016, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2017, 2019-2020, Chris Fraire <cfraire@me.com>.
 */

/*
 * Regular-expression macros common to RubyXref and RubySymbolTokenizer.
 */

// Zero or more whitespace characters ({WhspChar} is defined outside this
// file).
MaybeWhsp        = {WhspChar}*

/*
 * Special global variables (globals_rdoc "Pre-defined variables" and
 * regexp_rdoc "Special global variables"): a dollar sign followed by one
 * punctuation character, by one of the dash-letter names, or by a run of
 * digits.
 */
SPIdentifier     = \$ ( [\!\@\&\`\'\+\1\~\=\/\\\,\;\<\>\_\0\*\$\?\:\"] |
    "-0" | "-a" | "-d" | "-F" | "-i" | "-I" | "-l" | "-p" | "-v" | "-w" |
    [~&\`\'\+] | [0-9]+ )

// Any one of the identifier flavors defined below.
AnyIdentifier    = ({Local_var} | {Instance_var} | {Class_var} | {Global_var} |
    {Method_name})

/*
 * Local variables.
 *
 * The first character is a lowercase US-ASCII letter or a character with the
 * eighth bit set; every following character is a US-ASCII letter, a digit,
 * an _ (underscore) or, again, a character with the eighth bit set. (Ruby
 * sources are written in a US-ASCII-compatible character set, in which a set
 * eighth bit marks an extended character, and Ruby permits such characters
 * in local variable names.)
 *
 * N.b. {Char8} as written here covers \xA0 through \xFF only.
 */
Local_var        = {Local_char1} {Local_nextchar}*
Local_char1      = ([a-z] | {Char8})
Local_nextchar   = ([a-zA-Z0-9_] | {Char8})
Char8            = [\xA0-\xFF]
/*
 * Instance variables: an @ ("at" sign) followed by one or more name
 * characters. Because the @ comes first, the character right after it may be
 * an upper-case letter.
 */
Instance_var     = [@]{Local_nextchar}+
/*
 * Class variables: @@ (two "at" signs) followed by one or more name
 * characters, as for instance variables.
 */
Class_var        = [@][@]{Local_nextchar}+
/*
 * Global variables: a $ (dollar sign) followed by one or more name
 * characters, as for instance variables.
 */
Global_var       = [$]{Local_nextchar}+

/*
 * methods_rdoc: Method Names
 *
 * A method name is either one of the operators or starts with a letter or a
 * character with the eighth bit set.
 *
 * A method name may end with a ! (bang or exclamation mark), a ? (question
 * mark) or an = (equals sign).
 *
 * N.b. the '=' suffix is deliberately left out of {Method_name}: in a name
 * that character belongs to the operation (assignment), not to the target
 * (variable). E.g., `def birthdate=` defines assignment to `birthdate'.
 */
Method_name_base = ([a-zA-Z] | {Char8}) {Local_nextchar}*
Method_name      = {Method_name_base} [\!\?]?

/*
 * modules_and_classes_rdoc: Nesting
 *
 * Two or more identifiers joined by :: (e.g. an inner module reference).
 */
Modules_nested   = {AnyIdentifier}("::"{AnyIdentifier})+

/*
 * literals_rdoc: Numbers
 *
 * A numeric literal is a plain decimal literal or one of the prefixed forms.
 */
Numeric_literal  = ({Decimal_literal} | {Decimal_prefixed} | {Hex_prefixed} |
    {Octal_prefixed} | {Binary_prefixed})
/*
 * Integers of any size:    1234  1_234
 * Floating point numbers:  12.34  1234e-2  1.234E1
 */
Decimal_literal  = {Numeral} ([\.]{Numeral})? {Exponent}?
// A run of digits in which single underscores may separate digits.
Numeral = {Digit} ([_]? {Digit})*
// An exponent marker, an optional sign, and the exponent digits.
Exponent = [Ee] [\+\-]? {Numeral}
Digit = [0-9]
/*
 * You can use a special prefix to write numbers in decimal, hexadecimal, octal
 * or binary formats. For decimal numbers use a prefix of 0d, for hexadecimal
 * numbers use a prefix of 0x, for octal numbers use a prefix of 0 or 0o, for
 * binary numbers use a prefix of 0b. The alphabetic component of the number is
 * not case-sensitive.
 *
 * Like integers and floats you may use an underscore for readability.
 *
 * Examples: 0d170 0D170 0xaa 0xAa 0xAA 0Xaa 0XAa 0XaA 0252 0o252 0O252
 *           0b10101010 0B10101010
 */
Decimal_prefixed = [0][Dd] {Numeral}
Hex_prefixed = [0][Xx] {Hex_numeral}
Hex_numeral = {Hex_digit} ([_]? {Hex_digit})*
// Hexadecimal digits are 0-9, A-F and a-f only, so that e.g. "0xzz" is not
// taken for a number.
Hex_digit = [0-9A-Fa-f]
/*
 * The "0"-prefixed octal number as a regex will be captured by
 * {Decimal_literal} so it is not defined in {Octal_prefixed}.
 */
Octal_prefixed = [0][Oo]{Octal_numeral}
Octal_numeral = {Octal_digit} ([_]? {Octal_digit})*
Octal_digit = [0-7]
Binary_prefixed = [0][Bb]{Binary_numeral}
Binary_numeral = {Binary_digit} ([_]? {Binary_digit})*
Binary_digit = [01]

/*
 * There is also a character literal notation to represent single character
 * strings, which syntax is a question mark (?) followed by a single character
 * or escape sequence that corresponds to a single codepoint in the script
 * encoding:
 *     ?a        #=> "a"
 *     ?abc      #=> SyntaxError
 *     ?\n       #=> "\n"
 *     ?\s       #=> " "
 *     ?\\       #=> "\\"
 *     ?\u{41}   #=> "A"
 *     ?\C-a     #=> "\x01"
 *     ?\M-a     #=> "\xE1"
 *     ?\M-\C-a  #=> "\x81"
 *     ?\C-\M-a  #=> "\x81", same as above
 *     ?あ       #=> "あ"
 * N.b. the Ruby rule about ?abc is not enforced in this regex.
 *
 * The codepoint inside \u{...} is hexadecimal, so it is matched with
 * {Hex_digit}. The combined meta-control escapes are matched in their
 * documented form with a backslash before the second letter (\M-\C-x and
 * \C-\M-x); the backslash-less spellings that were accepted before are kept
 * so that nothing previously matched is lost.
 */
Character_literal = [?] ({Character_literal_esc} | [^\s])
Character_literal_esc = [\\] ([MC][\-][^\s] | "u{" {Hex_digit}+ "}" |
    "M-\\C-" [^\s] | "C-\\M-" [^\s] |
    "M-C-" [^\s] | "C-M-" [^\s] | [^\s])

/*
 * literals_rdoc: Strings
 *
 * The most common way of writing strings is using ". The string may be many
 * lines long. Any internal " must be escaped. Strings may allow interpolation
 * of other values using #{...}, or they may be cross-referenced as URLs or
 * files, so they are handled as separate yy states.
 */

// A single non-word character other than a sigil, quote, backtick, hash sign
// or line terminator; used as a one-character spacer in front of quote-like
// operators and keywords.
WxSigils = [[\W]--[\$\@\"\'\`\#\r\n]]

// File names ending in .rb, .ruby, .diff or .patch (any letter case).
FileExt = ([Rr][Bb] | [Rr][Uu][Bb][Yy] | [Dd][Ii][Ff][Ff] |
    [Pp][Aa][Tt][Cc][Hh])
File = [a-zA-Z]{FNameChar}* "." {FileExt}

// Delimiters of embedded documentation.
POD_begin = "=begin"
POD_end = "=end"

/*
 * Quote-like operators: {Quo0} is one delimiter character; {QuoP} is a bare
 * percent sign plus delimiter; {QuoPC} and {QuoPC_xN} are a percent sign, one
 * type letter (upper-case resp. lower-case) and the delimiter. The _xN
 * variants are the ones the rules treat as non-interpolating.
 * NOTE(review): Ruby interpolates inside %r and %x although their letters are
 * lower-case; here they fall under {QuoPC_xN} -- confirm this is intended.
 */
Quo0 = [[\`\(\)\<\>\[\]\{\}\p{P}\p{S}]]
QuoP = [%]{Quo0}
QuoPC = [%][IQRSWX]{Quo0}
QuoPC_xN = [%][iqrswx]{Quo0}

// Symbols: a colon plus an identifier, or a colon plus an opening quote.
Symbol = [:]{AnyIdentifier}
Symquo = [:][\"]
Symquo_xN = [:][\']

//
// Track some keywords that can be used to identify heuristically a possible
// beginning of the shortcut syntax, //, for a regular-expression literal:
// these are keywords that may be followed directly by /PATTERN/. Heuristics
// using punctuation are defined inline later in some rules.
//
Mwords_1    = ("and" | "or" | "not")
Mwords_2    = ("begin" | "end" | "unless" | "until" | "when" | "while")
Mwords      = ({Mwords_1} | {Mwords_2})

// Punctuation after which a slash is taken as the start of a pattern:
// {Mpunc1YYIN} applies in YYINITIAL and INTRA, {Mpunc2IN} in INTRA only.
Mpunc1YYIN  = [\(\!\[]
Mpunc2IN    = ([!=]"~" | [\:\?\=\+\-\<\>] | "=="[=]?|"!="|"<="|">="|"<=>"|"=>")

// Here-document terminator names: bare, single-quoted, or back-quoted.
Here_marker = {Local_nextchar}+
Here_EOF1   = {Here_marker}
Here_EOF2   = [\'][^\r\n\']*[\']
Here_EOF3   = [\`][^\r\n\`]*[\`]

/*
 * Lexical states:
 *
 * YYINITIAL : nothing parsed yet, or just after a non-continuation EOL
 * INTRA     : content was seen from YYINITIAL, but no other state, no [{}]
 *             and no non-continuation {EOL} has followed yet
 * SCOMMENT  : single-line comment
 * POD       : embedded documentation
 * QUO       : quote-like in which paths|files|URLs|e-mails may be matched
 * QUOxN     : as QUO, but with no interpolation
 * QUOxL     : quote-like in which paths|files|URLs|e-mails must not be
 *             matched, because a non-traditional character is used as the
 *             quote-like delimiter
 * QUOxLxN   : as QUOxL, but with no interpolation
 * QM        : a quote-like has ended, and quote modifier chars are awaited
 * HERE      : Here-docs
 * HERExN    : Here-docs with no interpolation
 * HEREin    : Indented Here-docs
 * HEREinxN  : Indented Here-docs with no interpolation
 */
%state INTRA SCOMMENT POD
%state QUO QUOxN QUOxL QUOxLxN QM
%state HERE HERExN HEREin HEREinxN

%%
<HERE, HERExN> {
    // A candidate terminator starting in the first column and followed only
    // by optional whitespace up to the end of line.
    ^ {Here_marker} / {MaybeWhsp}{EOL}    {
        chkLOC();
        maybeEndHere(yytext());
    }
}

<HEREin, HEREinxN> {
    // As above, but the candidate terminator may be indented.
    ^ {MaybeWhsp} {Here_marker} / {MaybeWhsp}{EOL}    {
        chkLOC();
        maybeEndHere(yytext());
    }
}

<INTRA> {
    // Syntax that switches back to YYINITIAL but otherwise preserves the
    // stack: the matched text is pushed back in full so that it is scanned
    // again in YYINITIAL.
    [\{] |
    "&&" |
    "||"    {
        chkLOC();
        yypushback(yylength());
        yybegin(YYINITIAL);
    }
}

<YYINITIAL, INTRA> {
    // Backslash line continuation
    [\\] {EOL}    {
        maybeIntraState();
        offer("\\");
        onEndOfLineMatched(yytext(), yychar);
    }

    // Here-document introducer
    "<<"[~\-]? ({Here_EOF1} | {Here_EOF2} | {Here_EOF3})    {
        chkLOC();
        maybeIntraState();
        hop(yytext());
    }

    // Sigil-prefixed identifiers and symbols
    {Instance_var} | {Class_var} | {Global_var} | {Symbol}    {
        chkLOC();
        maybeIntraState();
        if (offerSymbol(yytext(), 0, true) && returnOnSymbol()) {
            return yystate();
        }
    }

    // Plain identifiers
    {Local_var} | {Method_name}    {
        chkLOC();
        maybeIntraState();
        if (offerSymbol(yytext(), 0, false) && returnOnSymbol()) {
            return yystate();
        }
    }

    // Special global variables are offered as keywords
    {SPIdentifier}    {
        chkLOC();
        maybeIntraState();
        offerKeyword(yytext());
    }

    {Modules_nested}    {
        chkLOC();
        maybeIntraState();
        takeModules(yytext());
    }

    // Character literals get a string span of their own
    {Character_literal}    {
        chkLOC();
        maybeIntraState();
        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
        offer(yytext());
        onDisjointSpanChanged(null, yychar);
    }

    // Numbers get a number span of their own
    {Numeric_literal}    {
        chkLOC();
        maybeIntraState();
        onDisjointSpanChanged(HtmlConsts.NUMBER_CLASS, yychar);
        offer(yytext());
        onDisjointSpanChanged(null, yychar);
    }

    // Plain double- and single-quoted strings; the final argument of qop()
    // is true for the forms without interpolation.
    \"    {
        chkLOC();
        qop(yytext(), 0, false);
    }
    \'    {
        chkLOC();
        qop(yytext(), 0, true);
    }
    // Start of a single-line comment
    \#    {
        yypush(SCOMMENT);
        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
        offer(yytext());
    }

    // Quote with two character names plus possibly a {WxSigils} spacer
    ^ {QuoPC}    {
        chkLOC();
        qop(yytext(), 2, false);
    }
    {WxSigils}{QuoPC}    {
        chkLOC();
        qop(yytext(), 3, false);
    }
    ^ {QuoPC_xN}    {
        chkLOC();
        qop(yytext(), 2, true);
    }
    {WxSigils}{QuoPC_xN}    {
        chkLOC();
        qop(yytext(), 3, true);
    }

    // Quote with one character names plus possibly a {WxSigils} spacer
    ^ {QuoP}    {
        chkLOC();
        qop(yytext(), 1, false);
    }
    {WxSigils}{QuoP}    {
        chkLOC();
        qop(yytext(), 2, false);
    }
    ^ {Symquo}    {
        chkLOC();
        qop(yytext(), 1, false);
    }
    {WxSigils}{Symquo}    {
        chkLOC();
        qop(yytext(), 2, false);
    }
    ^ {Symquo_xN}    {
        chkLOC();
        qop(yytext(), 1, true);
    }
    {WxSigils}{Symquo_xN}    {
        chkLOC();
        qop(yytext(), 2, true);
    }

    // POD-end without having seen POD-begin is akin to a one-line comment
    ^ {POD_end} [^\n\r]*    {
        offer(yytext());
    }

    // POD start
    ^ {POD_begin}    {
        yypush(POD);
        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
        offer(yytext());
    }

    // A brace is offered as ordinary content unless maybeEndInterpolation()
    // reports (by returning true) that it has dealt with it.
    [\{\}]    {
        chkLOC();
        String brace = yytext();
        if (!maybeEndInterpolation(brace)) {
            offer(brace);
        }
    }
}

<YYINITIAL> {
    // A slash seen in YYINITIAL is treated as the start of a pattern.
    "/"    {
        chkLOC();
        // OK to pass a fake "m/" with doWrite=false
        qop(false, "m/", 1, false);
        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
        offer(yytext());
    }
}

<YYINITIAL, INTRA> {
    // Use some heuristics to identify double-slash syntax for the m//
    // operator. We can't handle all possible appearances of `//', because the
    // first slash cannot always be distinguished from division (/) without
    // true parsing.

    {Mpunc1YYIN} \s* "/"    {
        chkLOC();
        hqopPunc(yytext());
    }
}

<INTRA> {
    // Continue with more punctuation heuristics

    {Mpunc2IN} \s* "/"    {
        chkLOC();
        hqopPunc(yytext());
    }
}

<YYINITIAL, INTRA> {
    // Define keyword heuristics

    ^ {Mwords} \s* "/"    {
        chkLOC();
        hqopSymbol(yytext());
    }

    // Here the match begins with a one-character {WxSigils} spacer, which is
    // offered on its own (when all content is being taken) before the
    // remainder goes to hqopSymbol().
    {WxSigils}{Mwords} \s* "/"    {
        chkLOC();
        String matched = yytext();
        if (takeAllContent()) {
            offer(matched.substring(0, 1));
        }
        hqopSymbol(matched.substring(1));
    }
}

<QUO, QUOxN, QUOxL, QUOxLxN, HERE, HEREin> {
    // A backslash and the non-whitespace character after it are consumed
    // together.
    \\ \S    {
        chkLOC();
        offer(yytext());
    }
}

<QUO, QUOxL> {
    // Start of an interpolation: close the current span and scan the
    // interpolated code from YYINITIAL.
    "#{"    {
        chkLOC();
        offer(yytext());
        onDisjointSpanChanged(null, yychar);
        yypush(YYINITIAL);
        pushData();
        interpop();
    }
}

<QUO, QUOxN, QUOxL, QUOxLxN> {
    // A possible closing delimiter: when maybeEndQuote() confirms it, leave
    // the quote state and, if modifiers are allowed, await them in QM.
    {Quo0}    {
        chkLOC();
        String delimiter = yytext();
        offer(delimiter);
        if (maybeEndQuote(delimiter)) {
            yypop();
            if (areModifiersOK()) {
                yypush(QM);
            }
            onDisjointSpanChanged(null, yychar);
        }
    }
}

<QUO, QUOxN, QUOxL, QUOxLxN, HERE, HERExN, HEREin, HEREinxN> {
    // The string span is interrupted around each end of line.
    {WhspChar}*{EOL}    {
        onDisjointSpanChanged(null, yychar);
        onEndOfLineMatched(yytext(), yychar);
        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
    }
}

<QM> {
    // /PAT/imxouesn
    [a-z]    {
        chkLOC();
        offer(yytext());
    }

    // anything else ends the quote-modifiers state and is scanned again in
    // the state that is restored
    [^]    {
        yypop();
        yypushback(yylength());
    }
}

<POD> {
    // The rest of the =end line still belongs to the comment.
    ^ {POD_end} [^\n\r]*    {
        yypop();
        offer(yytext());
        onDisjointSpanChanged(null, yychar);
    }

    {WhspChar}*{EOL}    {
        onEndOfLineMatched(yytext(), yychar);
    }
}

<SCOMMENT> {
    // The end of line closes the comment; it is pushed back in full so that
    // the restored state handles it.
    {WhspChar}*{EOL}    {
        yypushback(yylength());
        yypop();
        onDisjointSpanChanged(null, yychar);
    }
}

<YYINITIAL, INTRA> {
    // If maybeStartHere() returns true, the end of line is pushed back to be
    // scanned again; otherwise it is reported and YYINITIAL resumes.
    {WhspChar}*{EOL}    {
        int eolLength = yylength();
        if (!maybeStartHere()) {
            onEndOfLineMatched(yytext(), yychar);
            yybegin(YYINITIAL);
        } else {
            yypushback(eolLength);
        }
    }
}

<YYINITIAL, INTRA, SCOMMENT, POD, QUO, QUOxN, QUOxL, QUOxLxN,
    HERE, HERExN, HEREin, HEREinxN> {

    // Only one whitespace char at a time or else {WxSigils} can be broken
    {WhspChar} |
    [[\s]--[\n\r]]    {
        offer(yytext());
    }
    // Only one char at a time due to restriction on {WhspChar} above.
    [^\n\r]    {
        chkLOC();
        maybeIntraState();
        offer(yytext());
    }
}

// "string links" and "comment links"
<SCOMMENT, POD, QUO, QUOxN, HERE, HERExN, HEREin, HEREinxN> {
    {FPath}    {
        chkLOC();
        if (takeAllContent()) {
            onPathlikeMatched(yytext(), '/', false, yychar);
        }
    }

    {File}    {
        chkLOC();
        if (takeAllContent()) {
            onFilelikeMatched(yytext(), yychar);
        }
    }

    {FNameChar}+ "@" {FNameChar}+ "." {FNameChar}+    {
        chkLOC();
        if (takeAllContent()) {
            onEmailAddressMatched(yytext(), yychar);
        }
    }
}

<SCOMMENT, POD, HERE, HERExN, HEREin, HEREinxN> {
    {BrowseableURI}    {
        chkLOC();
        if (takeAllContent()) {
            onUriMatched(yytext(), yychar, null);
        }
        // no skipLink() needed except in QUO* states
    }
}

<QUO, QUOxN> {
    // In the QUO* states a URI is either reported or explicitly skipped,
    // in both cases with the collateral-capture pattern.
    {BrowseableURI}    {
        chkLOC();
        String uri = yytext();
        Pattern collateral = getCollateralCapturePattern();
        if (!takeAllContent()) {
            skipLink(uri, collateral);
        } else {
            onUriMatched(uri, yychar, collateral);
        }
    }
}