/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2017, 2019, Chris Fraire <cfraire@me.com>.
 */
package org.opengrok.indexer.analysis.ruby;

import java.io.IOException;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.opengrok.indexer.analysis.JFlexJointLexer;
import org.opengrok.indexer.analysis.JFlexSymbolMatcher;
import org.opengrok.indexer.analysis.Resettable;
import org.opengrok.indexer.util.RegexUtils;
import org.opengrok.indexer.util.StringUtils;
import org.opengrok.indexer.web.HtmlConsts;

/**
 * Represents an abstract base class for Ruby lexers.
 * <p>
 * The class tracks the state of quote-like operators, Here-documents, and
 * string interpolation in a {@code RubyLexerData} instance. A fresh instance
 * is pushed by {@link #pushData()} and the previous one is restored by
 * {@link #popData()}.
 */
@SuppressWarnings("Duplicates")
abstract class RubyLexer extends JFlexSymbolMatcher
        implements JFlexJointLexer, Resettable {

    // Intended as the equivalent of {Local_nextchar} from RubyProductions.lexh.
    // N.b. a Java Unicode escape consumes exactly four hex digits, so the
    // upper range must be written with four-digit escapes; a five-digit form
    // is read as U+0016 followed by a literal '0', which would turn the range
    // into '0'..U+0255 and wrongly admit ASCII punctuation.
    // NOTE(review): range U+0160..U+0255 is assumed from the original
    // literal -- confirm against RubyProductions.lexh.
    private static final Pattern HERE_TERMINATOR_MATCH = Pattern.compile(
        "^[a-zA-Z0-9_\u0160-\u0255]+");

    /** The active lexer data; never null. */
    private RubyLexerData dHead;

    /** Saved lexer data; lazily created by {@link #pushData()}. */
    private Stack<RubyLexerData> data;

    RubyLexer() {
        dHead = new RubyLexerData();
    }

    /**
     * Resets the instance to an initial state: the active data is replaced
     * with a fresh instance and any saved data is discarded.
     */
    @Override
    public void reset() {
        super.reset();
        dHead = new RubyLexerData();
        if (data != null) {
            data.clear();
        }
    }

    /**
     * Determine if the quote should end based on the first character of
     * {@code capture}, recognizing quote-like operators that allow nesting to
     * increase the nesting level if appropriate.
     * <p>
     * Calling this method has side effects to possibly modify
     * {@code nendqchar}, {@code endqchar}, or {@code nestqchar}.
     * @param capture a defined, non-empty value
     * @return true if the quote state should end
     */
    public boolean maybeEndQuote(String capture) {
        char c = capture.charAt(0);
        if (c == dHead.endqchar) {
            if (--dHead.nendqchar <= 0) {
                // The outermost terminator was seen, so quoting is finished.
                dHead.endqchar = '\0';
                dHead.nestqchar = '\0';
                return true;
            }
        } else if (dHead.nestqchar != '\0' && c == dHead.nestqchar) {
            // A nested opener requires one more terminator.
            ++dHead.nendqchar;
        }
        return false;
    }

    /**
     * Gets a value indicating if modifiers are OK at the end of the last
     * quote-like operator.
     * @return true if modifiers are OK
     */
    public boolean areModifiersOK() {
        // "m" named here a la Perl for the Ruby /pat/ operator
        return "m".equals(dHead.qopname);
    }

    /**
     * Starts a quote-like operator as specified in a syntax fragment,
     * {@code op}, and gives the {@code op} for the {@code listener} to take.
     * @param op a defined value
     * @param namelength the length of the operator name at the start of
     * {@code op}, or zero or less if there is no name
     * @param nointerp true if the quoted content is not interpolated
     * @throws IOException if an I/O error occurs writing output
     */
    public void qop(String op, int namelength, boolean nointerp)
            throws IOException {
        qop(true, op, namelength, nointerp);
    }

    /**
     * Starts a quote-like operator as specified in a syntax fragment,
     * {@code op}, and gives the {@code capture} for the {@code listener} to
     * take if {@code doWrite} is true.
     * @param doWrite true if {@code capture} should be written to output
     * @param capture a defined value
     * @param namelength the length of the operator name at the start of
     * {@code capture}, or zero or less if there is no name
     * @param nointerp true if the quoted content is not interpolated
     * @throws IOException if an I/O error occurs writing output
     */
    public void qop(boolean doWrite, String capture, int namelength,
            boolean nointerp) throws IOException {

        // N.b. the following will write anyway -- despite any `doWrite'
        // setting -- if interpolation is truly ending, but that is OK as a
        // quote-like operator is not starting in that case.
        if (maybeEndInterpolation(capture)) {
            return;
        }

        // If namelength is positive, the capture starts with an operator name
        // of that length. Split it off into `qopname', and leave the rest
        // (starting with the delimiter) in `postop'.
        String postop = capture;
        dHead.qopname = "";
        if (namelength > 0) {
            dHead.qopname = capture.substring(0, namelength);
            postop = capture.substring(dHead.qopname.length());
        }
        dHead.nendqchar = 1;
        dHead.collateralCapture = null;

        char opc = postop.charAt(0);
        setEndQuoteChar(opc);
        setState(postop, nointerp);

        if (doWrite) {
            offer(dHead.qopname);
            skipSymbol();
            disjointSpan(HtmlConsts.STRING_CLASS);
            offer(postop);
        }
    }

    /**
     * Sets the jflex state reflecting {@code postop} and {@code nointerp}.
     * @param postop a defined value
     * @param nointerp true if the quoted content is not interpolated
     */
    public void setState(String postop, boolean nointerp) {
        int state;
        boolean nolink = false;

        // "no link" for values in the rules for "string links" if `postop'
        // starts path-like or with the e-mail delimiter.
        if (StringUtils.startsWithFpathChar(postop) ||
                postop.startsWith("@")) {
            nolink = true;
        }

        if (nointerp) {
            state = nolink ? QUOxLxN() : QUOxN();
        } else {
            state = nolink ? QUOxL() : QUO();
        }
        maybeIntraState();
        yypush(state);
    }

    /**
     * Sets a special {@code endqchar} if appropriate for {@code opener} or
     * just tracks {@code opener} as {@code endqchar}.
     */
    private void setEndQuoteChar(char opener) {
        switch (opener) {
            case '[':
                dHead.nestqchar = opener;
                dHead.endqchar = ']';
                break;
            case '<':
                dHead.nestqchar = opener;
                dHead.endqchar = '>';
                break;
            case '(':
                dHead.nestqchar = opener;
                dHead.endqchar = ')';
                break;
            case '{':
                dHead.nestqchar = opener;
                dHead.endqchar = '}';
                break;
            default:
                // A non-bracket delimiter terminates itself and cannot nest.
                dHead.nestqchar = '\0';
                dHead.endqchar = opener;
                break;
        }
    }

    /**
     * Begins a quote-like state for a heuristic match of the shorthand // of
     * m// where the {@code capture} ends with "/", begins with punctuation,
     * and the intervening whitespace may contain LFs -- and writes the parts
     * to output.
     * @param capture a defined value ending with "/"
     * @throws IOException if an I/O error occurs writing output
     */
    public void hqopPunc(String capture) throws IOException {
        if (maybeEndInterpolation(capture)) {
            return;
        }

        // `preceding' is everything before the '/'; 'lede' is the initial part
        // before any whitespace; and `intervening' is any whitespace.
        String preceding = capture.substring(0, capture.length() - 1);
        String lede = preceding.stripTrailing();
        String intervening = preceding.substring(lede.length());

        // OK to pass a fake "m/" with doWrite=false
        qop(false, "m/", 1, false);
        offer(lede);
        takeWhitespace(intervening);
        disjointSpan(HtmlConsts.STRING_CLASS);
        offer("/");
    }

    /**
     * Begins a quote-like state for a heuristic match of the shorthand // of
     * m// where the {@code capture} ends with "/", begins with an initial
     * symbol, and the intervening whitespace may contain LFs -- and writes the
     * parts to output.
     * @param capture a defined value ending with "/"
     * @throws IOException if an I/O error occurs writing output
     */
    public void hqopSymbol(String capture) throws IOException {
        if (maybeEndInterpolation(capture)) {
            return;
        }

        // `preceding' is everything before the '/'; 'lede' is the initial part
        // before any whitespace; and `intervening' is any whitespace.
        String preceding = capture.substring(0, capture.length() - 1);
        String lede = preceding.stripTrailing();
        String intervening = preceding.substring(lede.length());

        // OK to pass a fake "m/" with doWrite=false
        qop(false, "m/", 1, false);
        offerSymbol(lede, 0, false);
        takeWhitespace(intervening);
        disjointSpan(HtmlConsts.STRING_CLASS);
        offer("/");
    }

    /**
     * Write {@code whsp} to output -- if it does not contain any LFs then the
     * full String is written; otherwise, one new line is started per LF and
     * only the text following the last LF is written.
     */
    private void takeWhitespace(String whsp) throws IOException {
        int i;
        if ((i = whsp.indexOf("\n")) == -1) {
            offer(whsp);
        } else {
            // Count the LFs, leaving `off' just past the last one.
            int numlf = 1, off = i + 1;
            while ((i = whsp.indexOf("\n", off)) != -1) {
                ++numlf;
                off = i + 1;
            }
            while (numlf-- > 0) {
                startNewLine();
            }
            if (off < whsp.length()) {
                offer(whsp.substring(off));
            }
        }
    }

    /**
     * Parses a Here-document declaration, and takes the {@code capture} using
     * {@link RubyLexer#offer(java.lang.String)}. If the
     * declaration is valid, {@code hereSettings} will have been appended.
     * <p>
     * If no terminator can be determined (e.g. nothing follows the "&lt;&lt;"
     * opener and optional "~" or "-"), the {@code capture} is still written
     * but no settings are appended.
     * @param capture a defined value starting with "&lt;&lt;"
     * @throws IllegalArgumentException if {@code capture} does not start with
     * "&lt;&lt;"
     * @throws IOException if an I/O error occurs writing output
     */
    public void hop(String capture) throws IOException {
        if (!capture.startsWith("<<")) {
            throw new IllegalArgumentException("bad HERE: " + capture);
        }

        offer(capture);
        if (dHead.hereSettings == null) {
            dHead.hereSettings = new LinkedList<>();
        }

        // Drop the "<<" opener, and then any indentation marker.
        String remaining = capture.substring(2);
        boolean indented = false;
        if (remaining.startsWith("~") || remaining.startsWith("-")) {
            indented = true;
            remaining = remaining.substring(1);
        }
        if (remaining.isEmpty()) {
            // Nothing is left to name a terminator, so it is not a valid
            // declaration.
            return;
        }

        boolean nointerp;
        char c = remaining.charAt(0);
        switch (c) {
            case '\'':
                nointerp = true;
                remaining = remaining.substring(1);
                break;
            case '`':
                // (Ruby, unlike Perl, does not recognize '"' here.)
                nointerp = false;
                remaining = remaining.substring(1);
                break;
            default:
                // '\0' marks that the terminator is not quoted.
                c = '\0';
                nointerp = false;
                break;
        }

        String terminator;
        if (c != '\0') {
            // The terminator runs to the closing quote -- or is all of
            // `remaining' if the closing quote is absent or comes first.
            int i = remaining.indexOf(c);
            if (i < 1) {
                terminator = remaining;
            } else {
                terminator = remaining.substring(0, i);
            }
        } else {
            Matcher m = HERE_TERMINATOR_MATCH.matcher(remaining);
            if (!m.find()) {
                return;
            }
            terminator = m.group(0);
        }

        int state;
        if (nointerp) {
            state = indented ? HEREinxN() : HERExN();
        } else {
            state = indented ? HEREin() : HERE();
        }
        dHead.hereSettings.add(new HereDocSettings(terminator, state));
    }

    /**
     * Pushes the first Here-document state if any declarations were parsed, or
     * else does nothing.
     * @return true if a Here state was pushed
     * @throws IOException if an I/O error occurs writing output
     */
    public boolean maybeStartHere() throws IOException {
        if (dHead.hereSettings != null && dHead.hereSettings.size() > 0) {
            HereDocSettings settings = dHead.hereSettings.peek();
            yypush(settings.state);
            disjointSpan(HtmlConsts.STRING_CLASS);
            return true;
        }
        return false;
    }

    /**
     * Process the {@code capture}, possibly ending the Here-document state
     * just beforehand.
     * @param capture a defined value
     * @return true if the quote state ended
     * @throws IOException if an I/O error occurs writing output
     */
    public boolean maybeEndHere(String capture) throws IOException {
        String trimmed = capture.stripLeading();
        HereDocSettings settings = dHead.hereSettings.peek();
        assert settings != null;

        boolean didZspan = false;
        if (trimmed.equals(settings.terminator)) {
            // Close the string span before the terminator is written.
            disjointSpan(null);
            didZspan = true;
            dHead.hereSettings.remove();
        }

        offer(capture);

        if (dHead.hereSettings.size() > 0) {
            // Another Here-document is pending (or the current one goes on),
            // so continue in the state of the head declaration.
            settings = dHead.hereSettings.peek();
            yybegin(settings.state);
            if (didZspan) {
                disjointSpan(HtmlConsts.STRING_CLASS);
            }
            return false;
        } else {
            yypop();
            return true;
        }
    }

    /**
     * Resets the interpolation counter to 1.
     */
    public void interpop() {
        dHead.nendbrace = 1;
    }

    /**
     * Determine if the interpolation should end based on the first character
     * of {@code capture}, recognizing tokens that increase the nesting level
     * instead.
     * <p>
     * Calling this method has side effects to possibly modify
     * {@code nendbrace}.
     * @param capture a defined value
     * @return true if the interpolation state should end
     * @throws IOException if an I/O error occurs writing output
     */
    public boolean maybeEndInterpolation(String capture) throws IOException {
        if (dHead.nendbrace <= 0) {
            return false;
        }
        if (capture.startsWith("}")) {
            if (--dHead.nendbrace <= 0) {
                int rem = capture.length() - 1;
                String opener = capture.substring(0, 1);
                popData();
                yypop();
                disjointSpan(HtmlConsts.STRING_CLASS);
                offer(opener);
                if (rem > 0) {
                    // Only the brace was consumed; re-lex the rest.
                    yypushback(rem);
                }
                return true;
            }
        } else if (capture.startsWith("{")) {
            ++dHead.nendbrace;
        }
        return false;
    }

    /**
     * Take a series of module names separated by "::".
     * @param capture a defined value
     * @throws IOException if an I/O error occurs writing output
     */
    public void takeModules(String capture) throws IOException {
        final String SEP = "::";
        int o = 0, i;
        while (o < capture.length() && (i = capture.indexOf(SEP, o)) != -1) {
            String module = capture.substring(o, i);
            offerSymbol(module, o, false);
            offer(SEP);
            o = i + 2;
        }
        if (o < capture.length()) {
            String module = capture.substring(o);
            offerSymbol(module, o, false);
        }
    }

    /**
     * Subtract the number of initial, non-word characters from the length of
     * {@code capture}.
     * @param capture a defined value
     * @return the length of {@code capture} minus the number of initial,
     * non-word characters
     */
    public int nameLength(String capture) {
        int len = capture.length();
        for (int i = 0; i < capture.length(); ++i) {
            if (Character.isLetterOrDigit(capture.charAt(i))) {
                break;
            }
            --len;
        }
        return len;
    }

    /**
     * Gets a pattern to match the collateral capture for the current quoting
     * state or null if there is no active quoting state.
     * <p>
     * The pattern is built once per quoting state and cached until the next
     * quote-like operator starts.
     * @return a defined pattern or null
     */
    public Pattern getCollateralCapturePattern() {
        if (dHead.endqchar == '\0') {
            return null;
        }
        if (dHead.collateralCapture != null) {
            return dHead.collateralCapture;
        }

        StringBuilder patb = new StringBuilder("[");
        patb.append(Pattern.quote(String.valueOf(dHead.endqchar)));
        if (dHead.nestqchar != '\0') {
            patb.append(Pattern.quote(String.valueOf(dHead.nestqchar)));
        }
        patb.append("]");
        patb.append(RegexUtils.getNotFollowingEscapePattern());
        dHead.collateralCapture = Pattern.compile(patb.toString());
        return dHead.collateralCapture;
    }

    /**
     * Calls {@link #phLOC()} if the yystate is not SCOMMENT or POD.
     */
    public void chkLOC() {
        int yystate = yystate();
        if (yystate != SCOMMENT() && yystate != POD()) {
            phLOC();
        }
    }

    /**
     * Subclasses must override to possibly set the INTRA state.
     */
    abstract void maybeIntraState();

    /**
     * Saves the active data on the stack, and activates a fresh instance.
     */
    void pushData() {
        if (data == null) {
            data = new Stack<>();
        }
        data.push(dHead);
        dHead = new RubyLexerData();
    }

    /**
     * Restores the most recently saved data as the active data. A matching
     * {@link #pushData()} must have been called beforehand.
     */
    void popData() {
        dHead = data.pop();
    }

    /**
     * Subclasses must override to get the constant value created by JFlex to
     * represent QUOxLxN.
     */
    abstract int QUOxLxN();

    /**
     * Subclasses must override to get the constant value created by JFlex to
     * represent QUOxN.
     */
    abstract int QUOxN();

    /**
     * Subclasses must override to get the constant value created by JFlex to
     * represent QUOxL.
     */
    abstract int QUOxL();

    /**
     * Subclasses must override to get the constant value created by JFlex to
     * represent QUO.
     */
    abstract int QUO();

    /**
     * Subclasses must override to get the constant value created by JFlex to
     * represent HEREinxN.
     */
    abstract int HEREinxN();

    /**
     * Subclasses must override to get the constant value created by JFlex to
     * represent HERExN.
     */
    abstract int HERExN();

    /**
     * Subclasses must override to get the constant value created by JFlex to
     * represent HEREin.
     */
    abstract int HEREin();

    /**
     * Subclasses must override to get the constant value created by JFlex to
     * represent HERE.
     */
    abstract int HERE();

    /**
     * Subclasses must override to get the constant value created by JFlex to
     * represent SCOMMENT.
     */
    abstract int SCOMMENT();

    /**
     * Subclasses must override to get the constant value created by JFlex to
     * represent POD.
     */
    abstract int POD();

    /**
     * Represents a parsed Here-document declaration: its terminator and the
     * jflex state in which to lex its content.
     */
    private static class HereDocSettings {
        private final String terminator;
        private final int state;

        HereDocSettings(String terminator, int state) {
            this.terminator = terminator;
            this.state = state;
        }
    }

    private static class RubyLexerData {
        /**
         * The pending Here-document declarations in the order parsed, or
         * null if none has been parsed yet.
         */
        private Queue<HereDocSettings> hereSettings;

        /**
         * When matching a quoting construct like qq[], q(), m//, s```, etc.,
         * the operator name (e.g., "m" or "tr") is stored. Unlike
         * {@code endqchar} it is not unset when the quote ends, because it is
         * useful to indicate if quote modifier characters are expected.
         */
        private String qopname;

        /**
         * When matching a quoting construct like %w(), '', %[], etc., the
         * terminating character is stored.
         */
        private char endqchar;

        /**
         * When matching a quoting construct like %[], %w(), %<> etc.
         * that nest, the begin character ('[', '<', '(', or '{') is stored
         * so that nesting is tracked and {@code nendqchar} is incremented
         * appropriately. Otherwise, {@code nestqchar} is set to '\0' if no
         * nesting occurs.
         */
        private char nestqchar;

        /**
         * When matching a quoting construct like %[], %w(), etc., the number
         * of remaining end separators is stored. It starts at 1, and any
         * nesting increases the value.
         */
        private int nendqchar;

        /**
         * When interpolating inside a quoting construct, the number of
         * remaining '}' is stored. It starts at 1, and any nesting increases
         * the value.
         */
        private int nendbrace;

        /**
         * When matching a quoting construct, a Pattern to identify collateral
         * capture characters is stored.
         */
        private Pattern collateralCapture;
    }
}