xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ruby/RubyLexer.java (revision 6c62ede99bd45f84e663cda017732f3bcc28db30)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * See LICENSE.txt included in this distribution for the specific
9  * language governing permissions and limitations under the License.
10  *
11  * When distributing Covered Code, include this CDDL HEADER in each
12  * file and include the License file at LICENSE.txt.
13  * If applicable, add the following below this CDDL HEADER, with the
14  * fields enclosed by brackets "[]" replaced with your own identifying
15  * information: Portions Copyright [yyyy] [name of copyright owner]
16  *
17  * CDDL HEADER END
18  */
19 
20 /*
21  * Copyright (c) 2017, 2019, Chris Fraire <cfraire@me.com>.
22  */
23 package org.opengrok.indexer.analysis.ruby;
24 
25 import java.io.IOException;
26 import java.util.LinkedList;
27 import java.util.Queue;
28 import java.util.Stack;
29 import java.util.regex.Matcher;
30 import java.util.regex.Pattern;
31 
32 import org.opengrok.indexer.analysis.JFlexJointLexer;
33 import org.opengrok.indexer.analysis.JFlexSymbolMatcher;
34 import org.opengrok.indexer.analysis.Resettable;
35 import org.opengrok.indexer.util.RegexUtils;
36 import org.opengrok.indexer.util.StringUtils;
37 import org.opengrok.indexer.web.HtmlConsts;
38 
39 /**
40  * Represents an abstract base class for Ruby lexers.
41  */
42 @SuppressWarnings("Duplicates")
43 abstract class RubyLexer extends JFlexSymbolMatcher
44         implements JFlexJointLexer, Resettable {
45 
46     // Using equivalent of {Local_nextchar} from RubyProductions.lexh
47     private static final Pattern HERE_TERMINATOR_MATCH = Pattern.compile(
48         "^[a-zA-Z0-9_\u00160-\u0255]+");
49 
50     private RubyLexerData dHead;
51 
52     private Stack<RubyLexerData> data;
53 
RubyLexer()54     RubyLexer() {
55         dHead = new RubyLexerData();
56     }
57 
58     /**
59      * Resets the instance to an initial state.
60      */
61     @Override
reset()62     public void reset() {
63         super.reset();
64         dHead = new RubyLexerData();
65         if (data != null) {
66             data.clear();
67         }
68     }
69 
70     /**
71      * Determine if the quote should end based on the first character of
72      * {@code capture}, recognizing quote-like operators that allow nesting to
73      * increase the nesting level if appropriate.
74      * <p>
75      * Calling this method has side effects to possibly modify {@code nqchar},
76      * {@code waitq}, or {@code endqchar}.
77      * @return true if the quote state should end
78      */
maybeEndQuote(String capture)79     public boolean maybeEndQuote(String capture) {
80         char c = capture.charAt(0);
81         if (c == dHead.endqchar) {
82             if (--dHead.nendqchar <= 0) {
83                 dHead.endqchar = '\0';
84                 dHead.nestqchar = '\0';
85                 return true;
86             }
87         } else if (dHead.nestqchar != '\0' && c == dHead.nestqchar) {
88             ++dHead.nendqchar;
89         }
90         return false;
91     }
92 
93     /**
94      * Gets a value indicating if modifiers are OK at the end of the last
95      * quote-like operator.
96      * @return true if modifiers are OK
97      */
areModifiersOK()98     public boolean areModifiersOK() {
99         // "m" named here a la Perl for the Ruby /pat/ operator
100         return "m".equals(dHead.qopname);
101     }
102 
103     /**
104      * Starts a quote-like operator as specified in a syntax fragment,
105      * {@code op}, and gives the {@code op} for the {@code listener} to take.
106      */
qop(String op, int namelength, boolean nointerp)107     public void qop(String op, int namelength, boolean nointerp)
108         throws IOException {
109         qop(true, op, namelength, nointerp);
110     }
111 
112     /**
113      * Starts a quote-like operator as specified in a syntax fragment,
114      * {@code op}, and gives the {@code capture} for the {@code listener} to
115      * take if {@code doWrite} is true.
116      */
qop(boolean doWrite, String capture, int namelength, boolean nointerp)117     public void qop(boolean doWrite, String capture, int namelength,
118         boolean nointerp) throws IOException {
119 
120         // N.b. the following will write anyway -- despite any `doWrite'
121         // setting -- if interpolation is truly ending, but that is OK as a
122         // quote-like operator is not starting in that case.
123         if (maybeEndInterpolation(capture)) {
124             return;
125         }
126 
127         // If namelength is positive, allow that a non-zero-width word boundary
128         // character may have needed to be matched since jflex does not conform
129         // with \b as a zero-width simple word boundary. Excise it into
130         // `boundary'.
131         String postop = capture;
132         dHead.qopname = "";
133         if (namelength > 0) {
134             dHead.qopname = capture.substring(0, namelength);
135             postop = capture.substring(dHead.qopname.length());
136         }
137         dHead.nendqchar = 1;
138         dHead.collateralCapture = null;
139 
140         char opc = postop.charAt(0);
141         setEndQuoteChar(opc);
142         setState(postop, nointerp);
143 
144         if (doWrite) {
145             offer(dHead.qopname);
146             skipSymbol();
147             disjointSpan(HtmlConsts.STRING_CLASS);
148             offer(postop);
149         }
150     }
151 
152     /**
153      * Sets the jflex state reflecting {@code postop} and {@code nointerp}.
154      */
setState(String postop, boolean nointerp)155     public void setState(String postop, boolean nointerp) {
156         int state;
157         boolean nolink = false;
158 
159         // "no link" for values in the rules for "string links" if `postop'
160         // starts path-like or with the e-mail delimiter.
161         if (StringUtils.startsWithFpathChar(postop) ||
162             postop.startsWith("@")) {
163             nolink = true;
164         }
165 
166         if (nointerp) {
167             state = nolink ? QUOxLxN() : QUOxN();
168         } else {
169             state = nolink ? QUOxL() : QUO();
170         }
171         maybeIntraState();
172         yypush(state);
173     }
174 
175     /**
176      * Sets a special {@code endqchar} if appropriate for {@code opener} or
177      * just tracks {@code opener} as {@code endqchar}.
178      */
setEndQuoteChar(char opener)179     private void setEndQuoteChar(char opener) {
180         switch (opener) {
181             case '[':
182                 dHead.nestqchar = opener;
183                 dHead.endqchar = ']';
184                 break;
185             case '<':
186                 dHead.nestqchar = opener;
187                 dHead.endqchar = '>';
188                 break;
189             case '(':
190                 dHead.nestqchar = opener;
191                 dHead.endqchar = ')';
192                 break;
193             case '{':
194                 dHead.nestqchar = opener;
195                 dHead.endqchar = '}';
196                 break;
197             default:
198                 dHead.nestqchar = '\0';
199                 dHead.endqchar = opener;
200                 break;
201         }
202     }
203 
204     /**
205      * Begins a quote-like state for a heuristic match of the shorthand // of
206      * m// where the {@code capture} ends with "/", begins with punctuation,
207      * and the intervening whitespace may contain LFs -- and writes the parts
208      * to output.
209      */
hqopPunc(String capture)210     public void hqopPunc(String capture) throws IOException {
211         if (maybeEndInterpolation(capture)) {
212             return;
213         }
214 
215         // `preceding' is everything before the '/'; 'lede' is the initial part
216         // before any whitespace; and `intervening' is any whitespace.
217         String preceding = capture.substring(0, capture.length() - 1);
218         String lede = preceding.stripTrailing();
219         String intervening = preceding.substring(lede.length());
220 
221         // OK to pass a fake "m/" with doWrite=false
222         qop(false, "m/", 1, false);
223         offer(lede);
224         takeWhitespace(intervening);
225         disjointSpan(HtmlConsts.STRING_CLASS);
226         offer("/");
227     }
228 
229     /**
230      * Begins a quote-like state for a heuristic match of the shorthand // of
231      * m// where the {@code capture} ends with "/", begins with an initial
232      * symbol, and the intervening whitespace may contain LFs -- and writes the
233      * parts to output.
234      */
hqopSymbol(String capture)235     public void hqopSymbol(String capture) throws IOException {
236         if (maybeEndInterpolation(capture)) {
237             return;
238         }
239 
240         // `preceding' is everything before the '/'; 'lede' is the initial part
241         // before any whitespace; and `intervening' is any whitespace.
242         String preceding = capture.substring(0, capture.length() - 1);
243         String lede = preceding.stripTrailing();
244         String intervening = preceding.substring(lede.length());
245 
246         // OK to pass a fake "m/" with doWrite=false
247         qop(false, "m/", 1, false);
248         offerSymbol(lede, 0, false);
249         takeWhitespace(intervening);
250         disjointSpan(HtmlConsts.STRING_CLASS);
251         offer("/");
252     }
253 
254     /**
255      * Write {@code whsp} to output -- if it does not contain any LFs then the
256      * full String is written; otherwise, pre-LF spaces are condensed as usual.
257      */
takeWhitespace(String whsp)258     private void takeWhitespace(String whsp) throws IOException {
259         int i;
260         if ((i = whsp.indexOf("\n")) == -1) {
261             offer(whsp);
262         } else {
263             int numlf = 1, off = i + 1;
264             while ((i = whsp.indexOf("\n", off)) != -1) {
265                 ++numlf;
266                 off = i + 1;
267             }
268             while (numlf-- > 0) {
269                 startNewLine();
270             }
271             if (off < whsp.length()) {
272                 offer(whsp.substring(off));
273             }
274         }
275     }
276 
277     /**
278      * Parses a Here-document declaration, and takes the {@code capture} using
279      * {@link RubyLexer#offer(java.lang.String)}. If the
280      * declaration is valid, {@code hereSettings} will have been appended.
281      */
hop(String capture)282     public void hop(String capture) throws IOException {
283         if (!capture.startsWith("<<")) {
284             throw new IllegalArgumentException("bad HERE: " + capture);
285         }
286 
287         offer(capture);
288         if (dHead.hereSettings == null) {
289             dHead.hereSettings = new LinkedList<>();
290         }
291 
292         String remaining = capture;
293         int i = 0;
294         HereDocSettings settings;
295         boolean indented = false;
296         boolean nointerp;
297         String terminator;
298 
299         String opener = remaining.substring(0, i + 2);
300         remaining = remaining.substring(opener.length());
301         if (remaining.startsWith("~") || remaining.startsWith("-")) {
302             indented = true;
303             remaining = remaining.substring(1);
304         }
305 
306         char c = remaining.charAt(0);
307         switch (c) {
308             case '\'':
309                 nointerp = true;
310                 remaining = remaining.substring(1);
311                 break;
312             case '`':
313                 // (Ruby, unlike Perl, does not recognize '"' here.)
314                 nointerp = false;
315                 remaining = remaining.substring(1);
316                 break;
317             default:
318                 c = '\0';
319                 nointerp = false;
320                 break;
321         }
322 
323         if (c != '\0') {
324             if ((i = remaining.indexOf(c)) < 1) {
325                 terminator = remaining;
326             } else {
327                 terminator = remaining.substring(0, i);
328             }
329         } else {
330             Matcher m = HERE_TERMINATOR_MATCH.matcher(remaining);
331             if (!m.find()) {
332                 return;
333             }
334             terminator = m.group(0);
335         }
336 
337         int state;
338         if (nointerp) {
339             state = indented ? HEREinxN() : HERExN();
340         } else {
341             state = indented ? HEREin() : HERE();
342         }
343         settings = new HereDocSettings(terminator, state);
344         dHead.hereSettings.add(settings);
345     }
346 
347     /**
348      * Pushes the first Here-document state if any declarations were parsed, or
349      * else does nothing.
350      * @return true if a Here state was pushed
351      */
maybeStartHere()352     public boolean maybeStartHere() throws IOException {
353         if (dHead.hereSettings != null && dHead.hereSettings.size() > 0) {
354             HereDocSettings settings = dHead.hereSettings.peek();
355             yypush(settings.state);
356             disjointSpan(HtmlConsts.STRING_CLASS);
357             return true;
358         }
359         return false;
360     }
361 
362     /**
363      * Process the {@code capture}, possibly ending the Here-document state
364      * just beforehand.
365      * @return true if the quote state ended
366      */
maybeEndHere(String capture)367     public boolean maybeEndHere(String capture) throws IOException {
368         String trimmed = capture.stripLeading();
369         HereDocSettings settings = dHead.hereSettings.peek();
370         assert settings != null;
371 
372         boolean didZspan = false;
373         if (trimmed.equals(settings.terminator)) {
374             disjointSpan(null);
375             didZspan = true;
376             dHead.hereSettings.remove();
377         }
378 
379         offer(capture);
380 
381         if (dHead.hereSettings.size() > 0) {
382             settings = dHead.hereSettings.peek();
383             yybegin(settings.state);
384             if (didZspan) {
385                 disjointSpan(HtmlConsts.STRING_CLASS);
386             }
387             return false;
388         } else {
389             yypop();
390             return true;
391         }
392     }
393 
394     /**
395      * Resets the interpolation counter to 1.
396      */
interpop()397     public void interpop() {
398         dHead.nendbrace = 1;
399     }
400 
401     /**
402      * Determine if the interpolation should end based on the first character
403      * of {@code capture}, recognizing tokens that increase the nesting level
404      * instead.
405      * <p>
406      * Calling this method has side effects to possibly modify
407      * {@code nendbrace}.
408      * @return true if the interpolation state should end
409      */
maybeEndInterpolation(String capture)410     public boolean maybeEndInterpolation(String capture) throws IOException {
411         if (dHead.nendbrace <= 0) {
412             return false;
413         }
414         if (capture.startsWith("}")) {
415             if (--dHead.nendbrace <= 0) {
416                 int rem = capture.length() - 1;
417                 String opener = capture.substring(0, 1);
418                 popData();
419                 yypop();
420                 disjointSpan(HtmlConsts.STRING_CLASS);
421                 offer(opener);
422                 if (rem > 0) {
423                     yypushback(rem);
424                 }
425                 return true;
426             }
427         } else if (capture.startsWith("{")) {
428             ++dHead.nendbrace;
429         }
430         return false;
431     }
432 
433     /**
434      * Take a series of module names separated by "::".
435      */
takeModules(String capture)436     public void takeModules(String capture) throws IOException {
437         final String SEP = "::";
438         int o = 0, i;
439         while (o < capture.length() && (i = capture.indexOf(SEP, o)) != -1) {
440             String module = capture.substring(o, i);
441             offerSymbol(module, o, false);
442             offer(SEP);
443             o = i + 2;
444         }
445         if (o < capture.length()) {
446             String module = capture.substring(o);
447             offerSymbol(module, o, false);
448         }
449     }
450 
451     /**
452      * Subtract the number of initial, non-word characters from the length of
453      * {@code capture}.
454      * @param capture a defined value
455      * @return the length of {@code value} minus the number of initial,
456      * non-word characters
457      */
nameLength(String capture)458     public int nameLength(String capture) {
459         int len = capture.length();
460         for (int i = 0; i < capture.length(); ++i) {
461             if (Character.isLetterOrDigit(capture.charAt(i))) {
462                 break;
463             }
464             --len;
465         }
466         return len;
467     }
468 
469     /**
470      * Gets a pattern to match the collateral capture for the current quoting
471      * state or null if there is no active quoting state.
472      * @return a defined pattern or null
473      */
getCollateralCapturePattern()474     public Pattern getCollateralCapturePattern() {
475         if (dHead.endqchar == '\0') {
476             return null;
477         }
478         if (dHead.collateralCapture != null) {
479             return dHead.collateralCapture;
480         }
481 
482         StringBuilder patb = new StringBuilder("[");
483         patb.append(Pattern.quote(String.valueOf(dHead.endqchar)));
484         if (dHead.nestqchar != '\0') {
485             patb.append(Pattern.quote(String.valueOf(dHead.nestqchar)));
486         }
487         patb.append("]");
488         patb.append(RegexUtils.getNotFollowingEscapePattern());
489         dHead.collateralCapture = Pattern.compile(patb.toString());
490         return dHead.collateralCapture;
491     }
492 
493     /**
494      * Calls {@link #phLOC()} if the yystate is not SCOMMENT or POD.
495      */
chkLOC()496     public void chkLOC() {
497         int yystate = yystate();
498         if (yystate != SCOMMENT() && yystate != POD()) {
499             phLOC();
500         }
501     }
502 
503     /**
504      * Subclasses must override to possibly set the INTRA state.
505      */
maybeIntraState()506     abstract void maybeIntraState();
507 
pushData()508     void pushData() {
509         if (data == null) {
510             data = new Stack<>();
511         }
512         data.push(dHead);
513         dHead = new RubyLexerData();
514     }
515 
popData()516     void popData() {
517         dHead = data.pop();
518     }
519 
520     /**
521      * Subclasses must override to get the constant value created by JFlex to
522      * represent QUOxLxN.
523      */
QUOxLxN()524     abstract int QUOxLxN();
525 
526     /**
527      * Subclasses must override to get the constant value created by JFlex to
528      * represent QUOxN.
529      */
QUOxN()530     abstract int QUOxN();
531 
532     /**
533      * Subclasses must override to get the constant value created by JFlex to
534      * represent QUOxL.
535      */
QUOxL()536     abstract int QUOxL();
537 
538     /**
539      * Subclasses must override to get the constant value created by JFlex to
540      * represent QUO.
541      */
QUO()542     abstract int QUO();
543 
544     /**
545      * Subclasses must override to get the constant value created by JFlex to
546      * represent HEREinxN.
547      */
HEREinxN()548     abstract int HEREinxN();
549 
550     /**
551      * Subclasses must override to get the constant value created by JFlex to
552      * represent HERExN.
553      */
HERExN()554     abstract int HERExN();
555 
556     /**
557      * Subclasses must override to get the constant value created by JFlex to
558      * represent HEREin.
559      */
HEREin()560     abstract int HEREin();
561 
562     /**
563      * Subclasses must override to get the constant value created by JFlex to
564      * represent HERE.
565      */
HERE()566     abstract int HERE();
567 
568     /**
569      * Subclasses must override to get the constant value created by JFlex to
570      * represent SCOMMENT.
571      */
SCOMMENT()572     abstract int SCOMMENT();
573 
574     /**
575      * Subclasses must override to get the constant value created by JFlex to
576      * represent POD.
577      */
POD()578     abstract int POD();
579 
580     private static class HereDocSettings {
581         private final String terminator;
582         private final int state;
583 
HereDocSettings(String terminator, int state)584         HereDocSettings(String terminator, int state) {
585             this.terminator = terminator;
586             this.state = state;
587         }
588     }
589 
590     private static class RubyLexerData {
591         private Queue<HereDocSettings> hereSettings;
592 
593         /**
594          * When matching a quoting construct like qq[], q(), m//, s```, etc.,
595          * the operator name (e.g., "m" or "tr") is stored. Unlike
596          * {@code endqchar} it is not unset when the quote ends, because it is
597          * useful to indicate if quote modifier characters are expected.
598          */
599         private String qopname;
600 
601         /**
602          * When matching a quoting construct like %w(), '', %[], etc., the
603          * terminating character is stored.
604          */
605         private char endqchar;
606 
607         /**
608          * When matching a quoting construct like %[], %w(), %&lt;&gt; etc.
609          * that nest, the begin character ('[', '&lt;', '(', or '{') is stored
610          * so that nesting is tracked and {@code nendqchar} is incremented
611          * appropriately.  Otherwise, {@code nestqchar} is set to '\0' if no
612          * nesting occurs.
613          */
614         private char nestqchar;
615 
616         /**
617          * When matching a quoting construct like %[], %w(), etc., the number
618          * of remaining end separators is stored. It starts at 1, and any
619          * nesting increases the value.
620          */
621         private int nendqchar;
622 
623         /**
624          * When interpolating inside a quoting construct, the number of
625          * remaining '}' is stored. It starts at 1, and any nesting increases
626          * the value.
627          */
628         private int nendbrace;
629 
630         /**
631          * When matching a quoting construct, a Pattern to identify collateral
632          * capture characters is stored.
633          */
634         private Pattern collateralCapture;
635     }
636 }
637