xref: /OpenGrok/opengrok-indexer/src/main/jflex/analysis/php/PhpSymbolTokenizer.lex (revision d219b4cea555a12b602d2d5518daa22134ad4879)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * See LICENSE.txt included in this distribution for the specific
9  * language governing permissions and limitations under the License.
10  *
11  * When distributing Covered Code, include this CDDL HEADER in each
12  * file and include the License file at LICENSE.txt.
13  * If applicable, add the following below this CDDL HEADER, with the
14  * fields enclosed by brackets "[]" replaced with your own identifying
15  * information: Portions Copyright [yyyy] [name of copyright owner]
16  *
17  * CDDL HEADER END
18  */
19 
20 /*
21  * Copyright (c) 2011, 2021, Oracle and/or its affiliates. All rights reserved.
22  * Portions Copyright (c) 2017-2018, Chris Fraire <cfraire@me.com>.
23  */
24 
25 /*
26  * Gets Php symbols - ignores comments, strings, keywords
27  */
28 
29 package org.opengrok.indexer.analysis.php;
30 
31 import java.util.Arrays;
32 import java.util.HashSet;
33 import java.util.Locale;
34 import java.util.Set;
35 import java.util.Stack;
36 import org.opengrok.indexer.analysis.JFlexSymbolMatcher;
37 %%
38 %public
39 %class PhpSymbolTokenizer
40 %extends JFlexSymbolMatcher
41 %unicode
42 %int
43 %include ../CommonLexer.lexh
44 %char
45 %ignorecase
46 %{
47   private final static Set<String> PSEUDO_TYPES; // lower-case type names that the doc-comment type rules skip instead of reporting as symbols
48   private Stack<String> docLabels = new Stack<String>(); // terminator labels pushed when a heredoc/nowdoc opens and popped when its terminator line matches
49 
50   static {
51     PSEUDO_TYPES = new HashSet<String>(Arrays.asList(
52         new String[] {
53             "string", "integer", "int", "boolean", "bool", "float", "double",
54             "object", "mixed", "array", "resource", "void", "null", "callback",
55             "false", "true", "self", "callable"
56         }
57     ));
58   }
59 
60   @Override
clearStack()61   protected void clearStack() { // besides the inherited clearing, forgets all pending heredoc/nowdoc labels
62       super.clearStack();
63       docLabels.clear();
64   }
65 
isTabOrSpace(int i)66   private boolean isTabOrSpace(int i) { // true when the character at offset i of the current match is a TAB or a SPACE
67     return yycharat(i) == '\t' || yycharat(i) == ' ';
68   }
69 
isHtmlState(int state)70   private static boolean isHtmlState(int state) { // true only for YYINITIAL, the state whose single dedicated rule waits for an {OpeningTag}
71     return state == YYINITIAL;
72   }
73 %}
74 
// A name: first character is an ASCII letter, '_' or any code point from
// U+007F upwards; the following characters may additionally be digits.
75 Identifier = [a-zA-Z_\u007F-\u10FFFF] [a-zA-Z0-9_\u007F-\u10FFFF]*
76 
// A file name: a letter, any number of FNameChar characters (macro defined in
// an included .lexh file), a dot and one of the listed extensions.
77 File = [a-zA-Z]{FNameChar}* "." ("php"|"php3"|"php4"|"phps"|"phtml"|"inc"|"diff"|"patch")
78 
// Numeric literal macros. {Number} is consumed silently in IN_SCRIPT and
// inside "[...]" subscripts in STRINGVAR.
// Inside a character class '|' is an ordinary character, not alternation, so
// the class for the binary prefix must list only 'b' and 'B'; otherwise the
// text "0|1" would be taken as one binary literal.
79 BinaryNumber = 0[bB][01]+
80 OctalNumber = 0[0-7]+
// Needs at least two digits; a single digit is still covered by the bare
// [0-9]+ alternative of FloatNumber.
81 DecimalNumber = [1-9][0-9]+
82 HexadecimalNumber = 0[xX][0-9a-fA-F]+
// ".5", "5.", "5.5" or plain digits, each with an optional exponent.
83 FloatNumber = (([0-9]* "." [0-9]+) | ([0-9]+ "." [0-9]*) | [0-9]+)([eE][+-]?[0-9]+)?
// Any of the above with an optional sign.
84 Number = [+-]?({BinaryNumber}|{OctalNumber}|{DecimalNumber}|{HexadecimalNumber}|{FloatNumber})
85 
// "<?", "<?php" or "<?=" switches from YYINITIAL to IN_SCRIPT; "?>" unwinds
// back to YYINITIAL.
86 OpeningTag = ("<?" "php"?) | "<?="
87 ClosingTag = "?>"
88 
// Type names that may appear between parentheses as a cast; a whole cast
// expression is consumed in IN_SCRIPT without reporting the type name.
89 CastTypes = "int"|"integer"|"real"|"double"|"float"|"string"|"binary"|"array"
90             |"object"|"bool"|"boolean"|"unset"
91 
// Backslash escapes that are skipped inside interpolating strings
// (STRING, BACKQUOTE, HEREDOC) and inside QSTRING respectively.
92 DoubleQuoteEscapeSequences = \\ (([nrtfve\\$]) | ([xX] [0-9a-fA-F]{1,2}) |  ([0-7]{1,3}))
93 SingleQuoteEscapeSequences = \\ [\\\']
94 
// What must directly precede the '@' of a doc-comment tag: a '*' or whitespace.
95 DocPreviousChar = "*" | {WhspChar}+
96 
// Doc-comment tags grouped by what follows them: a type, a type and then a
// "$name", or only a "$name".
97 DocParamWithType = "return" | "throws" | "throw" | "var" | "see"  //"see" can take a URL
98 DocParamWithTypeAndName = "param" | "global" | "property" | "property-read"
99                           | "property-write"
100 DocParamWithName = "uses"
101 //method needs special treatment
102 
103 %state IN_SCRIPT STRING SCOMMENT HEREDOC NOWDOC COMMENT QSTRING BACKQUOTE STRINGEXPR STRINGVAR
104 %state DOCCOMMENT DOCCOM_TYPE_THEN_NAME DOCCOM_NAME DOCCOM_TYPE
105 
106 %include ../Common.lexh
107 %include ../CommonURI.lexh
108 %include ../CommonPath.lexh
109 %%
110 
111 <YYINITIAL> {
    /* An opening tag starts PHP code. yypush comes from the superclass;
     * presumably it remembers the current state so a later yypop returns here. */
112     {OpeningTag}    { yypush(IN_SCRIPT); }
113 }
114 
115 
116 <IN_SCRIPT> {
117     "$" {Identifier} {
118         //we ignore keywords if the identifier starts with one of variable chars
        // The symbol is reported without its leading '$', and the offset is
        // advanced by one to match.
119         onSymbolMatched(yytext().substring(1), yychar + 1);
120         return yystate();
121     }
122 
123     {Identifier} {
        // A bare identifier is reported unless Consts.kwd (defined outside this
        // file, presumably the PHP keyword set) contains the text exactly as matched.
124         if (!Consts.kwd.contains(yytext())) {
125             onSymbolMatched(yytext(), yychar);
126             return yystate();
127         }
128     }
129 
    /* A cast such as "(int)" is consumed as a whole, so the type name inside
     * never reaches the {Identifier} rule. */
130     \( {WhspChar}* {CastTypes} {WhspChar}* \) { }
131 
    /* String openers, each with an optional 'b' prefix where shown, enter the
     * state that scans the corresponding kind of string. */
132     b? \" { yypush(STRING); }
133 
134     b? \' { yypush(QSTRING); }
135 
136     ` { yypush(BACKQUOTE); }
137 
138     b? "<<<" {WhspChar}* ({Identifier} | (\'{Identifier}\') | (\"{Identifier}\")){EOL} {
        // Opens a heredoc or nowdoc: remembers the terminator label in
        // docLabels and enters the state that scans the body.
        //
        // i = offset of the first character after "<<<" and any tabs/spaces,
        // j = offset of the last character before the line terminator.
        // %ignorecase lets the optional prefix match as 'b' or 'B', so its
        // presence is detected by the match not starting with '<'.
139         int i = yycharat(0) == '<' ? 3 : 4, j = yylength()-1;
140         while (isTabOrSpace(i)) { i++; }
141         while (yycharat(j) == '\n' || yycharat(j) == '\r') { j--; }
142 
        // Only a single-quoted label declares a nowdoc. A double-quoted label
        // declares a heredoc, exactly like a bare label, so its body must be
        // scanned for interpolated variables.
143         boolean quoted = yycharat(i) == '\'' || yycharat(i) == '"';
144         yypush(yycharat(i) == '\'' ? NOWDOC : HEREDOC);
        // A quoted label is stored without its surrounding quote characters so
        // that it compares equal to the bare terminator line.
145         this.docLabels.push(quoted ? yytext().substring(i+1, j) : yytext().substring(i, j+1));
152     }
153 
154     {Number}   { }
155 
156     "#"|"//"   { yypush(SCOMMENT); }
157     "/**"      { yypush(DOCCOMMENT); }
158     "/*"       { yypush(COMMENT); }
159 
160     \{         { yypush(IN_SCRIPT); }
161     \}         {
162         if (!this.stack.empty() && !isHtmlState(this.stack.peek()))
163             yypop(); //may pop STRINGEXPR/HEREDOC/BACKQUOTE
164     }
165 
166     {ClosingTag}    { while (!isHtmlState(yystate())) yypop(); }
167 } //end of IN_SCRIPT
168 
169 <STRING> {
    /* An escaped quote stays inside the string; an unescaped one ends it. */
170     \\\" { }
171     \" { yypop(); }
172 }
173 
174 <BACKQUOTE> {
    /* Same scheme for backquoted strings. */
175     "\\`" { }
176     "`" { yypop(); }
177 }
178 
179 <STRING, BACKQUOTE, HEREDOC> {
    /* An escaped brace and the escape sequences are consumed first so that
     * their characters cannot start an interpolation. */
180     "\\{" { }
181 
182     {DoubleQuoteEscapeSequences} {}
183 
    /* "$" starts a simple variable reference, "${" a braced expression. */
184     "$"     { yypush(STRINGVAR); }
185 
186     "${"    { yypush(STRINGEXPR); }
187 
188     /* ${ is different from {$ -- for instance {$foo->bar[1]} is valid
189      * but ${foo->bar[1]} is not. ${ only enters the full blown scripting state
190      * when {Identifier}[ is found (see the PHP scanner). The parser seems to
191      * put more restrictions on the {$ scripting mode than on the
192      * "${" {Identifier} "[" scripting mode, but that's not relevant here */
193     "{$" {
        // Give the '$' back so that IN_SCRIPT sees the whole "$name".
194         yypushback(1);
195         yypush(IN_SCRIPT);
196     }
197 }
198 
199 <QSTRING> {
    /* Escaped backslash or quote stays inside the single-quoted string;
     * an unescaped quote ends it. */
200     {SingleQuoteEscapeSequences} { }
201     \'      { yypop(); }
202 }
203 
204 <HEREDOC, NOWDOC>^{Identifier} ";"? {EOL}  {
    // A line consisting of an identifier, an optional ';' and the line end
    // closes the heredoc/nowdoc only when the identifier equals the label on
    // top of docLabels (exact, case-sensitive comparison). Otherwise the line
    // is ordinary body text and nothing changes.
205     int i = yylength() - 1;
    // Step back over the line terminator and the optional ';' so that
    // [0, i] covers just the identifier.
206     while (yycharat(i) == '\n' || yycharat(i) == '\r') { i--; }
207     if (yycharat(i) == ';') { i--; }
208     if (yytext().substring(0, i+1).equals(this.docLabels.peek())) {
209         this.docLabels.pop();
210         yypop();
211     }
212 }
213 
214 <STRING, QSTRING, BACKQUOTE, HEREDOC, NOWDOC>{WhspChar}* {EOL} { /* trailing whitespace plus the line end inside a string body is consumed silently */ }
215 
216 <STRINGVAR> {
    /* Entered right after a '$' inside an interpolating string. The variable
     * name is reported and the state is kept, so that one following subscript
     * or property access can still be recognized. */
217     {Identifier} {
218         onSymbolMatched(yytext(), yychar);
219         return yystate();
220     }
221 
222     \[ {Number} \] {
223         yypop(); //because "$arr[0][1]" is the same as $arr[0] . "[1]"
224     }
225 
226     \[ {Identifier} \] {
227         //then the identifier is actually a string!
228         yypop();
229     }
230 
231     \[ "$" {Identifier} \] {
        // Report the name between "[$" and "]"; offset is shifted past "[$".
232         onSymbolMatched(yytext().substring(2, yylength()-1), yychar + 2);
233         yypop();
234         return yystate();
235     }
236 
237     "->" {Identifier} {
        // Report the property name that follows "->".
238         onSymbolMatched(yytext().substring(2), yychar + 2);
239         yypop(); //because "$arr->a[0]" is the same as $arr->a . "[0]"
240         return yystate();
241     }
242 
    /* Anything else ends the variable reference: the character is given back
     * and handled by the state that was active before. */
243     [^]          { yypushback(1); yypop(); }
244 }
245 
246 <STRINGEXPR> {
    /* Entered after "${" inside an interpolating string. An identifier is
     * reported as a symbol, '}' returns to the enclosing string state. */
247     {Identifier} {
248         onSymbolMatched(yytext(), yychar);
249         return yystate();
250     }
251     \}  { yypop(); }
252     \[  { yybegin(IN_SCRIPT); } /* don't push. when we find '}'
253                                                  * and we pop we want to go to
254                                                  * STRING/HEREDOC, not back to
255                                                  * STRINGEXPR */
256 }
257 
258 <SCOMMENT> {
    /* A single-line comment ends at the line end, or at a closing tag, which
     * additionally unwinds every pushed state until YYINITIAL is current. */
259     {ClosingTag}    {
260         while (!isHtmlState(yystate())) yypop();
261     }
262     {WhspChar}* {EOL} {
263         yypop();
264     }
265 }
266 
267 <DOCCOMMENT> {
    /* Each rule matches "@tag" preceded by '*' or whitespace and followed by
     * whitespace, then switches (without pushing) to the state that expects
     * what that group of tags is followed by. */
268     /* change relatively to xref -- we also consume the whitespace after */
269     {DocPreviousChar} "@" {DocParamWithType} {WhspChar}+    {
270         yybegin(DOCCOM_TYPE);
271     }
272 
273     {DocPreviousChar} "@" {DocParamWithTypeAndName} {WhspChar}+    {
274         yybegin(DOCCOM_TYPE_THEN_NAME);
275     }
276 
277     {DocPreviousChar} "@" {DocParamWithName} {WhspChar}+    {
278         yybegin(DOCCOM_NAME);
279     }
280 }
281 
282 <DOCCOM_TYPE_THEN_NAME, DOCCOM_TYPE> {
283     /* The rules here had to be substantially changed because we cannot find
284      * several symbols in one match. This is substantially more lax than
285      * the xref rules */
286 
    /* Brackets, '|' and parentheses inside a type expression are skipped. */
287     [\[\]\|\(\)] { }
288 
    /* Whitespace ends the type: continue with the name if one is expected,
     * otherwise go back to plain doc-comment scanning. */
289     {WhspChar}+    {
290         yybegin(yystate() == DOCCOM_TYPE_THEN_NAME ? DOCCOM_NAME : DOCCOMMENT);
291     }
292 
293     {Identifier} {
        // Type names are reported unless their lower-cased form is one of
        // PSEUDO_TYPES.
294         if (!PSEUDO_TYPES.contains(yytext().toLowerCase(Locale.ROOT))) {
295             onSymbolMatched(yytext(), yychar);
296             return yystate();
297         }
298     }
299 
    /* Any other character ends the type; it is given back and rescanned in
     * DOCCOMMENT. */
300     [^] { yybegin(DOCCOMMENT); yypushback(1); }
301 }
302 
303 <DOCCOM_NAME> {
    /* A "$name" is reported without the '$' and scanning returns to
     * DOCCOMMENT; any other character is given back and rescanned there. */
304     "$" {Identifier} {
305         onSymbolMatched(yytext().substring(1), yychar + 1);
306         yybegin(DOCCOMMENT);
307         return yystate();
308     }
309 
310     [^] { yybegin(DOCCOMMENT); yypushback(1); }
311 }
312 
313 <COMMENT, DOCCOMMENT> {
    /* Line ends are skipped; the comment terminator returns to the state
     * that was active when the comment was opened. */
314     {WhspChar}* {EOL} {  }
315     "*/"    { yypop(); }
316 }
317 
318 <YYINITIAL, IN_SCRIPT, STRING, QSTRING, BACKQUOTE, HEREDOC, NOWDOC, SCOMMENT, COMMENT, DOCCOMMENT, STRINGEXPR, STRINGVAR> {
    /* Fallbacks for the listed states: line ends and any single character
     * other than '\n' are consumed without producing anything. */
319     {WhspChar}* {EOL} { }
320     [^\n]       { }
321 }
322 
323 <YYINITIAL, SCOMMENT, COMMENT, DOCCOMMENT, STRING, QSTRING, BACKQUOTE, HEREDOC, NOWDOC> {
    /* In non-script states, paths, file names, URIs and e-mail-like tokens are
     * each consumed as one unit with no action, presumably so that their
     * characters are not picked up piecemeal by other rules of those states. */
324     {FPath} { }
325 
326     {File} { }
327 
328     {BrowseableURI}    { }
329 
330     {FNameChar}+ "@" {FNameChar}+ "." {FNameChar}+
331             { }
332 }
333