/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2011, 2021, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2017-2018, Chris Fraire <cfraire@me.com>.
 */

/*
 * Gets Php symbols - ignores comments, strings, keywords
 */

package org.opengrok.indexer.analysis.php;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.Stack;
import org.opengrok.indexer.analysis.JFlexSymbolMatcher;
%%
%public
%class PhpSymbolTokenizer
%extends JFlexSymbolMatcher
%unicode
%int
%include ../CommonLexer.lexh
%char
%ignorecase
%{
    /**
     * Type names that the doc-comment type rules skip instead of reporting
     * as symbols (compared after lower-casing with {@link Locale#ROOT}).
     */
    private static final Set<String> PSEUDO_TYPES = new HashSet<>(Arrays.asList(
        "string", "integer", "int", "boolean", "bool", "float", "double",
        "object", "mixed", "array", "resource", "void", "null", "callback",
        "false", "true", "self", "callable"
    ));

    /**
     * Labels of the heredoc/nowdoc blocks currently open; the top entry is
     * the label that the terminator rule compares against.
     */
    private final Stack<String> docLabels = new Stack<>();

    /**
     * Delegates to the superclass and additionally discards any pending
     * heredoc/nowdoc labels.
     */
    @Override
    protected void clearStack() {
        super.clearStack();
        docLabels.clear();
    }

    /**
     * Tests the character at offset {@code i} of the current match.
     * @param i offset into the matched text
     * @return {@code true} if that character is a tab or a space
     */
    private boolean isTabOrSpace(int i) {
        return yycharat(i) == '\t' || yycharat(i) == ' ';
    }

    /**
     * Tests whether a lexical state is the one used outside PHP tags.
     * @param state a lexical state
     * @return {@code true} if {@code state} is {@code YYINITIAL}
     */
    private static boolean isHtmlState(int state) {
        return state == YYINITIAL;
    }
%}

Identifier = [a-zA-Z_\u007F-\u10FFFF] [a-zA-Z0-9_\u007F-\u10FFFF]*

File = [a-zA-Z]{FNameChar}* "." ("php"|"php3"|"php4"|"phps"|"phtml"|"inc"|"diff"|"patch")

//no '|' inside the class: within [...] it would be a literal bar, not alternation
BinaryNumber = 0[bB][01]+
OctalNumber = 0[0-7]+
DecimalNumber = [1-9][0-9]+
HexadecimalNumber = 0[xX][0-9a-fA-F]+
FloatNumber = (([0-9]* "." [0-9]+) | ([0-9]+ "." [0-9]*) | [0-9]+)([eE][+-]?[0-9]+)?
Number = [+-]?({BinaryNumber}|{OctalNumber}|{DecimalNumber}|{HexadecimalNumber}|{FloatNumber})

OpeningTag = ("<?" "php"?) | "<?="
ClosingTag = "?>"

CastTypes = "int"|"integer"|"real"|"double"|"float"|"string"|"binary"|"array"
    |"object"|"bool"|"boolean"|"unset"

DoubleQuoteEscapeSequences = \\ (([nrtfve\\$]) | ([xX] [0-9a-fA-F]{1,2}) | ([0-7]{1,3}))
SingleQuoteEscapeSequences = \\ [\\\']

DocPreviousChar = "*" | {WhspChar}+

DocParamWithType = "return" | "throws" | "throw" | "var" | "see" //"see" can take a URL
DocParamWithTypeAndName = "param" | "global" | "property" | "property-read"
    | "property-write"
DocParamWithName = "uses"
//method needs special treatment

%state IN_SCRIPT STRING SCOMMENT HEREDOC NOWDOC COMMENT QSTRING BACKQUOTE STRINGEXPR STRINGVAR
%state DOCCOMMENT DOCCOM_TYPE_THEN_NAME DOCCOM_NAME DOCCOM_TYPE

%include ../Common.lexh
%include ../CommonURI.lexh
%include ../CommonPath.lexh
%%

<YYINITIAL> {
    {OpeningTag}    { yypush(IN_SCRIPT); }
}


<IN_SCRIPT> {
    "$" {Identifier}    {
        //we ignore keywords if the identifier starts with one of variable chars
        onSymbolMatched(yytext().substring(1), yychar + 1);
        return yystate();
    }

    {Identifier}    {
        if (!Consts.kwd.contains(yytext())) {
            onSymbolMatched(yytext(), yychar);
            return yystate();
        }
    }

    /* a cast such as "(int)" is consumed whole so its type name is not a symbol */
    \( {WhspChar}* {CastTypes} {WhspChar}* \)    { }

    b? \"    { yypush(STRING); }

    b? \'    { yypush(QSTRING); }

    `    { yypush(BACKQUOTE); }

    b? "<<<" {WhspChar}* ({Identifier} | (\'{Identifier}\') | (\"{Identifier}\")){EOL}    {
        // %ignorecase lets the optional binary-string prefix match as 'b'
        // or 'B', so detect it by the absence of a leading '<'.
        int i = yycharat(0) == '<' ? 3 : 4;
        int j = yylength() - 1;
        while (isTabOrSpace(i)) { i++; }
        while (yycharat(j) == '\n' || yycharat(j) == '\r') { j--; }

        if (yycharat(i) == '\'') {
            // <<<'LABEL' is a nowdoc: no interpolation inside
            yypush(NOWDOC);
            this.docLabels.push(yytext().substring(i + 1, j));
        } else if (yycharat(i) == '"') {
            // <<<"LABEL" is still a heredoc; only the quotes are stripped
            yypush(HEREDOC);
            this.docLabels.push(yytext().substring(i + 1, j));
        } else {
            yypush(HEREDOC);
            this.docLabels.push(yytext().substring(i, j + 1));
        }
    }

    {Number}    { }

    "#"|"//"    { yypush(SCOMMENT); }
    "/**"       { yypush(DOCCOMMENT); }
    "/*"        { yypush(COMMENT); }

    \{    { yypush(IN_SCRIPT); }
    \}    {
        if (!this.stack.empty() && !isHtmlState(this.stack.peek())) {
            yypop(); //may pop STRINGEXPR/HEREDOC/BACKQUOTE
        }
    }

    {ClosingTag}    {
        while (!isHtmlState(yystate())) {
            yypop();
        }
    }
} //end of IN_SCRIPT

<STRING> {
    \\\"    { }
    \"      { yypop(); }
}

<BACKQUOTE> {
    "\\`"    { }
    "`"      { yypop(); }
}

<STRING, BACKQUOTE, HEREDOC> {
    "\\{"    { }

    {DoubleQuoteEscapeSequences}    { }

    "$"     { yypush(STRINGVAR); }

    "${"    { yypush(STRINGEXPR); }

    /* ${ is different from {$ -- for instance {$foo->bar[1]} is valid
     * but ${foo->bar[1]} is not. ${ only enters the full blown scripting state
     * when {Identifier}[ is found (see the PHP scanner). The parser seems to
     * put more restrictions on the {$ scripting mode than on the
     * "${" {Identifier} "[" scripting mode, but that's not relevant here */
    "{$"    {
        yypushback(1);
        yypush(IN_SCRIPT);
    }
}

<QSTRING> {
    {SingleQuoteEscapeSequences}    { }
    \'    { yypop(); }
}

<HEREDOC, NOWDOC>^{Identifier} ";"? {EOL}    {
    // Strip the line ending and an optional ';' to isolate the candidate label.
    int i = yylength() - 1;
    while (yycharat(i) == '\n' || yycharat(i) == '\r') { i--; }
    if (yycharat(i) == ';') { i--; }
    if (yytext().substring(0, i + 1).equals(this.docLabels.peek())) {
        this.docLabels.pop();
        yypop();
    }
}

<STRING, QSTRING, BACKQUOTE, HEREDOC, NOWDOC>{WhspChar}* {EOL}    { }

<STRINGVAR> {
    {Identifier}    {
        onSymbolMatched(yytext(), yychar);
        return yystate();
    }

    \[ {Number} \]    {
        yypop(); //because "$arr[0][1]" is the same as $arr[0] . "[1]"
    }

    \[ {Identifier} \]    {
        //then the identifier is actually a string!
        yypop();
    }

    \[ "$" {Identifier} \]    {
        onSymbolMatched(yytext().substring(2, yylength() - 1), yychar + 2);
        yypop();
        return yystate();
    }

    "->" {Identifier}    {
        onSymbolMatched(yytext().substring(2), yychar + 2);
        yypop(); //because "$arr->a[0]" is the same as $arr->a . "[0]"
        return yystate();
    }

    [^]    { yypushback(1); yypop(); }
}

<STRINGEXPR> {
    {Identifier}    {
        onSymbolMatched(yytext(), yychar);
        return yystate();
    }
    \}    { yypop(); }
    \[    { yybegin(IN_SCRIPT); } /* don't push. when we find '}'
                                   * and we pop we want to go to
                                   * STRING/HEREDOC, not back to
                                   * STRINGEXPR */
}

<SCOMMENT> {
    {ClosingTag}    {
        while (!isHtmlState(yystate())) {
            yypop();
        }
    }
    {WhspChar}* {EOL}    {
        yypop();
    }
}

<DOCCOMMENT> {
    /* change relatively to xref -- we also consume the whitespace after */
    {DocPreviousChar} "@" {DocParamWithType} {WhspChar}+    {
        yybegin(DOCCOM_TYPE);
    }

    {DocPreviousChar} "@" {DocParamWithTypeAndName} {WhspChar}+    {
        yybegin(DOCCOM_TYPE_THEN_NAME);
    }

    {DocPreviousChar} "@" {DocParamWithName} {WhspChar}+    {
        yybegin(DOCCOM_NAME);
    }
}

<DOCCOM_TYPE_THEN_NAME, DOCCOM_TYPE> {
    /* The rules here had to be substantially changed because we cannot find
     * several symbols in one match. This is substantially more lax than
     * the xref rules */

    [\[\]\|\(\)]    { }

    {WhspChar}+    {
        yybegin(yystate() == DOCCOM_TYPE_THEN_NAME ? DOCCOM_NAME : DOCCOMMENT);
    }

    {Identifier}    {
        if (!PSEUDO_TYPES.contains(yytext().toLowerCase(Locale.ROOT))) {
            onSymbolMatched(yytext(), yychar);
            return yystate();
        }
    }

    [^]    { yybegin(DOCCOMMENT); yypushback(1); }
}

<DOCCOM_NAME> {
    "$" {Identifier}    {
        onSymbolMatched(yytext().substring(1), yychar + 1);
        yybegin(DOCCOMMENT);
        return yystate();
    }

    [^]    { yybegin(DOCCOMMENT); yypushback(1); }
}

<COMMENT, DOCCOMMENT> {
    {WhspChar}* {EOL}    { }
    "*/"    { yypop(); }
}

<YYINITIAL, IN_SCRIPT, STRING, QSTRING, BACKQUOTE, HEREDOC, NOWDOC, SCOMMENT, COMMENT, DOCCOMMENT, STRINGEXPR, STRINGVAR> {
    {WhspChar}* {EOL}    { }
    [^\n]    { }
}

<YYINITIAL, SCOMMENT, COMMENT, DOCCOMMENT, STRING, QSTRING, BACKQUOTE, HEREDOC, NOWDOC> {
    {FPath}    { }

    {File}    { }

    {BrowseableURI}    { }

    {FNameChar}+ "@" {FNameChar}+ "." {FNameChar}+
        { }
}