/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2011, 2021, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2017-2018, Chris Fraire <cfraire@me.com>.
 */

/*
 * Gets Php symbols - ignores comments, strings, keywords
 */

package org.opengrok.indexer.analysis.php;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.Stack;
import org.opengrok.indexer.analysis.JFlexSymbolMatcher;
%%
%public
%class PhpSymbolTokenizer
%extends JFlexSymbolMatcher
%unicode
%int
%include ../CommonLexer.lexh
%char
%ignorecase
%{
    /**
     * Type names that the doc-comment type rules skip instead of reporting
     * as symbols. Entries are lower-case; matches are lower-cased with
     * {@link Locale#ROOT} before the lookup.
     */
    private static final Set<String> PSEUDO_TYPES = new HashSet<String>(Arrays.asList(
        "string", "integer", "int", "boolean", "bool", "float", "double",
        "object", "mixed", "array", "resource", "void", "null", "callback",
        "false", "true", "self", "callable"
    ));

    /**
     * Labels of the heredoc/nowdoc blocks that are currently open. A label is
     * pushed by the {@code <<<} opener rule and popped by the terminator rule
     * once a line consisting of the same label is seen.
     */
    private final Stack<String> docLabels = new Stack<String>();

    /**
     * Clears the lexer state stack (via the superclass) and also forgets any
     * pending heredoc/nowdoc labels.
     */
    @Override
    protected void clearStack() {
        super.clearStack();
        docLabels.clear();
    }

    /**
     * Tests the character at the given offset of the current match.
     * @param i zero-based offset into the matched text
     * @return {@code true} if that character is a tab or a space
     */
    private boolean isTabOrSpace(int i) {
        return yycharat(i) == '\t' || yycharat(i) == ' ';
    }

    /**
     * Tests whether a lexer state is the one used outside of PHP tags.
     * @param state a lexer state constant
     * @return {@code true} only for {@code YYINITIAL}
     */
    private static boolean isHtmlState(int state) {
        return state == YYINITIAL;
    }
%}

Identifier = [a-zA-Z_\u007F-\u10FFFF] [a-zA-Z0-9_\u007F-\u10FFFF]*

File = [a-zA-Z]{FNameChar}* "." ("php"|"php3"|"php4"|"phps"|"phtml"|"inc"|"diff"|"patch")

// A "|" inside a character class is a literal, so the prefix is spelled [bB].
BinaryNumber = 0[bB][01]+
OctalNumber = 0[0-7]+
DecimalNumber = [1-9][0-9]+
HexadecimalNumber = 0[xX][0-9a-fA-F]+
FloatNumber = (([0-9]* "." [0-9]+) | ([0-9]+ "." [0-9]*) | [0-9]+)([eE][+-]?[0-9]+)?
Number = [+-]?({BinaryNumber}|{OctalNumber}|{DecimalNumber}|{HexadecimalNumber}|{FloatNumber})

OpeningTag = ("<?" "php"?) | "<?="
ClosingTag = "?>"

CastTypes = "int"|"integer"|"real"|"double"|"float"|"string"|"binary"|"array"
    |"object"|"bool"|"boolean"|"unset"

DoubleQuoteEscapeSequences = \\ (([nrtfve\\$]) | ([xX] [0-9a-fA-F]{1,2}) | ([0-7]{1,3}))
SingleQuoteEscapeSequences = \\ [\\\']

DocPreviousChar = "*" | {WhspChar}+

DocParamWithType = "return" | "throws" | "throw" | "var" | "see" //"see" can take a URL
DocParamWithTypeAndName = "param" | "global" | "property" | "property-read"
    | "property-write"
DocParamWithName = "uses"
//method needs special treatment

%state IN_SCRIPT STRING SCOMMENT HEREDOC NOWDOC COMMENT QSTRING BACKQUOTE STRINGEXPR STRINGVAR
%state DOCCOMMENT DOCCOM_TYPE_THEN_NAME DOCCOM_NAME DOCCOM_TYPE

%include ../Common.lexh
%include ../CommonURI.lexh
%include ../CommonPath.lexh
%%

<YYINITIAL> {
    {OpeningTag}    { yypush(IN_SCRIPT); }
}


<IN_SCRIPT> {
    "$" {Identifier}    {
        //we ignore keywords if the identifier starts with one of variable chars
        onSymbolMatched(yytext().substring(1), yychar + 1);
        return yystate();
    }

    {Identifier}    {
        if (!Consts.kwd.contains(yytext())) {
            onSymbolMatched(yytext(), yychar);
            return yystate();
        }
    }

    \( {WhspChar}* {CastTypes} {WhspChar}* \)    { }

    b? \"    { yypush(STRING); }

    b? \'    { yypush(QSTRING); }

    `    { yypush(BACKQUOTE); }

    b? "<<<" {WhspChar}* ({Identifier} | (\'{Identifier}\') | (\"{Identifier}\")){EOL}    {
        // Skip "<<<" and, when the match does not start with '<', the
        // one-character binary prefix too. Testing for '<' rather than 'b'
        // keeps the offset right when %ignorecase lets the prefix match "B".
        int i = yycharat(0) == '<' ? 3 : 4, j = yylength() - 1;
        while (isTabOrSpace(i)) { i++; }
        // Step back over the line terminator so j indexes the last
        // character of the (possibly quoted) label.
        while (yycharat(j) == '\n' || yycharat(j) == '\r') { j--; }

        if (yycharat(i) == '\'') {
            // <<<'LABEL' is a nowdoc: no interpolation.
            yypush(NOWDOC);
            this.docLabels.push(yytext().substring(i + 1, j));
        } else if (yycharat(i) == '"') {
            // <<<"LABEL" is a heredoc with the label in double quotes, so
            // its body is scanned for variables like an unquoted heredoc.
            yypush(HEREDOC);
            this.docLabels.push(yytext().substring(i + 1, j));
        } else {
            yypush(HEREDOC);
            this.docLabels.push(yytext().substring(i, j + 1));
        }
    }

    {Number}    { }

    "#"|"//"    { yypush(SCOMMENT); }
    "/**"       { yypush(DOCCOMMENT); }
    "/*"        { yypush(COMMENT); }

    \{    { yypush(IN_SCRIPT); }
    \}    {
        if (!this.stack.empty() && !isHtmlState(this.stack.peek())) {
            yypop(); //may pop STRINGEXPR/HEREDOC/BACKQUOTE
        }
    }

    {ClosingTag}    {
        while (!isHtmlState(yystate())) {
            yypop();
        }
    }
} //end of IN_SCRIPT

<STRING> {
    \\\"    { }
    \"      { yypop(); }
}

<BACKQUOTE> {
    "\\`"    { }
    "`"      { yypop(); }
}

<STRING, BACKQUOTE, HEREDOC> {
    "\\{"    { }

    {DoubleQuoteEscapeSequences}    {}

    "$"    { yypush(STRINGVAR); }

    "${"    { yypush(STRINGEXPR); }

    /* ${ is different from {$ -- for instance {$foo->bar[1]} is valid
     * but ${foo->bar[1]} is not. ${ only enters the full blown scripting state
     * when {Identifier}[ is found (see the PHP scanner). The parser seems to
     * put more restrictions on the {$ scripting mode than on the
     * "${" {Identifier} "[" scripting mode, but that's not relevant here */
    "{$"    {
        yypushback(1);
        yypush(IN_SCRIPT);
    }
}

<QSTRING> {
    {SingleQuoteEscapeSequences}    { }
    \'    { yypop(); }
}

<HEREDOC, NOWDOC>^{Identifier} ";"? {EOL}    {
    // Trim the line terminator and an optional ';' to isolate the label,
    // then close the block only if it is the label that opened it.
    int i = yylength() - 1;
    while (yycharat(i) == '\n' || yycharat(i) == '\r') { i--; }
    if (yycharat(i) == ';') { i--; }
    if (!this.docLabels.empty()
            && yytext().substring(0, i + 1).equals(this.docLabels.peek())) {
        this.docLabels.pop();
        yypop();
    }
}

<STRING, QSTRING, BACKQUOTE, HEREDOC, NOWDOC>{WhspChar}* {EOL}    { }

<STRINGVAR> {
    {Identifier}    {
        onSymbolMatched(yytext(), yychar);
        return yystate();
    }

    \[ {Number} \]    {
        yypop(); //because "$arr[0][1]" is the same as $arr[0] . "[1]"
    }

    \[ {Identifier} \]    {
        //then the identifier is actually a string!
        yypop();
    }

    \[ "$" {Identifier} \]    {
        onSymbolMatched(yytext().substring(2, yylength() - 1), yychar + 2);
        yypop();
        return yystate();
    }

    "->" {Identifier}    {
        onSymbolMatched(yytext().substring(2), yychar + 2);
        yypop(); //because "$arr->a[0]" is the same as $arr->a . "[0]"
        return yystate();
    }

    [^]    { yypushback(1); yypop(); }
}

<STRINGEXPR> {
    {Identifier}    {
        onSymbolMatched(yytext(), yychar);
        return yystate();
    }
    \}    { yypop(); }
    /* don't push. when we find '}' and we pop we want to go to
     * STRING/HEREDOC, not back to STRINGEXPR */
    \[    { yybegin(IN_SCRIPT); }
}

<SCOMMENT> {
    {ClosingTag}    {
        while (!isHtmlState(yystate())) {
            yypop();
        }
    }
    {WhspChar}* {EOL}    {
        yypop();
    }
}

<DOCCOMMENT> {
    /* change relatively to xref -- we also consume the whitespace after */
    {DocPreviousChar} "@" {DocParamWithType} {WhspChar}+    {
        yybegin(DOCCOM_TYPE);
    }

    {DocPreviousChar} "@" {DocParamWithTypeAndName} {WhspChar}+    {
        yybegin(DOCCOM_TYPE_THEN_NAME);
    }

    {DocPreviousChar} "@" {DocParamWithName} {WhspChar}+    {
        yybegin(DOCCOM_NAME);
    }
}

<DOCCOM_TYPE_THEN_NAME, DOCCOM_TYPE> {
    /* The rules here had to be substantially changed because we cannot find
     * several symbols in one match. This is substantially more lax than
     * the xref rules */

    [\[\]\|\(\)]    { }

    {WhspChar}+    {
        yybegin(yystate() == DOCCOM_TYPE_THEN_NAME ? DOCCOM_NAME : DOCCOMMENT);
    }

    {Identifier}    {
        if (!PSEUDO_TYPES.contains(yytext().toLowerCase(Locale.ROOT))) {
            onSymbolMatched(yytext(), yychar);
            return yystate();
        }
    }

    [^]    { yybegin(DOCCOMMENT); yypushback(1); }
}

<DOCCOM_NAME> {
    "$" {Identifier}    {
        onSymbolMatched(yytext().substring(1), yychar + 1);
        yybegin(DOCCOMMENT);
        return yystate();
    }

    [^]    { yybegin(DOCCOMMENT); yypushback(1); }
}

<COMMENT, DOCCOMMENT> {
    {WhspChar}* {EOL}    { }
    "*/"    { yypop(); }
}

<YYINITIAL, IN_SCRIPT, STRING, QSTRING, BACKQUOTE, HEREDOC, NOWDOC, SCOMMENT, COMMENT, DOCCOMMENT, STRINGEXPR, STRINGVAR> {
    {WhspChar}* {EOL}    { }
    [^\n]    { }
}

<YYINITIAL, SCOMMENT, COMMENT, DOCCOMMENT, STRING, QSTRING, BACKQUOTE, HEREDOC, NOWDOC> {
    {FPath}    { }

    {File}    { }

    {BrowseableURI}    { }

    {FNameChar}+ "@" {FNameChar}+ "." {FNameChar}+
        { }
}