/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, 2016, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2017, 2019, Chris Fraire .
 */

/*
 * Regex productions shared between AdaXref and AdaSymbolTokenizer
 *
 * The macros FNameChar, WhspChar, EOL, FPath and BrowseableURI that are used
 * below are not defined in this file; they must be supplied by the including
 * specification.
 */

/*
 * Identifiers syntax
 * 2.3-1: Identifiers are used as names.
 * 2.3-2/2: identifier ::=
 *     identifier_start {identifier_start | identifier_extend}
 */
Identifier = {Identifier_start} ({Identifier_start} | {Identifier_extend})*
/*
 * 2.3-3/2: identifier_start ::= letter_uppercase | letter_lowercase |
 *     letter_titlecase | letter_modifier | letter_other | number_letter
 */
Identifier_start = [\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]
/*
 * 2.3-3.1/2: identifier_extend ::= mark_non_spacing | mark_spacing_combining |
 *     number_decimal | punctuation_connector | other_format
 */
Identifier_extend = [\p{Mn}\p{Mc}\p{Nd}\p{Pc}\p{Cf}]
/*
 * 2.3-4/2 reads "After eliminating the characters in category other_format, an
 * identifier shall not contain two consecutive characters in category
 * punctuation_connector, or end with a character in that category," but that
 * is not enforceable in jflex regexes, as its syntax does not allow negative
 * look-behind assertions.
 */

/*
 * 2.4-1: There are two kinds of numeric_literals, real literals and integer
 * literals. A real literal is a numeric_literal that includes a point; an
 * integer literal is a numeric_literal without a point.
 *
 * 2.4-2: numeric_literal ::= decimal_literal | based_literal
 *
 * NONCONFORM_NUMBER is an addition to the grammar; see its definition below.
 */
Numeric_literal = ({NONCONFORM_NUMBER} | {Decimal_literal} | {Based_literal})
/*
 * 2.4.1-1: A decimal_literal is a numeric_literal in the conventional decimal
 * notation (that is, the base is ten).
 *
 * 2.4.1-2: decimal_literal ::= numeral [.numeral] [exponent]
 */
Decimal_literal = {Numeral} ([\.]{Numeral})? {Exponent}?
/*
 * 2.4.1-3: numeral ::= digit {[underline] digit}
 */
Numeral = {Digit} ([_]? {Digit})*
/*
 * 2.4.1-4: exponent ::= E [+] numeral | E - numeral
 *
 * NOTE(review): only an upper-case "E" is listed here (and only upper-case
 * A-F in Extended_digit below); presumably the including specification is
 * case-insensitive (%ignorecase) -- confirm.
 */
Exponent = [E] [\+\-]? {Numeral}
/*
 * 2.4.1-4.1/2: digit ::= 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
 */
Digit = [0-9]
/*
 * 2.4.1-5 reads "An exponent for an integer literal shall not have a minus
 * sign," but that rule is not distinguished here in regex.
 */

/*
 * 2.4.2-1: A based_literal is a numeric_literal expressed in a form that
 * specifies the base explicitly.
 *
 * 2.4.2-2: based_literal ::= base # based_numeral [.based_numeral] # [exponent]
 */
Based_literal = {Base}[#]{Based_numeral} ([\.]{Based_numeral})? [#]{Exponent}?
/*
 * 2.4.2-3: base ::= numeral
 * 2.4.2-6: The base (the numeric value of the decimal numeral preceding the
 * first #) shall be at least two and at most sixteen.
 */
Base = ([2-9] | [1][0-6])
/*
 * 2.4.2-4: based_numeral ::= extended_digit {[underline] extended_digit}
 */
Based_numeral = {Extended_digit} ([_]? {Extended_digit})*
/*
 * 2.4.2-5: extended_digit ::= digit | A | B | C | D | E | F
 */
Extended_digit = [0-9A-F]

/*
 * This is unconventional numeric syntax seen in large open-source Ada projects:
 * a run of extended digits with an optional "0x" prefix, or a decimal number
 * with an "f" suffix.
 */
NONCONFORM_NUMBER = ("0x"? {Extended_digit}+ | {Numeral} ([\.]{Numeral})?[f])

/*
 * 2.5-1: A character_literal is formed by enclosing a graphic character
 * between two apostrophe characters.
 *
 * 2.5-2: character_literal ::= 'graphic_character'
 *
 * The regex accepts any single character between the apostrophes.
 */
Character_literal = ['] [^] [']

/*
 * 2.6-1: A string_literal is formed by a sequence of graphic characters
 * (possibly none) enclosed between two quotation marks used as string
 * brackets.
 * 2.6-2: string_literal ::= "{string_element}"
 * 2.6-3: string_element ::= "" | non_quotation_mark_graphic_character
 * 2.6-4: A string_element is either a pair of quotation marks (""), or a
 * single graphic_character other than a quotation mark.
 */
String_literal = [\"] ([\"][\"] | [^\"])* [\"]

/*
 * 2.7-2: comment ::= --{non_end_of_line_character}
 *
 * Only the "--" introducer is matched here; the remainder of the comment is
 * consumed by the rules of the SCOMMENT state.
 */
Comment_token = "--"

/*
 * File names recognized inside comments: a letter, any number of FNameChar,
 * a dot, and one of the extensions ada, adb, ads, diff or patch (in any case).
 */
FileExt = ([Aa][Dd][AaBbSs] | [Dd][Ii][Ff][Ff] | [Pp][Aa][Tt][Cc][Hh])
File = [a-zA-Z]{FNameChar}* "." {FileExt}

/*
 * SCOMMENT : the scanner is inside a "--" comment
 */
%state SCOMMENT

%%
<YYINITIAL> {
    /*
     * Offer the identifier as a symbol; hand control back to the caller with
     * the current lexical state when both offerSymbol() and returnOnSymbol()
     * return true.
     */
    {Identifier}    {
        chkLOC();
        if (offerSymbol(yytext(), 0, false) && returnOnSymbol()) {
            return yystate();
        }
    }

    {Character_literal}    {
        chkLOC();
        takeLiteral(yytext(), HtmlConsts.STRING_CLASS);
    }

    {Numeric_literal}    {
        chkLOC();
        takeLiteral(yytext(), HtmlConsts.NUMBER_CLASS);
    }

    {String_literal}    {
        chkLOC();
        takeLiteral(yytext(), HtmlConsts.STRING_CLASS);
    }

    /*
     * Enter the comment state, announce the comment span starting at the
     * position of the "--", and then emit the "--" itself.
     */
    {Comment_token}    {
        yypush(SCOMMENT);
        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
        offer(yytext());
    }
}

<SCOMMENT> {
    /*
     * A comment ends at the end of the line. The whole match is pushed back
     * so that it is scanned again after the comment state has been popped,
     * and the span is reset with a null class.
     */
    {WhspChar}*{EOL}    {
        yypushback(yylength());
        yypop();
        onDisjointSpanChanged(null, yychar);
    }
}

<YYINITIAL, SCOMMENT> {
    {WhspChar}*{EOL}    {
        onEndOfLineMatched(yytext(), yychar);
    }
    // Only one whitespace char at a time
    [[\s]--[\n\r]]    {
        offer(yytext());
    }
    // Only one character at a time because of \s restriction above.
    [^\n\r]    {
        chkLOC();
        offer(yytext());
    }
}

// "comment links"
<SCOMMENT> {
    {FPath}    {
        if (takeAllContent()) {
            onPathlikeMatched(yytext(), '/', false, yychar);
        }
    }

    {File}    {
        if (takeAllContent()) {
            onFilelikeMatched(yytext(), yychar);
        }
    }

    {BrowseableURI}    {
        if (takeAllContent()) {
            onUriMatched(yytext(), yychar);
        }
    }

    {FNameChar}+ "@" {FNameChar}+ "." {FNameChar}+    {
        if (takeAllContent()) {
            onEmailAddressMatched(yytext(), yychar);
        }
    }
}