analysis/ada/AdaProductions.lexh

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, 2016, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2017, 2019, Chris Fraire <cfraire@me.com>.
 */

/*
 * Regex productions shared between AdaXref and AdaSymbolTokenizer
 */

/*
 * Identifiers syntax
 * 2.3-1: Identifiers are used as names.
 * 2.3-2/2: identifier ::=
 *     identifier_start {identifier_start | identifier_extend}
 */
/*
 * One Identifier_start character followed by zero or more characters that
 * are each either an Identifier_start or an Identifier_extend.
 */
Identifier = {Identifier_start} ({Identifier_start} | {Identifier_extend})*
/*
 * 2.3-3/2: identifier_start ::= letter_uppercase | letter_lowercase |
 *     letter_titlecase | letter_modifier | letter_other | number_letter
 *
 * Expressed below with the Unicode general categories Lu, Ll, Lt, Lm, Lo and
 * Nl, listed in the same order as the grammar's alternatives.
 */
Identifier_start = [\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]
/*
 * 2.3-3.1/2: identifier_extend ::= mark_non_spacing | mark_spacing_combining |
 *     number_decimal | punctuation_connector | other_format
 *
 * Expressed below with the Unicode general categories Mn, Mc, Nd, Pc and Cf,
 * listed in the same order as the grammar's alternatives.
 */
Identifier_extend = [\p{Mn}\p{Mc}\p{Nd}\p{Pc}\p{Cf}]
/*
 * 2.3-4/2 reads "After eliminating the characters in category other_format, an
 * identifier shall not contain two consecutive characters in category
 * punctuation_connector, or end with a character in that category," but that
 * is not enforceable in jflex regexes, as its syntax does not allow negative
 * look-behind assertions.
 */

/*
 * 2.4-1: There are two kinds of numeric_literals, real literals and integer
 * literals. A real literal is a numeric_literal that includes a point; an
 * integer literal is a numeric_literal without a point.
 *
 * 2.4-2: numeric_literal ::= decimal_literal | based_literal
 */
/*
 * In addition to the two forms from the grammar above, the non-standard
 * NONCONFORM_NUMBER form (defined at the end of this group) is accepted.
 */
Numeric_literal = ({NONCONFORM_NUMBER} | {Decimal_literal} | {Based_literal})
/*
 * 2.4.1-1: A decimal_literal is a numeric_literal in the conventional decimal
 * notation (that is, the base is ten).
 *
 * 2.4.1-2: decimal_literal ::= numeral [.numeral] [exponent]
 */
Decimal_literal = {Numeral} ([\.]{Numeral})? {Exponent}?
/*
 * 2.4.1-3: numeral ::= digit {[underline] digit}
 */
Numeral = {Digit} ([_]? {Digit})*
/*
 * 2.4.1-4: exponent ::= E [+] numeral | E – numeral
 *
 * NOTE(review): only an upper-case E is listed in the pattern. Whether a
 * lower-case e is matched depends on the case-sensitivity option of the
 * scanner specification that includes this file, which is not visible
 * here -- confirm.
 */
Exponent = [E] [\+\-]? {Numeral}
/*
 * 2.4.1-4.1/2: digit ::= 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
 */
Digit = [0-9]
/*
 * 2.4.1-5 reads "An exponent for an integer literal shall not have a minus
 * sign," but that rule is not distinguished here in regex.
 */
/*
 * 2.4.2-1: A based_literal is a numeric_literal expressed in a form that
 * specifies the base explicitly.
 *
 * 2.4.2-2: based_literal ::= base # based_numeral [.based_numeral] # [exponent]
 */
Based_literal = {Base}[#]{Based_numeral} ([\.]{Based_numeral})? [#]{Exponent}?
/*
 * 2.4.2-3: base ::= numeral
 * 2.4.2-6: The base (the numeric value of the decimal numeral preceding the
 *     first #) shall be at least two and at most sixteen.
 *
 * The pattern accepts exactly the spellings 2 through 9 and 10 through 16.
 * That is narrower than numeral: a base written with a leading zero or an
 * underline is not matched.
 */
Base = ([2-9] | [1][0-6])
/*
 * 2.4.2-4: based_numeral ::= extended_digit {[underline] extended_digit}
 */
Based_numeral = {Extended_digit} ([_]? {Extended_digit})*
/*
 * 2.4.2-5: extended_digit ::= digit | A | B | C | D | E | F
 *
 * NOTE(review): only upper-case A-F are listed in the pattern. Whether
 * lower-case a-f are matched depends on the case-sensitivity option of the
 * scanner specification that includes this file, which is not visible
 * here -- confirm.
 */
Extended_digit = [0-9A-F]
/*
 * This is unconventional numeric syntax seen in large open-source Ada projects
 *
 * Two shapes are accepted:
 *   1. an optional "0x" prefix followed by one or more extended digits;
 *   2. a numeral with an optional fractional part, followed by the suffix f.
 */
NONCONFORM_NUMBER = ("0x"? {Extended_digit}+ | {Numeral} ([\.]{Numeral})?[f])

/*
 * 2.5-1: A character_literal is formed by enclosing a graphic character
 * between two apostrophe characters.
 *
 * 2.5-2: character_literal ::= 'graphic_character'
 */
/*
 * The middle element [^] is the JFlex class that matches any single
 * character, so the pattern is broader than graphic_character: whatever one
 * character sits between the two apostrophes is accepted.
 */
Character_literal = ['] [^] [']

/*
 * 2.6-1: A string_literal is formed by a sequence of graphic characters
 * (possibly none) enclosed between two quotation marks used as string
 * brackets.
 * 2.6-2: string_literal ::= "{string_element}"
 * 2.6-3: string_element ::= "" | non_quotation_mark_graphic_character
 * 2.6-4: A string_element is either a pair of quotation marks (""), or a
 * single graphic_character other than a quotation mark.
 */
/*
 * Between the opening and closing quotation marks, each element is either a
 * doubled quotation mark or any single character that is not a quotation
 * mark. The negated class does not exclude line terminators, so a match can
 * extend across lines until a closing quotation mark is found.
 */
String_literal = [\"] ([\"][\"] | [^\"])* [\"]

/*
 * 2.7-2: comment ::= --{non_end_of_line_character}
 */
/*
 * Only the two-hyphen introducer is matched by this macro. The rest of the
 * comment line is consumed by the rules that are active in the SCOMMENT
 * state, which the Comment_token rule pushes.
 */
Comment_token = "--"

/*
 * File-name extensions recognized for file-like tokens: "ada", "adb", "ads",
 * "diff" and "patch". Every letter is written as a two-case character class,
 * so any mixture of upper and lower case is accepted.
 */
FileExt = ([Aa][Dd][AaBbSs] | [Dd][Ii][Ff][Ff] | [Pp][Aa][Tt][Cc][Hh])
/*
 * A file-like token: one ASCII letter, zero or more FNameChar characters, a
 * literal dot, and one of the extensions above. FNameChar is not defined in
 * this file; presumably it comes from a shared include -- verify.
 */
File = [a-zA-Z]{FNameChar}* "." {FileExt}

/*
 * Lexical state used while scanning the remainder of a "--" comment. It is
 * pushed by the Comment_token rule and popped by the SCOMMENT end-of-line
 * rule below.
 */
%state SCOMMENT

%%
<YYINITIAL> {
    // An identifier is offered as a symbol. Scanning is interrupted (the
    // current lexical state is returned) only when the offer succeeded and
    // returnOnSymbol() also answers true.
    {Identifier}    {
        chkLOC();
        final boolean accepted = offerSymbol(yytext(), 0, false);
        if (accepted && returnOnSymbol()) {
            return yystate();
        }
    }

    // Character literals are emitted with the string style class.
    {Character_literal}    {
        chkLOC();
        final String charLiteral = yytext();
        takeLiteral(charLiteral, HtmlConsts.STRING_CLASS);
    }

    // Numeric literals are emitted with the number style class.
    {Numeric_literal}    {
        chkLOC();
        final String number = yytext();
        takeLiteral(number, HtmlConsts.NUMBER_CLASS);
    }

    // String literals are emitted with the string style class.
    {String_literal}    {
        chkLOC();
        final String quoted = yytext();
        takeLiteral(quoted, HtmlConsts.STRING_CLASS);
    }

    // The comment introducer: switch to SCOMMENT, open a comment span at the
    // start of the match, then emit the introducer text itself.
    {Comment_token}    {
        yypush(SCOMMENT);
        onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
        final String introducer = yytext();
        offer(introducer);
    }
}

<SCOMMENT> {
    // Optional whitespace followed by a line terminator ends the comment.
    {WhspChar}*{EOL}    {
        // Hand the whole match back to the input; yylength() is the length
        // of the matched text, i.e. the same value as yytext().length().
        yypushback(yylength());
        // Leave SCOMMENT (presumably restoring the state that was active
        // when the comment was opened) and close the comment span.
        yypop();
        onDisjointSpanChanged(null, yychar);
    }
}

<YYINITIAL> {
    // Optional whitespace followed by a line terminator: report the matched
    // text together with the offset at which the match starts.
    {WhspChar}*{EOL}    {
        final String lineEnd = yytext();
        onEndOfLineMatched(lineEnd, yychar);
    }
}

<YYINITIAL, SCOMMENT> {
    // Exactly one whitespace character that is not a line terminator; it is
    // emitted without touching the lines-of-code check.
    [[\s]--[\n\r]]    {
        final String blank = yytext();
        offer(blank);
    }
    // Exactly one character of any other kind (never a line terminator).
    // Matching is one character at a time because the whitespace rule above
    // is also limited to a single character.
    [^\n\r]    {
        chkLOC();
        final String single = yytext();
        offer(single);
    }
}

// "comment links"
<SCOMMENT> {
    // Path-like text, reported with '/' as its separator.
    {FPath}    {
        if (takeAllContent()) {
            final String pathText = yytext();
            onPathlikeMatched(pathText, '/', false, yychar);
        }
    }

    // File-like text (a name ending in one of the FileExt extensions).
    {File}    {
        if (takeAllContent()) {
            onFilelikeMatched(yytext(), yychar);
        }
    }

    // Browseable URIs.
    {BrowseableURI}    {
        if (takeAllContent()) {
            final String uri = yytext();
            onUriMatched(uri, yychar);
        }
    }

    // Text shaped like an e-mail address: name characters, "@", name
    // characters, a dot, and more name characters.
    {FNameChar}+ "@" {FNameChar}+ "." {FNameChar}+    {
        if (takeAllContent()) {
            final String address = yytext();
            onEmailAddressMatched(address, yychar);
        }
    }
}