# Copyright (c) 2022 Nik Silver # # This source code is released for free distribution under the terms of the # GNU General Public License version 2 or later. # # Thanks to: # - Mark Skipper, for the original Elm optlib parser, which inspired this; # - Samuel Stauffer, for the Thrift PEG parser, which showed me how to # write a PEG parser; # - Jan DolinĂ¡r, for the Kotlin PEG parser, which also provided insight; # - Masatake YAMATO, for patience and guidance in code reviews. # # This parser generates tags for Elm. See https://elm-lang.org/docs/syntax # for language reference. # # The parser will tag items reliably at the top level. Functions # defined in let/in blocks are also tagged, but with limitations. See below. # # Kinds # - m module # - n namespace (ie a module that's renamed) # - t type # - c constructor (within a type) # - a alias # - p port # - f function # # Key/value pairs # - roles:def This is defined here. # - roles:imported This is imported here. # - type: This constructor is in the scope of type , which # may be dotted. Eg Main.myType. # - function: This function is in the scope of function , which # may be dotted. Eg Main.myFunc. # - module: This is in the scope of module . # - typeref:description: This function, constructor or port # has type . # - moduleName: This namespace has original module name . # # Functions defined in let/in blocks may be tagged, with these limitations: # - the LHS (up to and including the '=') need to be on a single line; # - the LHS can only have simple parameters; # - their scope is only marked as being in the top-most function; # - any type annotation is ignored. # This should be good for 90% of inner functions. To make it totally robust # is much more complicated due to (a) Elm's clever indentation-sensitivity # and (b) limitations of the PEG parser used here. # # To do: # Maybe do: # - let/in blocks # - Allow tuples on the LHS. Eg '(val1, val2) = valFunc'. # - Inner functions' type annotations are used in the function's # type description. # - Inner functions can have more complex parameters. # - Functions # - Allow non-Latin upper and lower case. Use # https://util.unicode.org/UnicodeJsps/properties.html # combined with \p{Lu}, \p{Ll} and \p{L}. # # Won't do: # - Handle Elm's indentation properly. %prefix "pelm" %auxil "struct parserCtx *" %earlysource { #include "general.h" } %header { struct parserCtx; } %source { #include "elm_pre.h" #include "routines.h" /* * Include these lines to debug the parsing. * From https://github.com/arithy/packcc#macros * This will output parsing info to STDERR.tmp in the vent of a failed test. */ /* static const char *dbg_str[] = { "Evaluating rule", "Matched rule", "Abandoning rule" }; #define PCC_DEBUG(auxil, event, rule, level, pos, buffer, length) \ fprintf(stderr, "%*s%s %s @%zu [%.*s]\n", \ (int)((level) * 2), "", dbg_str[event], rule, pos, (int)(length), buffer) */ } # Top level elements ----------------------------------------------------- # We separate the file into the module section and the main section # so that we only consider and tag one module declaration file <- { ELM_INIT_MODULE_SCOPE; } TLSS? moduleDeclaration? TLSS? mainTopLevelStatements? TLSS? EOF mainTopLevelStatements <- topLevelStatement (TLSS topLevelStatement)* topLevelStatement <- importStatement / typeAlias / customType / portDeclaration / functionWithTypeAnnotation / functionDefinition / ignoreRestOfStatement # Main Elm grammar ------------------------------------------------------- # Module declaration # # We can be a bit relaxed about distinguishing functions, types and # constructors listed in a module declaration, because we're not going # to tag them. moduleDeclaration <- ('port' _1_)? 'module' _1_ _1_ 'exposing' _0_ '(' exposedList ')' EOS { elm_module_scope_index = makeElmTagSettingScope(auxil, $1, $1s, K_MODULE, ROLE_DEFINITION_INDEX); } exposedList <- _0_ exposedItem _0_ (',' _0_ exposedList )* exposedItem <- exposedFieldOrType / exposedFunction / exposedItemIgnored exposedFieldOrType <- (_0_ '(' _0_ exposedTypeConstructorList _0_ ')')? exposedFunction <- lowerStartIdentifier exposedItemIgnored <- '.'+ exposedTypeConstructorList <- (upperStartIdentifier / exposedItemIgnored) _0_ (',' _0_ exposedTypeConstructorList)* # Type alias # # We don't care what the actual alias is typeAlias <- 'type' _1_ 'alias' _1_ _0_ '=' _0_ ignoreRestOfStatement { makeElmTag(auxil, $1, $1s, K_ALIAS, ROLE_DEFINITION_INDEX); } # Custom type # # Includes type parameters, such as 'x' in 'type MyType x = Wrap x'. # # In a definition such as 'type MyType = Cons1 String Int' we # capture 'MyType', and then for each type in each constructor # subtype (here, 'String' and 'Int') we append a '->' and finally # concatentate them all to get the constructor's type description, # such as 'String -> Int -> MyType' customType <- 'type' _1_ (_0_ typeParameterList)? _0_ '=' _0_ { initElmConstructorFields(auxil, $1); makeElmTagSettingScope(auxil, $1, $1s, K_TYPE, ROLE_DEFINITION_INDEX); } constructorList EOS { POP_SCOPE(auxil); tidyElmConstructorFields(auxil); } typeParameterList <- lowerStartIdentifier (_1_ lowerStartIdentifier)* # A type could be defined as a constructor list: # type A = Cons1 String | Cons2 Float Float | ... # The 'String' and the 'Float Float' etc are the constructor subtypes. # Each 'String', 'Float', etc is a single type spec. # But a single type spec could also be a record, a tuple or a function spec. # # Subtypes in constructors need to be parsed differently from types in # type annotations and record fields. Consider these: # type A1Type a b = A1Cons a b -- Line 1 # type A2Type a b = A2Cons String a b -- Line 2 # type BType a b = BCons { x : A2Type a b} -- Line 3 # cFunc : A1Type String Int -> String -- Line 4 # In line 1, 'a b' must be parsed as two individual types (parameterised). # In line 2, 'String a b' must be parsed as three individual types. # In line 3, 'A2Type a b' must be parsed as one type, even though it's # lexically equivalent to 'String a b' on line 2. # In line 4, 'A1Type String Int' must also be parsed one type. # This means we have to have slightly different rules for parsing a # constructor's subtypes as from other cases. The first case is handled # by constructorSubtypeList and singleConstructorSubtypeSpec. The second # case is handled by singleTypeSpec. constructorList <- { initElmConstructorSubtypeFields(auxil); } _0_ ? { int r = makeElmTag(auxil, $1, $1s, K_CONSTRUCTOR, ROLE_DEFINITION_INDEX); addElmConstructorTypeRef(auxil, r); } _0_ ('|' _0_ constructorList)? constructorSubtypeList <- singleConstructorSubtypeSpec (_0_ singleConstructorSubtypeSpec)* singleConstructorSubtypeSpec <- < recordTypeSpec / tupleTypeSpec / functionTypeSpec / dottedIdentifier > { addElmConstructorSubtype(auxil, $1); } singleTypeSpec <- recordTypeSpec / tupleTypeSpec / functionTypeSpec / parameterisedTypeSpec recordTypeSpec <- '{' (_0_ recordRestrictionPrefix)? _0_ fieldSpec (_0_ ',' _0_ fieldSpec)* _0_ '}' / '{' (_0_ recordRestrictionPrefix)? _0_ '}' recordRestrictionPrefix <- lowerStartIdentifier _0_ '|' fieldSpec <- lowerStartIdentifier _0_ ':' _0_ singleTypeSpec tupleTypeSpec <- '(' _0_ singleTypeSpec (_0_ ',' _0_ singleTypeSpec)* _0_ ')' / '(' _0_ ')' parameterisedTypeSpec <- dottedIdentifier (_1_ (singleTypeSpec / lowerStartIdentifier))* functionTypeSpec <- singleTypeSpec (_0_ '->' _0_ singleTypeSpec)+ # Port declaration portDeclaration <- 'port' _1_ _0_ ':' _0_ EOS { int r = makeElmTag(auxil, $1, $1s, K_PORT, ROLE_DEFINITION_INDEX); addElmTypeRef(r, $2); } # Import statement # # For the import statement we don't want the imported items to appear in the # scope of the current module (ie this file), otherwise they'll be named # wrongly. So we # want to save the module scope, make the imported tags, # then restore the module scope. We do this in two separate C code blocks, # because the module scope needs to be saved before any of the imported tags # are made. # # Also, if we create a namespace then that *does* live in the scope of the # current module, so we'll make that tag (if needed) before saving the # module scope. importStatement <- 'import' _1_ (_1_ 'as' _1_ )? { // Make the namespace tag first, as it's in the file module's scope if ($2s > 0) { int r = makeElmTag(auxil, $2, $2s, K_NAMESPACE, ROLE_DEFINITION_INDEX); attachParserFieldToCorkEntry (r, ElmFields[F_MODULENAME].ftype, $1); } // Now make the tag for the imported module, as it lives outside // the scope of the file module ELM_SAVE_MODULE_SCOPE; makeElmTagSettingScope(auxil, $1, $1s, K_MODULE, ELM_MODULE_IMPORTED); } (_1_ 'exposing' _0_ '(' _0_ importedList _0_ ')')? EOS { ELM_RESTORE_MODULE_SCOPE; } importedList <- importedItem _0_ (',' _0_ importedList)* importedItem <- importedFunction / importedType / importedItemIgnored importedFunction <- { makeElmTag(auxil, $1, $1s, K_FUNCTION, ELM_FUNCTION_EXPOSED); } # When importing a type and constructors we want the constructors # to be in the scope of the type. So we have to set the scope as the # type first, before parsing (and making the tags for) the constructors. # That's why the code here uses two separate C code blocks. importedType <- { makeElmTagSettingScope(auxil, $1, $1s, K_TYPE, ELM_TYPE_EXPOSED); } (_0_ '(' _0_ importedTypeConstructorList _0_ ')')? { // We're done with the type and its constructors, so we can pop it POP_SCOPE(auxil); } importedItemIgnored <- '.'+ importedTypeConstructorList <- (importedTypeConstructor / importedItemIgnored) _0_ (',' _0_ importedTypeConstructorList)* importedTypeConstructor <- { makeElmTag(auxil, $1, $1s, K_CONSTRUCTOR, ELM_CONSTRUCTOR_EXPOSED); } # Function with a type annotation. # # The type is on one line, and the function must follow immediately as # the next top level statement functionWithTypeAnnotation <- _0_ ':' _0_ TLSS <$1> _1_ ? { int r = makeElmTagSettingScope(auxil, $3, $3s, K_FUNCTION, ROLE_DEFINITION_INDEX); addElmTypeRef(r, $2); addElmSignature(r, $4); } _0_ '=' _0_ expression EOS { POP_SCOPE(auxil); } typeAnnotation <- singleTypeSpec (_0_ '->' _0_ singleTypeSpec)* # Function without a type annotation functionDefinition <- _0_ ? { int r = makeElmTagSettingScope(auxil, $1, $1s, K_FUNCTION, ROLE_DEFINITION_INDEX); addElmSignature(r, $2); } _0_ '=' _0_ expression EOS { POP_SCOPE(auxil); } # A function parameter list is what we define a function with. It's the # x y z in 'fn x y z'. But of course they can be more complex, such as # 'fn (Cons a b) ({ thing } as otherThing))' etc. functionParameterList <- functionParameter (_0_ functionParameter)* functionParameter <- plainFunctionParameter / tupleFunctionParameter / recordFunctionParameter / constructorFunctionParameter plainFunctionParameter <- lowerStartIdentifier (_0_ asClause)? tupleFunctionParameter <- '(' _0_ functionParameter (_0_ ',' _0_ functionParameter)* _0_ ')' (_0_ asClause)? recordFunctionParameter <- '{' _0_ lowerStartIdentifier (_0_ ',' _0_ lowerStartIdentifier)* _0_ '}' (_0_ asClause)? constructorFunctionParameter <- upperStartIdentifier (_0_ functionParameter)* (_0_ asClause)? asClause <- 'as' _1_ lowerStartIdentifier # Expressions expression <- (letInBlock _NL_IND_)? simpleExpression (_0_ binaryOperator _0_ expression)* simpleExpression <- hexNumber / decimal / multilineString / characterLiteral / oneLineString / tupleExpression / listExpression / recordExpression / caseStatement / ifThenElseStatement / anonymousFunction / functionCall tupleExpression <- '(' _0_ expression (_0_ ',' _0_ expression)* _0_ ')' / '(' _0_ ')' listExpression <- '[' _0_ expression (_0_ ',' _0_ expression)* _0_ ']' / '[' _0_ ']' recordExpression <- '{' _0_ (lowerStartIdentifier _0_ '|' _0_)? recordExpressionAssignment (_0_ ',' _0_ recordExpressionAssignment)* _0_ '}' / '{' _0_ '}' recordExpressionAssignment <- lowerStartIdentifier _0_ '=' _0_ expression anonymousFunction <- '\\' _0_ functionParameterList _0_ '->' _0_ expression functionCall <- ( dottedIdentifier / '.' lowerStartIdentifier / '(' binaryOperator ')' ) (_1_ expression)* # Let/in block # # We'll treat let/in blocks very simply - we'll consider each line # and expect the whole line either to be the start of a function # definition (perhaps with some of its body) or its body. So something # like 'f x y =' will have to be on one line. letInBlock <- 'let' _NL_IND_ letInLine (_NL_IND_ letInLine)* _NL_IND_ 'in' letInLine <- letInFunctionDefinition / letInBlock / letInFunctionBody letInFunctionDefinition <- WS* ? WS* '=' Non_NL* { int r = makeElmTag(auxil, $1, $1s, K_FUNCTION, ROLE_DEFINITION_INDEX); addElmSignature(r, $2); } letInFunctionParameters <- nonKeywordIdentifier (WS+ nonKeywordIdentifier)* letInFunctionBody <- !('let' / 'in') Non_NL+ # Case statements # # We're going to be pretty loose with case statements, otherwise we'd # have to follow Elm's indentation rules. So we'll just say # the body of a case statement is a series of patterns like this: # -> . The might well swallow # up a bit of the next case pattern (because to do otherwise requires # following Elm's indentation rules), so that's why we just specify # . caseStatement <- 'case' _1_ expression _0_ 'of' _1_ caseClauseList caseClauseList <- caseClause (_1_ caseClause)* caseClause <- roughCasePatternChar* '->' _0_ expression roughCasePatternChar <- !('->' / TLSS / lineComment / delimitedComment / NL) . # If/then/else statements ifThenElseStatement <- 'if' _1_ expression _1_ 'then' _1_ expression _1_ 'else' _1_ expression # Binary operators binaryOperator <- '>>' / '<<' / '|>' / '<|' / '//' / '++' / '::' / '==' / '/=' / '&&' / '||' / '<=' / '>=' / '<' / '>' / '+' / '-' / '*' / '/' / '^' # Sometimes we just need to ignore the rest of the (top level) statement ignoreRestOfStatement <- (multilineString / Non_WS_or_NL+) (_1_ ignoreRestOfStatement)* multilineString <- '"""' (!'"""' .)* '"""' # Low level tokens ------------------------------------------------------- # Identifiers naiveIdentifier <- [A-Za-z_] alphanumeric* upperStartIdentifier <- [A-Z] alphanumeric* lowerStartIdentifier <- !keyword [a-z_] alphanumeric* alphanumeric <- [A-Za-z0-9_] nonKeywordIdentifier <- !keyword naiveIdentifier keyword <- 'type' !alphanumeric / 'module' !alphanumeric / 'port' !alphanumeric / 'alias' !alphanumeric / 'as' !alphanumeric / 'exposing' !alphanumeric / 'import' !alphanumeric / 'let' !alphanumeric / 'in' !alphanumeric / 'case' !alphanumeric / 'of' !alphanumeric / 'if' !alphanumeric / 'then' !alphanumeric / 'else' !alphanumeric dottedIdentifier <- nonKeywordIdentifier ('.' nonKeywordIdentifier)* # Numbers decimal <- exponentialDecimal / simpleDecimal exponentialDecimal <- simpleDecimal 'e' simpleInteger simpleDecimal <- simpleInteger ('.' digits)? / '.' digits+ simpleInteger <- [-+]? digits digits <- [0-9]+ hexNumber <- '0x' [0-9A-Fa-f]+ # One line strings and characters oneLineString <- '"' inStringChar* '"' characterLiteral <- "'" inStringChar "'" inStringChar <- !('"' / NL) ( inStringUnicodeChar / inStringEscapedChar / inStringPlainChar ) inStringPlainChar <- !('"' / '\\' / NL) . inStringEscapedChar <- '\\' !('u' / NL) . inStringUnicodeChar <- '\\u{' [0-9A-Fa-f]+ '}' # Ignorable things ------------------------------------------------------- # Simple things... WS <- [ \t]+ NL <- '\n' / '\f' / '\r' '\n'? Non_NL <- [^\n\r\f] Non_WS_or_NL <- [^ \t\n\r\f] EOF <- !. # A delimited comment is effectively "nothing", even if it spans several # lines. But it does separate two tokens. # # A line comment can only come at the end of a line. Notice here it doesn't # include the actual newline. delimitedComment <- '{-' (delimitedComment / !'-}' .)* '-}' lineComment <- '--' Non_NL* # Elm whitespacing is a bit special... # - Two statements are at the same level (eg at the top level, or statements # in the same let...in block) only if they begin with the same indentation. # - One line has more indentation than the previous line then it is a # continuation of that previous line. # - But sometimes several statements can appear on the same line if tokens # make it obvious. Eg this is okay: # Eg: 'myFunc = let f x y = x + y in f 3 4' # # We'll only worry about top level statements for this part. But we still # need to know # - when a top level statement begins; and # - when two sequential tokens are part of the same top level statement. # They may be separated by a combination of whitespace, comments, and # newlines, but if there is a newline then that will always be followed # by an indent. # # When considering how one token relates to the next in top level statements # we should only need three kinds of "join"s: # - Where we need whitespace, such as 'import MyModule', but that space # may occur over multiple lines. If it's over multiple lines, the # second token needs to be somewhat in from the first column of text. # We'll call this _1_ - ie at least one space. # - Where we don't need whitespace, such as 'f = 3', but that space # may occur over multiple lines. If it's over multiple lines then again # the second token needs to be somewhat in from the first column of text. # We'll call this _0_ - ie possibly zero space. # - When we've got an end of statement, and the next token is some # meaningful code (not a comment) and starts in the first column of text. # Then that next token is the start of the next top level statement. # We'll call this TLSS, for top level statement separator. # # We can define _1_ as # - The longest possible sequence of whitespace, delimited comments, # newlines, and line comments, as long as it ends with a whitespace # or a delimited comment, because then it won't be in the first column. # # We can define _0_ as # - _1_ or the empty string. # # We can define TLSS as # - The longest possible sequence of whitespace, delimited comments, # newlines, and line comments, as long as it ends with a newline or EOF # (and there's no more ignorable characters after that). # # PEG parsing tip: If we want to define a sequence like 'the longest # sequence of As, Bs and Cs, as long as it ends with C' we define a short # sequence like 'the longest sequence of As and Bs, then a C' and then # define 'the longest sequence of those'. _1_short <- (lineComment / NL)* (WS / delimitedComment) _1_ <- _1_short+ _0_ <- _1_ / '' TLSS_short <- (WS / lineComment / delimitedComment)* (NL / EOF) TLSS <- TLSS_short+ !(WS / lineComment / delimitedComment) # An end of statement marks the end of a top level statement, but # doesn't consume anything EOS <- &( TLSS / EOF ) # When considering lines in a let/in block we'll want to look for # a newline and an indent. There may be some delimited comments etc # in between. _NL_IND_ <- TLSS_short+ WS+ %% #include "elm_post.h"