1# Copyright (c) 2022 Nik Silver 2# 3# This source code is released for free distribution under the terms of the 4# GNU General Public License version 2 or later. 5# 6# Thanks to: 7# - Mark Skipper, for the original Elm optlib parser, which inspired this; 8# - Samuel Stauffer, for the Thrift PEG parser, which showed me how to 9# write a PEG parser; 10# - Jan Dolinár, for the Kotlin PEG parser, which also provided insight; 11# - Masatake YAMATO, for patience and guidance in code reviews. 12# 13# This parser generates tags for Elm. See https://elm-lang.org/docs/syntax 14# for language reference. 15# 16# The parser will tag items reliably at the top level. Functions 17# defined in let/in blocks are also tagged, but with limitations. See below. 18# 19# Kinds 20# - m module 21# - n namespace (ie a module that's renamed) 22# - t type 23# - c constructor (within a type) 24# - a alias 25# - p port 26# - f function 27# 28# Key/value pairs 29# - roles:def This is defined here. 30# - roles:imported This is imported here. 31# - type:<t> This constructor is in the scope of type <t>, which 32# may be dotted. Eg Main.myType. 33# - function:<f> This function is in the scope of function <f>, which 34# may be dotted. Eg Main.myFunc. 35# - module:<m> This is in the scope of module <m>. 36# - typeref:description:<t> This function, constructor or port 37# has type <t>. 38# - moduleName:<m> This namespace has original module name <m>. 39# 40# Functions defined in let/in blocks may be tagged, with these limitations: 41# - the LHS (up to and including the '=') need to be on a single line; 42# - the LHS can only have simple parameters; 43# - their scope is only marked as being in the top-most function; 44# - any type annotation is ignored. 45# This should be good for 90% of inner functions. To make it totally robust 46# is much more complicated due to (a) Elm's clever indentation-sensitivity 47# and (b) limitations of the PEG parser used here. 48# 49# To do: 50# Maybe do: 51# - let/in blocks 52# - Allow tuples on the LHS. Eg '(val1, val2) = valFunc'. 53# - Inner functions' type annotations are used in the function's 54# type description. 55# - Inner functions can have more complex parameters. 56# - Functions 57# - Allow non-Latin upper and lower case. Use 58# https://util.unicode.org/UnicodeJsps/properties.html 59# combined with \p{Lu}, \p{Ll} and \p{L}. 60# 61# Won't do: 62# - Handle Elm's indentation properly. 63 64 65%prefix "pelm" 66 67%auxil "struct parserCtx *" 68 69%earlysource { 70 #include "general.h" 71} 72 73%header { 74 struct parserCtx; 75} 76 77%source { 78#include "elm_pre.h" 79#include "routines.h" 80 81/* 82 * Include these lines to debug the parsing. 83 * From https://github.com/arithy/packcc#macros 84 * This will output parsing info to STDERR.tmp in the vent of a failed test. 85 */ 86 87/* 88static const char *dbg_str[] = { "Evaluating rule", "Matched rule", "Abandoning rule" }; 89 90#define PCC_DEBUG(auxil, event, rule, level, pos, buffer, length) \ 91 fprintf(stderr, "%*s%s %s @%zu [%.*s]\n", \ 92 (int)((level) * 2), "", dbg_str[event], rule, pos, (int)(length), buffer) 93 */ 94} 95 96# Top level elements ----------------------------------------------------- 97 98# We separate the file into the module section and the main section 99# so that we only consider and tag one module declaration 100 101file <- 102 { 103 ELM_INIT_MODULE_SCOPE; 104 } 105 TLSS? 106 moduleDeclaration? 107 TLSS? 108 mainTopLevelStatements? 109 TLSS? 110 EOF 111 112mainTopLevelStatements <- 113 topLevelStatement (TLSS topLevelStatement)* 114 115topLevelStatement <- 116 importStatement 117 / typeAlias 118 / customType 119 / portDeclaration 120 / functionWithTypeAnnotation 121 / functionDefinition 122 / ignoreRestOfStatement 123 124# Main Elm grammar ------------------------------------------------------- 125 126# Module declaration 127# 128# We can be a bit relaxed about distinguishing functions, types and 129# constructors listed in a module declaration, because we're not going 130# to tag them. 131 132moduleDeclaration <- 133 ('port' _1_)? 'module' _1_ <dottedIdentifier> _1_ 'exposing' _0_ '(' exposedList ')' EOS { 134 elm_module_scope_index = makeElmTagSettingScope(auxil, $1, $1s, K_MODULE, ROLE_DEFINITION_INDEX); 135 } 136 137exposedList <- _0_ exposedItem _0_ (',' _0_ exposedList )* 138 139exposedItem <- 140 exposedFieldOrType 141 / exposedFunction 142 / exposedItemIgnored 143 144exposedFieldOrType <- 145 <upperStartIdentifier> (_0_ '(' _0_ exposedTypeConstructorList _0_ ')')? 146 147exposedFunction <- 148 lowerStartIdentifier 149 150exposedItemIgnored <- '.'+ 151 152exposedTypeConstructorList <- 153 (upperStartIdentifier / exposedItemIgnored) _0_ (',' _0_ exposedTypeConstructorList)* 154 155# Type alias 156# 157# We don't care what the actual alias is 158 159typeAlias <- 160 'type' _1_ 'alias' _1_ <upperStartIdentifier> _0_ '=' _0_ ignoreRestOfStatement { 161 makeElmTag(auxil, $1, $1s, K_ALIAS, ROLE_DEFINITION_INDEX); 162 } 163 164# Custom type 165# 166# Includes type parameters, such as 'x' in 'type MyType x = Wrap x'. 167# 168# In a definition such as 'type MyType = Cons1 String Int' we 169# capture 'MyType', and then for each type in each constructor 170# subtype (here, 'String' and 'Int') we append a '->' and finally 171# concatentate them all to get the constructor's type description, 172# such as 'String -> Int -> MyType' 173 174customType <- 175 'type' _1_ <upperStartIdentifier> (_0_ typeParameterList)? _0_ '=' _0_ { 176 initElmConstructorFields(auxil, $1); 177 makeElmTagSettingScope(auxil, $1, $1s, K_TYPE, ROLE_DEFINITION_INDEX); 178 } constructorList EOS { 179 POP_SCOPE(auxil); 180 tidyElmConstructorFields(auxil); 181 } 182 183typeParameterList <- lowerStartIdentifier (_1_ lowerStartIdentifier)* 184 185# A type could be defined as a constructor list: 186# type A = Cons1 String | Cons2 Float Float | ... 187# The 'String' and the 'Float Float' etc are the constructor subtypes. 188# Each 'String', 'Float', etc is a single type spec. 189# But a single type spec could also be a record, a tuple or a function spec. 190# 191# Subtypes in constructors need to be parsed differently from types in 192# type annotations and record fields. Consider these: 193# type A1Type a b = A1Cons a b -- Line 1 194# type A2Type a b = A2Cons String a b -- Line 2 195# type BType a b = BCons { x : A2Type a b} -- Line 3 196# cFunc : A1Type String Int -> String -- Line 4 197# In line 1, 'a b' must be parsed as two individual types (parameterised). 198# In line 2, 'String a b' must be parsed as three individual types. 199# In line 3, 'A2Type a b' must be parsed as one type, even though it's 200# lexically equivalent to 'String a b' on line 2. 201# In line 4, 'A1Type String Int' must also be parsed one type. 202# This means we have to have slightly different rules for parsing a 203# constructor's subtypes as from other cases. The first case is handled 204# by constructorSubtypeList and singleConstructorSubtypeSpec. The second 205# case is handled by singleTypeSpec. 206 207constructorList <- <upperStartIdentifier> { 208 initElmConstructorSubtypeFields(auxil); 209 } _0_ <constructorSubtypeList>? { 210 int r = makeElmTag(auxil, $1, $1s, K_CONSTRUCTOR, ROLE_DEFINITION_INDEX); 211 addElmConstructorTypeRef(auxil, r); 212 } _0_ ('|' _0_ constructorList)? 213 214constructorSubtypeList <- singleConstructorSubtypeSpec (_0_ singleConstructorSubtypeSpec)* 215 216singleConstructorSubtypeSpec <- 217 < recordTypeSpec 218 / tupleTypeSpec 219 / functionTypeSpec 220 / dottedIdentifier 221 > 222 { 223 addElmConstructorSubtype(auxil, $1); 224 } 225 226singleTypeSpec <- 227 recordTypeSpec 228 / tupleTypeSpec 229 / functionTypeSpec 230 / parameterisedTypeSpec 231 232recordTypeSpec <- 233 '{' (_0_ recordRestrictionPrefix)? _0_ fieldSpec (_0_ ',' _0_ fieldSpec)* _0_ '}' 234 / '{' (_0_ recordRestrictionPrefix)? _0_ '}' 235 236recordRestrictionPrefix <- 237 lowerStartIdentifier _0_ '|' 238 239fieldSpec <- 240 lowerStartIdentifier _0_ ':' _0_ singleTypeSpec 241 242tupleTypeSpec <- 243 '(' _0_ singleTypeSpec (_0_ ',' _0_ singleTypeSpec)* _0_ ')' 244 / '(' _0_ ')' 245 246parameterisedTypeSpec <- 247 dottedIdentifier (_1_ (singleTypeSpec / lowerStartIdentifier))* 248 249functionTypeSpec <- 250 singleTypeSpec (_0_ '->' _0_ singleTypeSpec)+ 251 252# Port declaration 253 254portDeclaration <- 255 'port' _1_ <lowerStartIdentifier> _0_ ':' _0_ <typeAnnotation> EOS { 256 int r = makeElmTag(auxil, $1, $1s, K_PORT, ROLE_DEFINITION_INDEX); 257 addElmTypeRef(r, $2); 258 } 259 260# Import statement 261# 262# For the import statement we don't want the imported items to appear in the 263# scope of the current module (ie this file), otherwise they'll be named 264# wrongly. So we # want to save the module scope, make the imported tags, 265# then restore the module scope. We do this in two separate C code blocks, 266# because the module scope needs to be saved before any of the imported tags 267# are made. 268# 269# Also, if we create a namespace then that *does* live in the scope of the 270# current module, so we'll make that tag (if needed) before saving the 271# module scope. 272 273importStatement <- 274 'import' _1_ <dottedIdentifier> (_1_ 'as' _1_ <upperStartIdentifier>)? { 275 // Make the namespace tag first, as it's in the file module's scope 276 if ($2s > 0) { 277 int r = makeElmTag(auxil, $2, $2s, K_NAMESPACE, ROLE_DEFINITION_INDEX); 278 attachParserFieldToCorkEntry (r, ElmFields[F_MODULENAME].ftype, $1); 279 } 280 281 // Now make the tag for the imported module, as it lives outside 282 // the scope of the file module 283 ELM_SAVE_MODULE_SCOPE; 284 makeElmTagSettingScope(auxil, $1, $1s, K_MODULE, ELM_MODULE_IMPORTED); 285 } (_1_ 'exposing' _0_ '(' _0_ importedList _0_ ')')? EOS { 286 ELM_RESTORE_MODULE_SCOPE; 287 } 288 289importedList <- importedItem _0_ (',' _0_ importedList)* 290 291importedItem <- 292 importedFunction 293 / importedType 294 / importedItemIgnored 295 296importedFunction <- <lowerStartIdentifier> { 297 makeElmTag(auxil, $1, $1s, K_FUNCTION, ELM_FUNCTION_EXPOSED); 298 } 299 300# When importing a type and constructors we want the constructors 301# to be in the scope of the type. So we have to set the scope as the 302# type first, before parsing (and making the tags for) the constructors. 303# That's why the code here uses two separate C code blocks. 304 305importedType <- 306 <upperStartIdentifier> { 307 makeElmTagSettingScope(auxil, $1, $1s, K_TYPE, ELM_TYPE_EXPOSED); 308 } (_0_ '(' _0_ importedTypeConstructorList _0_ ')')? { 309 // We're done with the type and its constructors, so we can pop it 310 POP_SCOPE(auxil); 311 } 312 313importedItemIgnored <- '.'+ 314 315importedTypeConstructorList <- 316 (importedTypeConstructor / importedItemIgnored) _0_ (',' _0_ importedTypeConstructorList)* 317 318importedTypeConstructor <- 319 <upperStartIdentifier> { 320 makeElmTag(auxil, $1, $1s, K_CONSTRUCTOR, ELM_CONSTRUCTOR_EXPOSED); 321 } 322 323# Function with a type annotation. 324# 325# The type is on one line, and the function must follow immediately as 326# the next top level statement 327 328functionWithTypeAnnotation <- 329 <lowerStartIdentifier> _0_ ':' _0_ <typeAnnotation> TLSS 330 <$1> _1_ <functionParameterList>? { 331 int r = makeElmTagSettingScope(auxil, $3, $3s, K_FUNCTION, ROLE_DEFINITION_INDEX); 332 addElmTypeRef(r, $2); 333 addElmSignature(r, $4); 334 } _0_ '=' _0_ expression EOS { 335 POP_SCOPE(auxil); 336 } 337 338typeAnnotation <- 339 singleTypeSpec (_0_ '->' _0_ singleTypeSpec)* 340 341# Function without a type annotation 342 343functionDefinition <- 344 <nonKeywordIdentifier> _0_ <functionParameterList>? { 345 int r = makeElmTagSettingScope(auxil, $1, $1s, K_FUNCTION, ROLE_DEFINITION_INDEX); 346 addElmSignature(r, $2); 347 } _0_ '=' _0_ expression EOS { 348 POP_SCOPE(auxil); 349 } 350 351# A function parameter list is what we define a function with. It's the 352# x y z in 'fn x y z'. But of course they can be more complex, such as 353# 'fn (Cons a b) ({ thing } as otherThing))' etc. 354 355functionParameterList <- functionParameter (_0_ functionParameter)* 356 357functionParameter <- 358 plainFunctionParameter 359 / tupleFunctionParameter 360 / recordFunctionParameter 361 / constructorFunctionParameter 362 363plainFunctionParameter <- 364 lowerStartIdentifier (_0_ asClause)? 365 366tupleFunctionParameter <- 367 '(' _0_ functionParameter (_0_ ',' _0_ functionParameter)* _0_ ')' (_0_ asClause)? 368 369recordFunctionParameter <- 370 '{' _0_ lowerStartIdentifier (_0_ ',' _0_ lowerStartIdentifier)* _0_ '}' (_0_ asClause)? 371 372constructorFunctionParameter <- 373 upperStartIdentifier (_0_ functionParameter)* (_0_ asClause)? 374 375asClause <- 376 'as' _1_ lowerStartIdentifier 377 378# Expressions 379 380expression <- 381 (letInBlock _NL_IND_)? simpleExpression (_0_ binaryOperator _0_ expression)* 382 383simpleExpression <- 384 hexNumber 385 / decimal 386 / multilineString 387 / characterLiteral 388 / oneLineString 389 / tupleExpression 390 / listExpression 391 / recordExpression 392 / caseStatement 393 / ifThenElseStatement 394 / anonymousFunction 395 / functionCall 396 397tupleExpression <- 398 '(' _0_ expression (_0_ ',' _0_ expression)* _0_ ')' 399 / '(' _0_ ')' 400 401listExpression <- 402 '[' _0_ expression (_0_ ',' _0_ expression)* _0_ ']' 403 / '[' _0_ ']' 404 405recordExpression <- 406 '{' _0_ 407 (lowerStartIdentifier _0_ '|' _0_)? 408 recordExpressionAssignment (_0_ ',' _0_ recordExpressionAssignment)* _0_ 409 '}' 410 / '{' _0_ '}' 411 412recordExpressionAssignment <- 413 lowerStartIdentifier _0_ '=' _0_ expression 414 415anonymousFunction <- 416 '\\' _0_ functionParameterList _0_ '->' _0_ expression 417 418functionCall <- 419 ( dottedIdentifier 420 / '.' lowerStartIdentifier 421 / '(' binaryOperator ')' 422 ) (_1_ expression)* 423 424# Let/in block 425# 426# We'll treat let/in blocks very simply - we'll consider each line 427# and expect the whole line either to be the start of a function 428# definition (perhaps with some of its body) or its body. So something 429# like 'f x y =' will have to be on one line. 430 431letInBlock <- 432 'let' _NL_IND_ 433 letInLine (_NL_IND_ letInLine)* _NL_IND_ 434 'in' 435 436letInLine <- 437 letInFunctionDefinition 438 / letInBlock 439 / letInFunctionBody 440 441letInFunctionDefinition <- 442 <nonKeywordIdentifier> WS* <letInFunctionParameters>? WS* '=' Non_NL* { 443 int r = makeElmTag(auxil, $1, $1s, K_FUNCTION, ROLE_DEFINITION_INDEX); 444 addElmSignature(r, $2); 445 } 446 447letInFunctionParameters <- 448 nonKeywordIdentifier (WS+ nonKeywordIdentifier)* 449 450letInFunctionBody <- 451 !('let' / 'in') Non_NL+ 452 453# Case statements 454# 455# We're going to be pretty loose with case statements, otherwise we'd 456# have to follow Elm's indentation rules. So we'll just say 457# the body of a case statement is a series of patterns like this: 458# <something> -> <expression>. The <expression> might well swallow 459# up a bit of the next case pattern (because to do otherwise requires 460# following Elm's indentation rules), so that's why we just specify 461# <something>. 462 463caseStatement <- 464 'case' _1_ expression _0_ 'of' _1_ 465 caseClauseList 466 467caseClauseList <- 468 caseClause (_1_ caseClause)* 469 470caseClause <- 471 roughCasePatternChar* '->' _0_ expression 472 473roughCasePatternChar <- 474 !('->' / TLSS / lineComment / delimitedComment / NL) . 475 476# If/then/else statements 477 478ifThenElseStatement <- 479 'if' _1_ expression _1_ 480 'then' _1_ expression _1_ 481 'else' _1_ expression 482 483# Binary operators 484 485binaryOperator <- 486 '>>' / '<<' / '|>' / '<|' 487 / '//' / '++' / '::' 488 / '==' / '/=' 489 / '&&' / '||' 490 / '<=' / '>=' 491 / '<' / '>' 492 / '+' / '-' / '*' / '/' / '^' 493 494# Sometimes we just need to ignore the rest of the (top level) statement 495 496ignoreRestOfStatement <- 497 (multilineString / Non_WS_or_NL+) (_1_ ignoreRestOfStatement)* 498 499multilineString <- 500 '"""' (!'"""' .)* '"""' 501 502# Low level tokens ------------------------------------------------------- 503 504# Identifiers 505 506naiveIdentifier <- [A-Za-z_] alphanumeric* 507 508upperStartIdentifier <- [A-Z] alphanumeric* 509 510lowerStartIdentifier <- !keyword [a-z_] alphanumeric* 511 512alphanumeric <- [A-Za-z0-9_] 513 514nonKeywordIdentifier <- 515 !keyword naiveIdentifier 516 517keyword <- 518 'type' !alphanumeric 519 / 'module' !alphanumeric 520 / 'port' !alphanumeric 521 / 'alias' !alphanumeric 522 / 'as' !alphanumeric 523 / 'exposing' !alphanumeric 524 / 'import' !alphanumeric 525 / 'let' !alphanumeric 526 / 'in' !alphanumeric 527 / 'case' !alphanumeric 528 / 'of' !alphanumeric 529 / 'if' !alphanumeric 530 / 'then' !alphanumeric 531 / 'else' !alphanumeric 532 533dottedIdentifier <- nonKeywordIdentifier ('.' nonKeywordIdentifier)* 534 535# Numbers 536 537decimal <- 538 exponentialDecimal 539 / simpleDecimal 540 541exponentialDecimal <- 542 simpleDecimal 'e' simpleInteger 543 544simpleDecimal <- 545 simpleInteger ('.' digits)? 546 / '.' digits+ 547 548simpleInteger <- [-+]? digits 549 550digits <- [0-9]+ 551 552hexNumber <- '0x' [0-9A-Fa-f]+ 553 554# One line strings and characters 555 556oneLineString <- '"' inStringChar* '"' 557 558characterLiteral <- "'" inStringChar "'" 559 560inStringChar <- 561 !('"' / NL) 562 ( inStringUnicodeChar / inStringEscapedChar / inStringPlainChar ) 563 564inStringPlainChar <- 565 !('"' / '\\' / NL) . 566 567inStringEscapedChar <- 568 '\\' !('u' / NL) . 569 570inStringUnicodeChar <- 571 '\\u{' [0-9A-Fa-f]+ '}' 572 573# Ignorable things ------------------------------------------------------- 574 575# Simple things... 576 577WS <- [ \t]+ 578NL <- '\n' / '\f' / '\r' '\n'? 579Non_NL <- [^\n\r\f] 580Non_WS_or_NL <- [^ \t\n\r\f] 581EOF <- !. 582 583# A delimited comment is effectively "nothing", even if it spans several 584# lines. But it does separate two tokens. 585# 586# A line comment can only come at the end of a line. Notice here it doesn't 587# include the actual newline. 588 589delimitedComment <- '{-' (delimitedComment / !'-}' .)* '-}' 590 591lineComment <- '--' Non_NL* 592 593# Elm whitespacing is a bit special... 594# - Two statements are at the same level (eg at the top level, or statements 595# in the same let...in block) only if they begin with the same indentation. 596# - One line has more indentation than the previous line then it is a 597# continuation of that previous line. 598# - But sometimes several statements can appear on the same line if tokens 599# make it obvious. Eg this is okay: 600# Eg: 'myFunc = let f x y = x + y in f 3 4' 601# 602# We'll only worry about top level statements for this part. But we still 603# need to know 604# - when a top level statement begins; and 605# - when two sequential tokens are part of the same top level statement. 606# They may be separated by a combination of whitespace, comments, and 607# newlines, but if there is a newline then that will always be followed 608# by an indent. 609# 610# When considering how one token relates to the next in top level statements 611# we should only need three kinds of "join"s: 612# - Where we need whitespace, such as 'import MyModule', but that space 613# may occur over multiple lines. If it's over multiple lines, the 614# second token needs to be somewhat in from the first column of text. 615# We'll call this _1_ - ie at least one space. 616# - Where we don't need whitespace, such as 'f = 3', but that space 617# may occur over multiple lines. If it's over multiple lines then again 618# the second token needs to be somewhat in from the first column of text. 619# We'll call this _0_ - ie possibly zero space. 620# - When we've got an end of statement, and the next token is some 621# meaningful code (not a comment) and starts in the first column of text. 622# Then that next token is the start of the next top level statement. 623# We'll call this TLSS, for top level statement separator. 624# 625# We can define _1_ as 626# - The longest possible sequence of whitespace, delimited comments, 627# newlines, and line comments, as long as it ends with a whitespace 628# or a delimited comment, because then it won't be in the first column. 629# 630# We can define _0_ as 631# - _1_ or the empty string. 632# 633# We can define TLSS as 634# - The longest possible sequence of whitespace, delimited comments, 635# newlines, and line comments, as long as it ends with a newline or EOF 636# (and there's no more ignorable characters after that). 637# 638# PEG parsing tip: If we want to define a sequence like 'the longest 639# sequence of As, Bs and Cs, as long as it ends with C' we define a short 640# sequence like 'the longest sequence of As and Bs, then a C' and then 641# define 'the longest sequence of those'. 642 643_1_short <- 644 (lineComment / NL)* (WS / delimitedComment) 645 646_1_ <- _1_short+ 647 648 649_0_ <- _1_ / '' 650 651TLSS_short <- 652 (WS / lineComment / delimitedComment)* (NL / EOF) 653 654TLSS <- 655 TLSS_short+ 656 !(WS / lineComment / delimitedComment) 657 658# An end of statement marks the end of a top level statement, but 659# doesn't consume anything 660 661EOS <- &( TLSS / EOF ) 662 663# When considering lines in a let/in block we'll want to look for 664# a newline and an indent. There may be some delimited comments etc 665# in between. 666 667_NL_IND_ <- 668 TLSS_short+ WS+ 669 670%% 671#include "elm_post.h" 672