xref: /Universal-ctags/peg/elm.peg (revision fa5642e4dd4866485c417ba4997bf6e2646e8441)
1# Copyright (c) 2022 Nik Silver
2#
3# This source code is released for free distribution under the terms of the
4# GNU General Public License version 2 or later.
5#
6# Thanks to:
7# - Mark Skipper, for the original Elm optlib parser, which inspired this;
8# - Samuel Stauffer, for the Thrift PEG parser, which showed me how to
9#   write a PEG parser;
10# - Jan Dolinár, for the Kotlin PEG parser, which also provided insight;
11# - Masatake YAMATO, for patience and guidance in code reviews.
12#
13# This parser generates tags for Elm. See https://elm-lang.org/docs/syntax
14# for language reference.
15#
16# The parser will tag items reliably at the top level. Functions
17# defined in let/in blocks are also tagged, but with limitations. See below.
18#
19# Kinds
20# - m module
21# - n namespace (ie a module that's renamed)
22# - t type
23# - c constructor (within a type)
24# - a alias
25# - p port
26# - f function
27#
28# Key/value pairs
29# - roles:def       This is defined here.
30# - roles:imported  This is imported here.
31# - type:<t>        This constructor is in the scope of type <t>, which
32#                   may be dotted. Eg Main.myType.
33# - function:<f>    This function is in the scope of function <f>, which
34#                   may be dotted. Eg Main.myFunc.
35# - module:<m>      This is in the scope of module <m>.
36# - typeref:description:<t>   This function, constructor or port
37#                   has type <t>.
38# - moduleName:<m>  This namespace has original module name <m>.
39#
40# Functions defined in let/in blocks may be tagged, with these limitations:
41# - the LHS (up to and including the '=') need to be on a single line;
42# - the LHS can only have simple parameters;
43# - their scope is only marked as being in the top-most function;
44# - any type annotation is ignored.
45# This should be good for 90% of inner functions. To make it totally robust
46# is much more complicated due to (a) Elm's clever indentation-sensitivity
47# and (b) limitations of the PEG parser used here.
48#
49# To do:
50# Maybe do:
51# - let/in blocks
52#   - Allow tuples on the LHS. Eg '(val1, val2) = valFunc'.
53#   - Inner functions' type annotations are used in the function's
54#     type description.
55#   - Inner functions can have more complex parameters.
56# - Functions
57#   - Allow non-Latin upper and lower case. Use
58#     https://util.unicode.org/UnicodeJsps/properties.html
59#     combined with \p{Lu}, \p{Ll} and \p{L}.
60#
61# Won't do:
62# - Handle Elm's indentation properly.
63
64
65%prefix "pelm"
66
67%auxil	"struct parserCtx *"
68
69%earlysource {
70    #include "general.h"
71}
72
73%header {
74	struct parserCtx;
75}
76
77%source {
78#include "elm_pre.h"
79#include "routines.h"
80
81/*
82 * Include these lines to debug the parsing.
83 * From https://github.com/arithy/packcc#macros
84 * This will output parsing info to STDERR.tmp in the vent of a failed test.
85 */
86
87/*
88static const char *dbg_str[] = { "Evaluating rule", "Matched rule", "Abandoning rule" };
89
90#define PCC_DEBUG(auxil, event, rule, level, pos, buffer, length) \
91    fprintf(stderr, "%*s%s %s @%zu [%.*s]\n", \
92        (int)((level) * 2), "", dbg_str[event], rule, pos, (int)(length), buffer)
93 */
94}
95
96# Top level elements -----------------------------------------------------
97
98# We separate the file into the module section and the main section
99# so that we only consider and tag one module declaration
100
101file <-
102    {
103        ELM_INIT_MODULE_SCOPE;
104    }
105    TLSS?
106    moduleDeclaration?
107    TLSS?
108    mainTopLevelStatements?
109    TLSS?
110    EOF
111
112mainTopLevelStatements <-
113    topLevelStatement (TLSS topLevelStatement)*
114
115topLevelStatement <-
116    importStatement
117    / typeAlias
118    / customType
119    / portDeclaration
120    / functionWithTypeAnnotation
121    / functionDefinition
122    / ignoreRestOfStatement
123
124# Main Elm grammar -------------------------------------------------------
125
126# Module declaration
127#
128# We can be a bit relaxed about distinguishing functions, types and
129# constructors listed in a module declaration, because we're not going
130# to tag them.
131
132moduleDeclaration <-
133    ('port' _1_)? 'module' _1_ <dottedIdentifier> _1_ 'exposing' _0_ '(' exposedList ')' EOS {
134        elm_module_scope_index = makeElmTagSettingScope(auxil, $1, $1s, K_MODULE, ROLE_DEFINITION_INDEX);
135    }
136
137exposedList <- _0_ exposedItem _0_ (',' _0_ exposedList )*
138
139exposedItem <-
140    exposedFieldOrType
141    / exposedFunction
142    / exposedItemIgnored
143
144exposedFieldOrType <-
145    <upperStartIdentifier> (_0_ '(' _0_ exposedTypeConstructorList _0_ ')')?
146
147exposedFunction <-
148    lowerStartIdentifier
149
150exposedItemIgnored <- '.'+
151
152exposedTypeConstructorList <-
153    (upperStartIdentifier / exposedItemIgnored) _0_ (',' _0_ exposedTypeConstructorList)*
154
155# Type alias
156#
157# We don't care what the actual alias is
158
159typeAlias <-
160    'type' _1_ 'alias' _1_ <upperStartIdentifier> _0_ '=' _0_ ignoreRestOfStatement {
161        makeElmTag(auxil, $1, $1s, K_ALIAS, ROLE_DEFINITION_INDEX);
162    }
163
164# Custom type
165#
166# Includes type parameters, such as 'x' in 'type MyType x = Wrap x'.
167#
168# In a definition such as 'type MyType = Cons1 String Int' we
169# capture 'MyType', and then for each type in each constructor
170# subtype (here, 'String' and 'Int') we append a '->' and finally
171# concatentate them all to get the constructor's type description,
172# such as 'String -> Int -> MyType'
173
174customType <-
175    'type' _1_ <upperStartIdentifier> (_0_ typeParameterList)? _0_ '=' _0_ {
176        initElmConstructorFields(auxil, $1);
177        makeElmTagSettingScope(auxil, $1, $1s, K_TYPE, ROLE_DEFINITION_INDEX);
178    } constructorList EOS {
179        POP_SCOPE(auxil);
180        tidyElmConstructorFields(auxil);
181    }
182
183typeParameterList <- lowerStartIdentifier (_1_ lowerStartIdentifier)*
184
185# A type could be defined as a constructor list:
186#     type A = Cons1 String | Cons2 Float Float | ...
187# The 'String' and the 'Float Float' etc are the constructor subtypes.
188# Each 'String', 'Float', etc is a single type spec.
189# But a single type spec could also be a record, a tuple or a function spec.
190#
191# Subtypes in constructors need to be parsed differently from types in
192# type annotations and record fields. Consider these:
193#     type A1Type a b = A1Cons a b              -- Line 1
194#     type A2Type a b = A2Cons String a b       -- Line 2
195#     type BType a b = BCons { x : A2Type a b}  -- Line 3
196#     cFunc : A1Type String Int -> String       -- Line 4
197# In line 1, 'a b' must be parsed as two individual types (parameterised).
198# In line 2, 'String a b' must be parsed as three individual types.
199# In line 3, 'A2Type a b' must be parsed as one type, even though it's
200# lexically equivalent to 'String a b' on line 2.
201# In line 4, 'A1Type String Int' must also be parsed one type.
202# This means we have to have slightly different rules for parsing a
203# constructor's subtypes as from other cases. The first case is handled
204# by constructorSubtypeList and singleConstructorSubtypeSpec. The second
205# case is handled by singleTypeSpec.
206
207constructorList <- <upperStartIdentifier> {
208        initElmConstructorSubtypeFields(auxil);
209    } _0_ <constructorSubtypeList>? {
210        int r = makeElmTag(auxil, $1, $1s, K_CONSTRUCTOR, ROLE_DEFINITION_INDEX);
211        addElmConstructorTypeRef(auxil, r);
212    } _0_ ('|' _0_ constructorList)?
213
214constructorSubtypeList <- singleConstructorSubtypeSpec (_0_ singleConstructorSubtypeSpec)*
215
216singleConstructorSubtypeSpec <-
217    < recordTypeSpec
218      / tupleTypeSpec
219      / functionTypeSpec
220      / dottedIdentifier
221    >
222    {
223        addElmConstructorSubtype(auxil, $1);
224    }
225
226singleTypeSpec <-
227    recordTypeSpec
228    / tupleTypeSpec
229    / functionTypeSpec
230    / parameterisedTypeSpec
231
232recordTypeSpec <-
233    '{' (_0_ recordRestrictionPrefix)? _0_ fieldSpec (_0_ ',' _0_ fieldSpec)* _0_ '}'
234    / '{' (_0_ recordRestrictionPrefix)? _0_ '}'
235
236recordRestrictionPrefix <-
237    lowerStartIdentifier _0_ '|'
238
239fieldSpec <-
240    lowerStartIdentifier _0_ ':' _0_ singleTypeSpec
241
242tupleTypeSpec <-
243    '(' _0_ singleTypeSpec (_0_ ',' _0_ singleTypeSpec)* _0_ ')'
244    / '(' _0_ ')'
245
246parameterisedTypeSpec <-
247    dottedIdentifier (_1_ (singleTypeSpec / lowerStartIdentifier))*
248
249functionTypeSpec <-
250    singleTypeSpec (_0_ '->' _0_ singleTypeSpec)+
251
252# Port declaration
253
254portDeclaration <-
255    'port' _1_ <lowerStartIdentifier> _0_ ':' _0_ <typeAnnotation> EOS {
256        int r = makeElmTag(auxil, $1, $1s, K_PORT, ROLE_DEFINITION_INDEX);
257        addElmTypeRef(r, $2);
258    }
259
260# Import statement
261#
262# For the import statement we don't want the imported items to appear in the
263# scope of the current module (ie this file), otherwise they'll be named
264# wrongly. So we # want to save the module scope, make the imported tags,
265# then restore the module scope. We do this in two separate C code blocks,
266# because the module scope needs to be saved before any of the imported tags
267# are made.
268#
269# Also, if we create a namespace then that *does* live in the scope of the
270# current module, so we'll make that tag (if needed) before saving the
271# module scope.
272
273importStatement <-
274    'import' _1_ <dottedIdentifier> (_1_ 'as' _1_ <upperStartIdentifier>)? {
275        // Make the namespace tag first, as it's in the file module's scope
276        if ($2s > 0) {
277            int r = makeElmTag(auxil, $2, $2s, K_NAMESPACE, ROLE_DEFINITION_INDEX);
278            attachParserFieldToCorkEntry (r, ElmFields[F_MODULENAME].ftype, $1);
279        }
280
281        // Now make the tag for the imported module, as it lives outside
282        // the scope of the file module
283        ELM_SAVE_MODULE_SCOPE;
284        makeElmTagSettingScope(auxil, $1, $1s, K_MODULE, ELM_MODULE_IMPORTED);
285    } (_1_ 'exposing' _0_ '(' _0_ importedList _0_ ')')? EOS {
286        ELM_RESTORE_MODULE_SCOPE;
287    }
288
289importedList <- importedItem _0_ (',' _0_ importedList)*
290
291importedItem <-
292    importedFunction
293    / importedType
294    / importedItemIgnored
295
296importedFunction <- <lowerStartIdentifier> {
297        makeElmTag(auxil, $1, $1s, K_FUNCTION, ELM_FUNCTION_EXPOSED);
298    }
299
300# When importing a type and constructors we want the constructors
301# to be in the scope of the type. So we have to set the scope as the
302# type first, before parsing (and making the tags for) the constructors.
303# That's why the code here uses two separate C code blocks.
304
305importedType <-
306    <upperStartIdentifier> {
307        makeElmTagSettingScope(auxil, $1, $1s, K_TYPE, ELM_TYPE_EXPOSED);
308    } (_0_ '(' _0_ importedTypeConstructorList _0_ ')')? {
309        // We're done with the type and its constructors, so we can pop it
310        POP_SCOPE(auxil);
311    }
312
313importedItemIgnored <- '.'+
314
315importedTypeConstructorList <-
316    (importedTypeConstructor / importedItemIgnored) _0_ (',' _0_ importedTypeConstructorList)*
317
318importedTypeConstructor <-
319    <upperStartIdentifier> {
320        makeElmTag(auxil, $1, $1s, K_CONSTRUCTOR, ELM_CONSTRUCTOR_EXPOSED);
321    }
322
323# Function with a type annotation.
324#
325# The type is on one line, and the function must follow immediately as
326# the next top level statement
327
328functionWithTypeAnnotation <-
329    <lowerStartIdentifier> _0_ ':' _0_ <typeAnnotation> TLSS
330    <$1> _1_ <functionParameterList>? {
331        int r = makeElmTagSettingScope(auxil, $3, $3s, K_FUNCTION, ROLE_DEFINITION_INDEX);
332        addElmTypeRef(r, $2);
333        addElmSignature(r, $4);
334    } _0_ '=' _0_ expression EOS {
335        POP_SCOPE(auxil);
336    }
337
338typeAnnotation <-
339    singleTypeSpec (_0_ '->' _0_ singleTypeSpec)*
340
341# Function without a type annotation
342
343functionDefinition <-
344    <nonKeywordIdentifier> _0_ <functionParameterList>? {
345        int r = makeElmTagSettingScope(auxil, $1, $1s, K_FUNCTION, ROLE_DEFINITION_INDEX);
346        addElmSignature(r, $2);
347    } _0_ '=' _0_ expression EOS {
348        POP_SCOPE(auxil);
349    }
350
351# A function parameter list is what we define a function with. It's the
352# x y z in 'fn x y z'. But of course they can be more complex, such as
353# 'fn (Cons a b) ({ thing } as otherThing))' etc.
354
355functionParameterList <- functionParameter (_0_ functionParameter)*
356
357functionParameter <-
358    plainFunctionParameter
359    / tupleFunctionParameter
360    / recordFunctionParameter
361    / constructorFunctionParameter
362
363plainFunctionParameter <-
364    lowerStartIdentifier (_0_ asClause)?
365
366tupleFunctionParameter <-
367    '(' _0_ functionParameter (_0_ ',' _0_ functionParameter)* _0_ ')' (_0_ asClause)?
368
369recordFunctionParameter <-
370    '{' _0_ lowerStartIdentifier (_0_ ',' _0_ lowerStartIdentifier)* _0_ '}' (_0_ asClause)?
371
372constructorFunctionParameter <-
373    upperStartIdentifier (_0_ functionParameter)* (_0_ asClause)?
374
375asClause <-
376    'as' _1_ lowerStartIdentifier
377
378# Expressions
379
380expression <-
381    (letInBlock _NL_IND_)? simpleExpression (_0_ binaryOperator _0_ expression)*
382
383simpleExpression <-
384    hexNumber
385    / decimal
386    / multilineString
387    / characterLiteral
388    / oneLineString
389    / tupleExpression
390    / listExpression
391    / recordExpression
392    / caseStatement
393    / ifThenElseStatement
394    / anonymousFunction
395    / functionCall
396
397tupleExpression <-
398    '(' _0_ expression (_0_ ',' _0_ expression)* _0_ ')'
399    / '(' _0_ ')'
400
401listExpression <-
402    '[' _0_ expression (_0_ ',' _0_ expression)* _0_ ']'
403    / '[' _0_ ']'
404
405recordExpression <-
406    '{' _0_
407    (lowerStartIdentifier _0_ '|' _0_)?
408    recordExpressionAssignment (_0_ ',' _0_ recordExpressionAssignment)* _0_
409    '}'
410    / '{' _0_ '}'
411
412recordExpressionAssignment <-
413    lowerStartIdentifier _0_ '=' _0_ expression
414
415anonymousFunction <-
416    '\\' _0_ functionParameterList _0_ '->' _0_ expression
417
418functionCall <-
419    ( dottedIdentifier
420      / '.' lowerStartIdentifier
421      / '(' binaryOperator ')'
422    ) (_1_ expression)*
423
424# Let/in block
425#
426# We'll treat let/in blocks very simply - we'll consider each line
427# and expect the whole line either to be the start of a function
428# definition (perhaps with some of its body) or its body. So something
429# like 'f x y =' will have to be on one line.
430
431letInBlock <-
432    'let' _NL_IND_
433    letInLine (_NL_IND_ letInLine)* _NL_IND_
434    'in'
435
436letInLine <-
437    letInFunctionDefinition
438    / letInBlock
439    / letInFunctionBody
440
441letInFunctionDefinition <-
442    <nonKeywordIdentifier> WS* <letInFunctionParameters>? WS* '=' Non_NL* {
443        int r = makeElmTag(auxil, $1, $1s, K_FUNCTION, ROLE_DEFINITION_INDEX);
444        addElmSignature(r, $2);
445    }
446
447letInFunctionParameters <-
448    nonKeywordIdentifier (WS+ nonKeywordIdentifier)*
449
450letInFunctionBody <-
451    !('let' / 'in') Non_NL+
452
453# Case statements
454#
455# We're going to be pretty loose with case statements, otherwise we'd
456# have to follow Elm's indentation rules. So we'll just say
457# the body of a case statement is a series of patterns like this:
458# <something> -> <expression>. The <expression> might well swallow
459# up a bit of the next case pattern (because to do otherwise requires
460# following Elm's indentation rules), so that's why we just specify
461# <something>.
462
463caseStatement <-
464    'case' _1_ expression _0_ 'of' _1_
465    caseClauseList
466
467caseClauseList <-
468    caseClause (_1_ caseClause)*
469
470caseClause <-
471    roughCasePatternChar* '->' _0_ expression
472
473roughCasePatternChar <-
474    !('->' / TLSS / lineComment / delimitedComment / NL) .
475
476# If/then/else statements
477
478ifThenElseStatement <-
479    'if' _1_ expression _1_
480    'then' _1_ expression _1_
481    'else' _1_ expression
482
483# Binary operators
484
485binaryOperator <-
486    '>>' / '<<' / '|>' / '<|'
487    / '//' / '++' / '::'
488    / '==' / '/='
489    / '&&' / '||'
490    / '<=' / '>='
491    / '<' / '>'
492    / '+' / '-' / '*' / '/' / '^'
493
494# Sometimes we just need to ignore the rest of the (top level) statement
495
496ignoreRestOfStatement <-
497    (multilineString / Non_WS_or_NL+) (_1_ ignoreRestOfStatement)*
498
499multilineString <-
500    '"""' (!'"""' .)* '"""'
501
502# Low level tokens -------------------------------------------------------
503
504# Identifiers
505
506naiveIdentifier <- [A-Za-z_] alphanumeric*
507
508upperStartIdentifier <- [A-Z] alphanumeric*
509
510lowerStartIdentifier <- !keyword [a-z_] alphanumeric*
511
512alphanumeric <- [A-Za-z0-9_]
513
514nonKeywordIdentifier <-
515    !keyword naiveIdentifier
516
517keyword <-
518    'type' !alphanumeric
519    / 'module' !alphanumeric
520    / 'port' !alphanumeric
521    / 'alias' !alphanumeric
522    / 'as' !alphanumeric
523    / 'exposing' !alphanumeric
524    / 'import' !alphanumeric
525    / 'let' !alphanumeric
526    / 'in' !alphanumeric
527    / 'case' !alphanumeric
528    / 'of' !alphanumeric
529    / 'if' !alphanumeric
530    / 'then' !alphanumeric
531    / 'else' !alphanumeric
532
533dottedIdentifier <- nonKeywordIdentifier ('.' nonKeywordIdentifier)*
534
535# Numbers
536
537decimal <-
538    exponentialDecimal
539    / simpleDecimal
540
541exponentialDecimal <-
542    simpleDecimal 'e' simpleInteger
543
544simpleDecimal <-
545    simpleInteger ('.' digits)?
546    / '.' digits+
547
548simpleInteger <- [-+]? digits
549
550digits <- [0-9]+
551
552hexNumber <- '0x' [0-9A-Fa-f]+
553
554# One line strings and characters
555
556oneLineString <- '"' inStringChar* '"'
557
558characterLiteral <- "'" inStringChar "'"
559
560inStringChar <-
561    !('"' / NL)
562    ( inStringUnicodeChar / inStringEscapedChar / inStringPlainChar )
563
564inStringPlainChar <-
565    !('"' / '\\' / NL) .
566
567inStringEscapedChar <-
568    '\\' !('u' / NL) .
569
570inStringUnicodeChar <-
571    '\\u{' [0-9A-Fa-f]+ '}'
572
573# Ignorable things -------------------------------------------------------
574
575# Simple things...
576
577WS <- [ \t]+
578NL <- '\n' / '\f' / '\r' '\n'?
579Non_NL <- [^\n\r\f]
580Non_WS_or_NL <- [^ \t\n\r\f]
581EOF <- !.
582
583# A delimited comment is effectively "nothing", even if it spans several
584# lines. But it does separate two tokens.
585#
586# A line comment can only come at the end of a line. Notice here it doesn't
587# include the actual newline.
588
589delimitedComment <- '{-' (delimitedComment / !'-}' .)* '-}'
590
591lineComment <- '--' Non_NL*
592
593# Elm whitespacing is a bit special...
594# - Two statements are at the same level (eg at the top level, or statements
595#   in the same let...in block) only if they begin with the same indentation.
596# - One line has more indentation than the previous line then it is a
597#   continuation of that previous line.
598# - But sometimes several statements can appear on the same line if tokens
599#   make it obvious. Eg this is okay:
600#   Eg: 'myFunc = let f x y = x + y in f 3 4'
601#
602# We'll only worry about top level statements for this part. But we still
603# need to know
604# - when a top level statement begins; and
605# - when two sequential tokens are part of the same top level statement.
606#   They may be separated by a combination of whitespace, comments, and
607#   newlines, but if there is a newline then that will always be followed
608#   by an indent.
609#
610# When considering how one token relates to the next in top level statements
611# we should only need three kinds of "join"s:
612# - Where we need whitespace, such as 'import MyModule', but that space
613#   may occur over multiple lines. If it's over multiple lines, the
614#   second token needs to be somewhat in from the first column of text.
615#   We'll call this _1_ - ie at least one space.
616# - Where we don't need whitespace, such as 'f = 3', but that space
617#   may occur over multiple lines. If it's over multiple lines then again
618#   the second token needs to be somewhat in from the first column of text.
619#   We'll call this _0_ - ie possibly zero space.
620# - When we've got an end of statement, and the next token is some
621#   meaningful code (not a comment) and starts in the first column of text.
622#   Then that next token is the start of the next top level statement.
623#   We'll call this TLSS, for top level statement separator.
624#
625# We can define _1_ as
626# - The longest possible sequence of whitespace, delimited comments,
627#   newlines, and line comments, as long as it ends with a whitespace
628#   or a delimited comment, because then it won't be in the first column.
629#
630# We can define _0_ as
631# - _1_ or the empty string.
632#
633# We can define TLSS as
634# - The longest possible sequence of whitespace, delimited comments,
635#   newlines, and line comments, as long as it ends with a newline or EOF
636#   (and there's no more ignorable characters after that).
637#
638# PEG parsing tip: If we want to define a sequence like 'the longest
639# sequence of As, Bs and Cs, as long as it ends with C' we define a short
640# sequence like 'the longest sequence of As and Bs, then a C' and then
641# define 'the longest sequence of those'.
642
643_1_short <-
644    (lineComment / NL)* (WS / delimitedComment)
645
646_1_ <- _1_short+
647
648
649_0_ <- _1_ / ''
650
651TLSS_short <-
652    (WS / lineComment / delimitedComment)* (NL / EOF)
653
654TLSS <-
655    TLSS_short+
656    !(WS / lineComment / delimitedComment)
657
658# An end of statement marks the end of a top level statement, but
659# doesn't consume anything
660
661EOS <- &( TLSS / EOF )
662
663# When considering lines in a let/in block we'll want to look for
664# a newline and an indent. There may be some delimited comments etc
665# in between.
666
667_NL_IND_ <-
668    TLSS_short+ WS+
669
670%%
671#include "elm_post.h"
672