xref: /Universal-ctags/parsers/perl6.c (revision af24e4f7ca6d97cd3d2b09b20847e5b0bf239e81)
1 /*
2  * perl6.c -- Perl6 parser.
3  * Author: Dmitri Tikhonov <dmitri@cpan.org>
4  *
5  * This is a very basic Perl 6 parser.  It does not know how to:
6  *   - skip POD;
7  *   - skip multiline comments;
8  *   - skip quoted strings;
9  *   - generate fully-qualified tags.
10  *
11  * This source code is released for free distribution under the terms of
12  * the GNU General Public License version 2 or (at your option) any later version.
13  */
14 
15 #include "general.h"    /* must always come first */
16 
17 #include <stdio.h>
18 #include <string.h>
19 
20 #include "debug.h"
21 #include "entry.h"
22 #include "parse.h"
23 #include "read.h"
24 #include "routines.h"
25 #include "selectors.h"
26 #include "vstring.h"
27 
28 enum perl6Kind {
29     K_NONE = -1,
30     K_CLASS,
31     K_GRAMMAR,
32     K_METHOD,
33     K_MODULE,
34     K_PACKAGE,
35     K_ROLE,
36     K_RULE,
37     K_SUBMETHOD,
38     K_SUBROUTINE,
39     K_TOKEN,
40 };
41 
42 static kindDefinition perl6Kinds[] = {
43     [K_CLASS]       = { true,  'c', "class",      "classes" },
44     [K_GRAMMAR]     = { true,  'g', "grammar",    "grammars" },
45     [K_METHOD]      = { true,  'm', "method",     "methods" },
46     [K_MODULE]      = { true,  'o', "module",     "modules" },
47     [K_PACKAGE]     = { true,  'p', "package",    "packages" },
48     [K_ROLE]        = { true,  'r', "role",       "roles" },
49     [K_RULE]        = { true,  'u', "rule",       "rules" },
50     [K_SUBMETHOD]   = { true,  'b', "submethod",  "submethods" },
51     [K_SUBROUTINE]  = { true,  's', "subroutine", "subroutines" },
52     [K_TOKEN]       = { true,  't', "token",      "tokens" },
53 };
54 
55 enum token {
56     T_CLASS,
57     T_GRAMMAR,
58     T_METHOD,
59     T_MODULE,
60     T_MULTI,
61     T_MY,
62     T_OUR,
63     T_PACKAGE,
64     T_PROTO,
65     T_ROLE,
66     T_RULE,
67     T_SUB,
68     T_SUBMETHOD,
69     T_UNIT,
70     T_TOKEN,
71 };
72 
73 static const enum perl6Kind token2kind[] = {
74     [T_CLASS]       = K_CLASS,
75     [T_GRAMMAR]     = K_GRAMMAR,
76     [T_METHOD]      = K_METHOD,
77     [T_MODULE]      = K_MODULE,
78     [T_MULTI]       = K_SUBROUTINE,
79     [T_MY]          = K_NONE,
80     [T_OUR]         = K_NONE,
81     [T_PACKAGE]     = K_PACKAGE,
82     [T_PROTO]       = K_NONE,
83     [T_ROLE]        = K_ROLE,
84     [T_RULE]        = K_RULE,
85     [T_SUB]         = K_SUBROUTINE,
86     [T_SUBMETHOD]   = K_SUBMETHOD,
87     [T_UNIT]        = K_NONE,
88     [T_TOKEN]       = K_TOKEN,
89 };
90 
91 #define STRLEN(s) (sizeof(s) - 1)
92 #define STREQN(s, token) (0 == strncmp(s, token, STRLEN(token)))
93 
94 static enum token
matchToken(const char * s,int len)95 matchToken (const char *s, int len)
96 {
97     switch (len) {
98         case 2:
99             if (STREQN(s, "my"))                return T_MY;
100             break;
101         case 3:
102             switch (s[0]) {
103                 case 'o':
104                     if (STREQN(s, "our"))       return T_OUR;
105                     break;
106                 case 's':
107                     if (STREQN(s, "sub"))       return T_SUB;
108                     break;
109             }
110             break;
111         case 4:
112             switch (s[1]) {
113                 case 'o':
114                     if (STREQN(s, "role"))      return T_ROLE;
115                     break;
116                 case 'u':
117                     if (STREQN(s, "rule"))      return T_RULE;
118                     break;
119                 case 'n':
120                     if (STREQN(s, "unit"))      return T_UNIT;
121                     break;
122             }
123             break;
124         case 5:
125             switch (s[0]) {
126                 case 'c':
127                     if (STREQN(s, "class"))     return T_CLASS;
128                     break;
129                 case 'm':
130                     if (STREQN(s, "multi"))     return T_MULTI;
131                     break;
132                 case 'p':
133                     if (STREQN(s, "proto"))     return T_PROTO;
134                     break;
135                 case 't':
136                     if (STREQN(s, "token"))     return T_TOKEN;
137                     break;
138             }
139             break;
140         case 6:
141             switch (s[1]) {
142                 case 'e':
143                     if (STREQN(s, "method"))    return T_METHOD;
144                     break;
145                 case 'o':
146                     if (STREQN(s, "module"))    return T_MODULE;
147                     break;
148             }
149             break;
150         case 7:
151             switch (s[0]) {
152                 case 'g':
153                     if (STREQN(s, "grammar"))   return T_GRAMMAR;
154                     break;
155                 case 'p':
156                     if (STREQN(s, "package"))   return T_PACKAGE;
157                     break;
158             }
159             break;
160         case 9:
161             if (STREQN(s, "submethod"))         return T_SUBMETHOD;
162             break;
163     }
164     return -1;
165 }
166 
167 static const int validPerl6Identifier[0x100] = {
168 /* r!perl -e "print qq([(int)'\$_'] = 1,\n)for a..z,A..Z,0..9,':','-','_'"|fmt
169  */
170     [(int)'a'] = 1, [(int)'b'] = 1, [(int)'c'] = 1, [(int)'d'] = 1,
171     [(int)'e'] = 1, [(int)'f'] = 1, [(int)'g'] = 1, [(int)'h'] = 1,
172     [(int)'i'] = 1, [(int)'j'] = 1, [(int)'k'] = 1, [(int)'l'] = 1,
173     [(int)'m'] = 1, [(int)'n'] = 1, [(int)'o'] = 1, [(int)'p'] = 1,
174     [(int)'q'] = 1, [(int)'r'] = 1, [(int)'s'] = 1, [(int)'t'] = 1,
175     [(int)'u'] = 1, [(int)'v'] = 1, [(int)'w'] = 1, [(int)'x'] = 1,
176     [(int)'y'] = 1, [(int)'z'] = 1, [(int)'A'] = 1, [(int)'B'] = 1,
177     [(int)'C'] = 1, [(int)'D'] = 1, [(int)'E'] = 1, [(int)'F'] = 1,
178     [(int)'G'] = 1, [(int)'H'] = 1, [(int)'I'] = 1, [(int)'J'] = 1,
179     [(int)'K'] = 1, [(int)'L'] = 1, [(int)'M'] = 1, [(int)'N'] = 1,
180     [(int)'O'] = 1, [(int)'P'] = 1, [(int)'Q'] = 1, [(int)'R'] = 1,
181     [(int)'S'] = 1, [(int)'T'] = 1, [(int)'U'] = 1, [(int)'V'] = 1,
182     [(int)'W'] = 1, [(int)'X'] = 1, [(int)'Y'] = 1, [(int)'Z'] = 1,
183     [(int)'0'] = 1, [(int)'1'] = 1, [(int)'2'] = 1, [(int)'3'] = 1,
184     [(int)'4'] = 1, [(int)'5'] = 1, [(int)'6'] = 1, [(int)'7'] = 1,
185     [(int)'8'] = 1, [(int)'9'] = 1, [(int)':'] = 1, [(int)'-'] = 1,
186     [(int)'_'] = 1,
187 };
188 
189 static const int validMethodPrefix[0x100] = {
190     [(int)'!'] = 1, [(int)'^'] = 1,
191 };
192 
193 static const int kindMayHaveMethodPrefix = (1 << K_SUBMETHOD) |
194                                            (1 << K_METHOD)    ;
195 
196 /* Trim identifier pointed to by ps, possibly advancing it, and return
197  * the length of the valid portion.  If the returned value is zero, the
198  * identifier is invalid.
199  */
200 static int
trimIdentifier(enum perl6Kind kind,const char ** ps,int len)201 trimIdentifier (enum perl6Kind kind, const char **ps, int len)
202 {
203     Assert(len > 0);
204     const char *const end = *ps + len;
205     const char *s = *ps;
206     /* Trim the front if possible: */
207     s += (kindMayHaveMethodPrefix & (1 << kind)) &&
208          validMethodPrefix[(int)*s];
209     /* Record the start of identifier: */
210     *ps = s;
211     /* Continuous string of valid characters: */
212     while (s < end && validPerl6Identifier[(int)*s])
213         ++s;
214     /* sub multi infix:<...>        -- we want the "infix" only */
215     while (s - *ps > 0 && ':' == s[-1])
216         --s;
217     /* It's ok if this is zero: */
218     return s - *ps;
219 }
220 
221 struct p6Ctx {
222     enum token  tokens[128 /* unlikely to need more than this */];
223     unsigned int n_tokens;
224     vString    *name;
225     const char *line;      /* Saved from readLineFromInputFile() */
226 };
227 
228 static void
makeTag(struct p6Ctx * ctx,int kind,const char * name,int len)229 makeTag (struct p6Ctx *ctx, int kind, const char *name, int len)
230 {
231     tagEntryInfo entry;
232     vStringNCopyS(ctx->name, name, len);
233     initTagEntry(&entry, vStringValue(ctx->name), kind);
234     makeTagEntry(&entry);
235 }
236 
237 static void
possiblyMakeTag(struct p6Ctx * ctx,const char * s,int len)238 possiblyMakeTag (struct p6Ctx *ctx, const char *s, int len)
239 {
240     Assert(ctx->n_tokens > 0);
241     enum perl6Kind kind = token2kind[ ctx->tokens[ctx->n_tokens - 1] ];
242     if (K_NONE != kind && perl6Kinds[kind].enabled
243                        && (len = trimIdentifier(kind, &s, len)) > 0)
244         makeTag(ctx, kind, s, len);
245 }
246 
247 static void
initP6Ctx(struct p6Ctx * ctx)248 initP6Ctx (struct p6Ctx *ctx)
249 {
250     ctx->n_tokens = 0;
251     ctx->name = vStringNew();
252     ctx->line = NULL;
253 }
254 
255 static void
deinitP6Ctx(struct p6Ctx * ctx)256 deinitP6Ctx (struct p6Ctx *ctx)
257 {
258     vStringDelete(ctx->name);
259 }
260 
261 /* Read next contiguous sequence of non-whitespace characters, store
262  * the address in `ptok', and return its length.  Return value of zero
263  * means EOF.
264  *
265  * TODO: Currently, POD and multi-line comments are not handled.
266  */
267 static int
getNonSpaceStr(struct p6Ctx * ctx,const char ** ptok)268 getNonSpaceStr (struct p6Ctx *ctx, const char **ptok)
269 {
270     const char *s = ctx->line;
271     if (!s) {
272 next_line:
273         s = (const char *) readLineFromInputFile();
274         if (!s)
275             return 0;                           /* EOF */
276     }
277     while (*s && isspace(*s))                   /* Skip whitespace */
278         ++s;
279     if ('#' == *s)
280         goto next_line;
281     int non_white_len = strcspn(s, ",; \t");
282     if (non_white_len) {
283         ctx->line = s + non_white_len;          /* Save state */
284         *ptok = s;
285         return non_white_len;
286     } else
287         goto next_line;
288 }
289 
290 static void
findPerl6Tags(void)291 findPerl6Tags (void)
292 {
293     struct p6Ctx ctx;
294 
295 #define RESET_TOKENS() do { ctx.n_tokens = 0; } while (0)
296 
297 #define PUSH_TOKEN(_t_) do {                                            \
298     if (ctx.n_tokens < ARRAY_SIZE(ctx.tokens)) {			\
299         ctx.tokens[ ctx.n_tokens ] = _t_;                               \
300         ++ctx.n_tokens;                                                 \
301     } else {                                                            \
302         Assert(!"Token stack overflown: this is quite odd");            \
303         RESET_TOKENS();                                                 \
304     }                                                                   \
305 } while (0)
306 
307     initP6Ctx(&ctx);
308 
309     const char *s;
310     int len;
311 
312     while ((len = getNonSpaceStr(&ctx, &s)) > 0) {
313         enum token token = matchToken(s, len);
314         if ((int) token >= 0) {
315             PUSH_TOKEN(token);
316         } else if (ctx.n_tokens > 0) {
317             possiblyMakeTag(&ctx, s, len);
318             RESET_TOKENS();
319         }
320     }
321 
322     deinitP6Ctx(&ctx);
323 }
324 
325 parserDefinition *
Perl6Parser(void)326 Perl6Parser (void)
327 {
328     static const char *const extensions[] = { "p6", "pm6", "pm", "pl6",
329 		"t6", "raku", "rakumod", "rakutest", NULL };
330     static selectLanguage selectors [] = { selectByPickingPerlVersion,
331 					   NULL };
332     parserDefinition* def = parserNew("Perl6");
333     def->kindTable      = perl6Kinds;
334     def->kindCount  = ARRAY_SIZE(perl6Kinds);
335     def->extensions = extensions;
336     def->parser     = findPerl6Tags;
337     def->selectLanguage = selectors;
338     return def;
339 }
340