1 /*
2 * perl6.c -- Perl6 parser.
3 * Author: Dmitri Tikhonov <dmitri@cpan.org>
4 *
5 * This is a very basic Perl 6 parser. It does not know how to:
6 * - skip POD;
7 * - skip multiline comments;
8 * - skip quoted strings;
9 * - generate fully-qualified tags.
10 *
11 * This source code is released for free distribution under the terms of
12 * the GNU General Public License version 2 or (at your option) any later version.
13 */
14
15 #include "general.h" /* must always come first */
16
17 #include <stdio.h>
18 #include <string.h>
19
20 #include "debug.h"
21 #include "entry.h"
22 #include "parse.h"
23 #include "read.h"
24 #include "routines.h"
25 #include "selectors.h"
26 #include "vstring.h"
27
28 enum perl6Kind {
29 K_NONE = -1,
30 K_CLASS,
31 K_GRAMMAR,
32 K_METHOD,
33 K_MODULE,
34 K_PACKAGE,
35 K_ROLE,
36 K_RULE,
37 K_SUBMETHOD,
38 K_SUBROUTINE,
39 K_TOKEN,
40 };
41
42 static kindDefinition perl6Kinds[] = {
43 [K_CLASS] = { true, 'c', "class", "classes" },
44 [K_GRAMMAR] = { true, 'g', "grammar", "grammars" },
45 [K_METHOD] = { true, 'm', "method", "methods" },
46 [K_MODULE] = { true, 'o', "module", "modules" },
47 [K_PACKAGE] = { true, 'p', "package", "packages" },
48 [K_ROLE] = { true, 'r', "role", "roles" },
49 [K_RULE] = { true, 'u', "rule", "rules" },
50 [K_SUBMETHOD] = { true, 'b', "submethod", "submethods" },
51 [K_SUBROUTINE] = { true, 's', "subroutine", "subroutines" },
52 [K_TOKEN] = { true, 't', "token", "tokens" },
53 };
54
55 enum token {
56 T_CLASS,
57 T_GRAMMAR,
58 T_METHOD,
59 T_MODULE,
60 T_MULTI,
61 T_MY,
62 T_OUR,
63 T_PACKAGE,
64 T_PROTO,
65 T_ROLE,
66 T_RULE,
67 T_SUB,
68 T_SUBMETHOD,
69 T_UNIT,
70 T_TOKEN,
71 };
72
73 static const enum perl6Kind token2kind[] = {
74 [T_CLASS] = K_CLASS,
75 [T_GRAMMAR] = K_GRAMMAR,
76 [T_METHOD] = K_METHOD,
77 [T_MODULE] = K_MODULE,
78 [T_MULTI] = K_SUBROUTINE,
79 [T_MY] = K_NONE,
80 [T_OUR] = K_NONE,
81 [T_PACKAGE] = K_PACKAGE,
82 [T_PROTO] = K_NONE,
83 [T_ROLE] = K_ROLE,
84 [T_RULE] = K_RULE,
85 [T_SUB] = K_SUBROUTINE,
86 [T_SUBMETHOD] = K_SUBMETHOD,
87 [T_UNIT] = K_NONE,
88 [T_TOKEN] = K_TOKEN,
89 };
90
91 #define STRLEN(s) (sizeof(s) - 1)
92 #define STREQN(s, token) (0 == strncmp(s, token, STRLEN(token)))
93
94 static enum token
matchToken(const char * s,int len)95 matchToken (const char *s, int len)
96 {
97 switch (len) {
98 case 2:
99 if (STREQN(s, "my")) return T_MY;
100 break;
101 case 3:
102 switch (s[0]) {
103 case 'o':
104 if (STREQN(s, "our")) return T_OUR;
105 break;
106 case 's':
107 if (STREQN(s, "sub")) return T_SUB;
108 break;
109 }
110 break;
111 case 4:
112 switch (s[1]) {
113 case 'o':
114 if (STREQN(s, "role")) return T_ROLE;
115 break;
116 case 'u':
117 if (STREQN(s, "rule")) return T_RULE;
118 break;
119 case 'n':
120 if (STREQN(s, "unit")) return T_UNIT;
121 break;
122 }
123 break;
124 case 5:
125 switch (s[0]) {
126 case 'c':
127 if (STREQN(s, "class")) return T_CLASS;
128 break;
129 case 'm':
130 if (STREQN(s, "multi")) return T_MULTI;
131 break;
132 case 'p':
133 if (STREQN(s, "proto")) return T_PROTO;
134 break;
135 case 't':
136 if (STREQN(s, "token")) return T_TOKEN;
137 break;
138 }
139 break;
140 case 6:
141 switch (s[1]) {
142 case 'e':
143 if (STREQN(s, "method")) return T_METHOD;
144 break;
145 case 'o':
146 if (STREQN(s, "module")) return T_MODULE;
147 break;
148 }
149 break;
150 case 7:
151 switch (s[0]) {
152 case 'g':
153 if (STREQN(s, "grammar")) return T_GRAMMAR;
154 break;
155 case 'p':
156 if (STREQN(s, "package")) return T_PACKAGE;
157 break;
158 }
159 break;
160 case 9:
161 if (STREQN(s, "submethod")) return T_SUBMETHOD;
162 break;
163 }
164 return -1;
165 }
166
167 static const int validPerl6Identifier[0x100] = {
168 /* r!perl -e "print qq([(int)'\$_'] = 1,\n)for a..z,A..Z,0..9,':','-','_'"|fmt
169 */
170 [(int)'a'] = 1, [(int)'b'] = 1, [(int)'c'] = 1, [(int)'d'] = 1,
171 [(int)'e'] = 1, [(int)'f'] = 1, [(int)'g'] = 1, [(int)'h'] = 1,
172 [(int)'i'] = 1, [(int)'j'] = 1, [(int)'k'] = 1, [(int)'l'] = 1,
173 [(int)'m'] = 1, [(int)'n'] = 1, [(int)'o'] = 1, [(int)'p'] = 1,
174 [(int)'q'] = 1, [(int)'r'] = 1, [(int)'s'] = 1, [(int)'t'] = 1,
175 [(int)'u'] = 1, [(int)'v'] = 1, [(int)'w'] = 1, [(int)'x'] = 1,
176 [(int)'y'] = 1, [(int)'z'] = 1, [(int)'A'] = 1, [(int)'B'] = 1,
177 [(int)'C'] = 1, [(int)'D'] = 1, [(int)'E'] = 1, [(int)'F'] = 1,
178 [(int)'G'] = 1, [(int)'H'] = 1, [(int)'I'] = 1, [(int)'J'] = 1,
179 [(int)'K'] = 1, [(int)'L'] = 1, [(int)'M'] = 1, [(int)'N'] = 1,
180 [(int)'O'] = 1, [(int)'P'] = 1, [(int)'Q'] = 1, [(int)'R'] = 1,
181 [(int)'S'] = 1, [(int)'T'] = 1, [(int)'U'] = 1, [(int)'V'] = 1,
182 [(int)'W'] = 1, [(int)'X'] = 1, [(int)'Y'] = 1, [(int)'Z'] = 1,
183 [(int)'0'] = 1, [(int)'1'] = 1, [(int)'2'] = 1, [(int)'3'] = 1,
184 [(int)'4'] = 1, [(int)'5'] = 1, [(int)'6'] = 1, [(int)'7'] = 1,
185 [(int)'8'] = 1, [(int)'9'] = 1, [(int)':'] = 1, [(int)'-'] = 1,
186 [(int)'_'] = 1,
187 };
188
189 static const int validMethodPrefix[0x100] = {
190 [(int)'!'] = 1, [(int)'^'] = 1,
191 };
192
193 static const int kindMayHaveMethodPrefix = (1 << K_SUBMETHOD) |
194 (1 << K_METHOD) ;
195
196 /* Trim identifier pointed to by ps, possibly advancing it, and return
197 * the length of the valid portion. If the returned value is zero, the
198 * identifier is invalid.
199 */
200 static int
trimIdentifier(enum perl6Kind kind,const char ** ps,int len)201 trimIdentifier (enum perl6Kind kind, const char **ps, int len)
202 {
203 Assert(len > 0);
204 const char *const end = *ps + len;
205 const char *s = *ps;
206 /* Trim the front if possible: */
207 s += (kindMayHaveMethodPrefix & (1 << kind)) &&
208 validMethodPrefix[(int)*s];
209 /* Record the start of identifier: */
210 *ps = s;
211 /* Continuous string of valid characters: */
212 while (s < end && validPerl6Identifier[(int)*s])
213 ++s;
214 /* sub multi infix:<...> -- we want the "infix" only */
215 while (s - *ps > 0 && ':' == s[-1])
216 --s;
217 /* It's ok if this is zero: */
218 return s - *ps;
219 }
220
221 struct p6Ctx {
222 enum token tokens[128 /* unlikely to need more than this */];
223 unsigned int n_tokens;
224 vString *name;
225 const char *line; /* Saved from readLineFromInputFile() */
226 };
227
228 static void
makeTag(struct p6Ctx * ctx,int kind,const char * name,int len)229 makeTag (struct p6Ctx *ctx, int kind, const char *name, int len)
230 {
231 tagEntryInfo entry;
232 vStringNCopyS(ctx->name, name, len);
233 initTagEntry(&entry, vStringValue(ctx->name), kind);
234 makeTagEntry(&entry);
235 }
236
237 static void
possiblyMakeTag(struct p6Ctx * ctx,const char * s,int len)238 possiblyMakeTag (struct p6Ctx *ctx, const char *s, int len)
239 {
240 Assert(ctx->n_tokens > 0);
241 enum perl6Kind kind = token2kind[ ctx->tokens[ctx->n_tokens - 1] ];
242 if (K_NONE != kind && perl6Kinds[kind].enabled
243 && (len = trimIdentifier(kind, &s, len)) > 0)
244 makeTag(ctx, kind, s, len);
245 }
246
247 static void
initP6Ctx(struct p6Ctx * ctx)248 initP6Ctx (struct p6Ctx *ctx)
249 {
250 ctx->n_tokens = 0;
251 ctx->name = vStringNew();
252 ctx->line = NULL;
253 }
254
255 static void
deinitP6Ctx(struct p6Ctx * ctx)256 deinitP6Ctx (struct p6Ctx *ctx)
257 {
258 vStringDelete(ctx->name);
259 }
260
261 /* Read next contiguous sequence of non-whitespace characters, store
262 * the address in `ptok', and return its length. Return value of zero
263 * means EOF.
264 *
265 * TODO: Currently, POD and multi-line comments are not handled.
266 */
267 static int
getNonSpaceStr(struct p6Ctx * ctx,const char ** ptok)268 getNonSpaceStr (struct p6Ctx *ctx, const char **ptok)
269 {
270 const char *s = ctx->line;
271 if (!s) {
272 next_line:
273 s = (const char *) readLineFromInputFile();
274 if (!s)
275 return 0; /* EOF */
276 }
277 while (*s && isspace(*s)) /* Skip whitespace */
278 ++s;
279 if ('#' == *s)
280 goto next_line;
281 int non_white_len = strcspn(s, ",; \t");
282 if (non_white_len) {
283 ctx->line = s + non_white_len; /* Save state */
284 *ptok = s;
285 return non_white_len;
286 } else
287 goto next_line;
288 }
289
290 static void
findPerl6Tags(void)291 findPerl6Tags (void)
292 {
293 struct p6Ctx ctx;
294
295 #define RESET_TOKENS() do { ctx.n_tokens = 0; } while (0)
296
297 #define PUSH_TOKEN(_t_) do { \
298 if (ctx.n_tokens < ARRAY_SIZE(ctx.tokens)) { \
299 ctx.tokens[ ctx.n_tokens ] = _t_; \
300 ++ctx.n_tokens; \
301 } else { \
302 Assert(!"Token stack overflown: this is quite odd"); \
303 RESET_TOKENS(); \
304 } \
305 } while (0)
306
307 initP6Ctx(&ctx);
308
309 const char *s;
310 int len;
311
312 while ((len = getNonSpaceStr(&ctx, &s)) > 0) {
313 enum token token = matchToken(s, len);
314 if ((int) token >= 0) {
315 PUSH_TOKEN(token);
316 } else if (ctx.n_tokens > 0) {
317 possiblyMakeTag(&ctx, s, len);
318 RESET_TOKENS();
319 }
320 }
321
322 deinitP6Ctx(&ctx);
323 }
324
325 parserDefinition *
Perl6Parser(void)326 Perl6Parser (void)
327 {
328 static const char *const extensions[] = { "p6", "pm6", "pm", "pl6",
329 "t6", "raku", "rakumod", "rakutest", NULL };
330 static selectLanguage selectors [] = { selectByPickingPerlVersion,
331 NULL };
332 parserDefinition* def = parserNew("Perl6");
333 def->kindTable = perl6Kinds;
334 def->kindCount = ARRAY_SIZE(perl6Kinds);
335 def->extensions = extensions;
336 def->parser = findPerl6Tags;
337 def->selectLanguage = selectors;
338 return def;
339 }
340