/* * perl6.c -- Perl6 parser. * Author: Dmitri Tikhonov * * This is a very basic Perl 6 parser. It does not know how to: * - skip POD; * - skip multiline comments; * - skip quoted strings; * - generate fully-qualified tags. * * This source code is released for free distribution under the terms of * the GNU General Public License version 2 or (at your option) any later version. */ #include "general.h" /* must always come first */ #include #include #include "debug.h" #include "entry.h" #include "parse.h" #include "read.h" #include "routines.h" #include "selectors.h" #include "vstring.h" enum perl6Kind { K_NONE = -1, K_CLASS, K_GRAMMAR, K_METHOD, K_MODULE, K_PACKAGE, K_ROLE, K_RULE, K_SUBMETHOD, K_SUBROUTINE, K_TOKEN, }; static kindDefinition perl6Kinds[] = { [K_CLASS] = { true, 'c', "class", "classes" }, [K_GRAMMAR] = { true, 'g', "grammar", "grammars" }, [K_METHOD] = { true, 'm', "method", "methods" }, [K_MODULE] = { true, 'o', "module", "modules" }, [K_PACKAGE] = { true, 'p', "package", "packages" }, [K_ROLE] = { true, 'r', "role", "roles" }, [K_RULE] = { true, 'u', "rule", "rules" }, [K_SUBMETHOD] = { true, 'b', "submethod", "submethods" }, [K_SUBROUTINE] = { true, 's', "subroutine", "subroutines" }, [K_TOKEN] = { true, 't', "token", "tokens" }, }; enum token { T_CLASS, T_GRAMMAR, T_METHOD, T_MODULE, T_MULTI, T_MY, T_OUR, T_PACKAGE, T_PROTO, T_ROLE, T_RULE, T_SUB, T_SUBMETHOD, T_UNIT, T_TOKEN, }; static const enum perl6Kind token2kind[] = { [T_CLASS] = K_CLASS, [T_GRAMMAR] = K_GRAMMAR, [T_METHOD] = K_METHOD, [T_MODULE] = K_MODULE, [T_MULTI] = K_SUBROUTINE, [T_MY] = K_NONE, [T_OUR] = K_NONE, [T_PACKAGE] = K_PACKAGE, [T_PROTO] = K_NONE, [T_ROLE] = K_ROLE, [T_RULE] = K_RULE, [T_SUB] = K_SUBROUTINE, [T_SUBMETHOD] = K_SUBMETHOD, [T_UNIT] = K_NONE, [T_TOKEN] = K_TOKEN, }; #define STRLEN(s) (sizeof(s) - 1) #define STREQN(s, token) (0 == strncmp(s, token, STRLEN(token))) static enum token matchToken (const char *s, int len) { switch (len) { case 2: if (STREQN(s, "my")) return T_MY; break; case 3: switch (s[0]) { case 'o': if (STREQN(s, "our")) return T_OUR; break; case 's': if (STREQN(s, "sub")) return T_SUB; break; } break; case 4: switch (s[1]) { case 'o': if (STREQN(s, "role")) return T_ROLE; break; case 'u': if (STREQN(s, "rule")) return T_RULE; break; case 'n': if (STREQN(s, "unit")) return T_UNIT; break; } break; case 5: switch (s[0]) { case 'c': if (STREQN(s, "class")) return T_CLASS; break; case 'm': if (STREQN(s, "multi")) return T_MULTI; break; case 'p': if (STREQN(s, "proto")) return T_PROTO; break; case 't': if (STREQN(s, "token")) return T_TOKEN; break; } break; case 6: switch (s[1]) { case 'e': if (STREQN(s, "method")) return T_METHOD; break; case 'o': if (STREQN(s, "module")) return T_MODULE; break; } break; case 7: switch (s[0]) { case 'g': if (STREQN(s, "grammar")) return T_GRAMMAR; break; case 'p': if (STREQN(s, "package")) return T_PACKAGE; break; } break; case 9: if (STREQN(s, "submethod")) return T_SUBMETHOD; break; } return -1; } static const int validPerl6Identifier[0x100] = { /* r!perl -e "print qq([(int)'\$_'] = 1,\n)for a..z,A..Z,0..9,':','-','_'"|fmt */ [(int)'a'] = 1, [(int)'b'] = 1, [(int)'c'] = 1, [(int)'d'] = 1, [(int)'e'] = 1, [(int)'f'] = 1, [(int)'g'] = 1, [(int)'h'] = 1, [(int)'i'] = 1, [(int)'j'] = 1, [(int)'k'] = 1, [(int)'l'] = 1, [(int)'m'] = 1, [(int)'n'] = 1, [(int)'o'] = 1, [(int)'p'] = 1, [(int)'q'] = 1, [(int)'r'] = 1, [(int)'s'] = 1, [(int)'t'] = 1, [(int)'u'] = 1, [(int)'v'] = 1, [(int)'w'] = 1, [(int)'x'] = 1, [(int)'y'] = 1, [(int)'z'] = 1, [(int)'A'] = 1, [(int)'B'] = 1, [(int)'C'] = 1, [(int)'D'] = 1, [(int)'E'] = 1, [(int)'F'] = 1, [(int)'G'] = 1, [(int)'H'] = 1, [(int)'I'] = 1, [(int)'J'] = 1, [(int)'K'] = 1, [(int)'L'] = 1, [(int)'M'] = 1, [(int)'N'] = 1, [(int)'O'] = 1, [(int)'P'] = 1, [(int)'Q'] = 1, [(int)'R'] = 1, [(int)'S'] = 1, [(int)'T'] = 1, [(int)'U'] = 1, [(int)'V'] = 1, [(int)'W'] = 1, [(int)'X'] = 1, [(int)'Y'] = 1, [(int)'Z'] = 1, [(int)'0'] = 1, [(int)'1'] = 1, [(int)'2'] = 1, [(int)'3'] = 1, [(int)'4'] = 1, [(int)'5'] = 1, [(int)'6'] = 1, [(int)'7'] = 1, [(int)'8'] = 1, [(int)'9'] = 1, [(int)':'] = 1, [(int)'-'] = 1, [(int)'_'] = 1, }; static const int validMethodPrefix[0x100] = { [(int)'!'] = 1, [(int)'^'] = 1, }; static const int kindMayHaveMethodPrefix = (1 << K_SUBMETHOD) | (1 << K_METHOD) ; /* Trim identifier pointed to by ps, possibly advancing it, and return * the length of the valid portion. If the returned value is zero, the * identifier is invalid. */ static int trimIdentifier (enum perl6Kind kind, const char **ps, int len) { Assert(len > 0); const char *const end = *ps + len; const char *s = *ps; /* Trim the front if possible: */ s += (kindMayHaveMethodPrefix & (1 << kind)) && validMethodPrefix[(int)*s]; /* Record the start of identifier: */ *ps = s; /* Continuous string of valid characters: */ while (s < end && validPerl6Identifier[(int)*s]) ++s; /* sub multi infix:<...> -- we want the "infix" only */ while (s - *ps > 0 && ':' == s[-1]) --s; /* It's ok if this is zero: */ return s - *ps; } struct p6Ctx { enum token tokens[128 /* unlikely to need more than this */]; unsigned int n_tokens; vString *name; const char *line; /* Saved from readLineFromInputFile() */ }; static void makeTag (struct p6Ctx *ctx, int kind, const char *name, int len) { tagEntryInfo entry; vStringNCopyS(ctx->name, name, len); initTagEntry(&entry, vStringValue(ctx->name), kind); makeTagEntry(&entry); } static void possiblyMakeTag (struct p6Ctx *ctx, const char *s, int len) { Assert(ctx->n_tokens > 0); enum perl6Kind kind = token2kind[ ctx->tokens[ctx->n_tokens - 1] ]; if (K_NONE != kind && perl6Kinds[kind].enabled && (len = trimIdentifier(kind, &s, len)) > 0) makeTag(ctx, kind, s, len); } static void initP6Ctx (struct p6Ctx *ctx) { ctx->n_tokens = 0; ctx->name = vStringNew(); ctx->line = NULL; } static void deinitP6Ctx (struct p6Ctx *ctx) { vStringDelete(ctx->name); } /* Read next contiguous sequence of non-whitespace characters, store * the address in `ptok', and return its length. Return value of zero * means EOF. * * TODO: Currently, POD and multi-line comments are not handled. */ static int getNonSpaceStr (struct p6Ctx *ctx, const char **ptok) { const char *s = ctx->line; if (!s) { next_line: s = (const char *) readLineFromInputFile(); if (!s) return 0; /* EOF */ } while (*s && isspace(*s)) /* Skip whitespace */ ++s; if ('#' == *s) goto next_line; int non_white_len = strcspn(s, ",; \t"); if (non_white_len) { ctx->line = s + non_white_len; /* Save state */ *ptok = s; return non_white_len; } else goto next_line; } static void findPerl6Tags (void) { struct p6Ctx ctx; #define RESET_TOKENS() do { ctx.n_tokens = 0; } while (0) #define PUSH_TOKEN(_t_) do { \ if (ctx.n_tokens < ARRAY_SIZE(ctx.tokens)) { \ ctx.tokens[ ctx.n_tokens ] = _t_; \ ++ctx.n_tokens; \ } else { \ Assert(!"Token stack overflown: this is quite odd"); \ RESET_TOKENS(); \ } \ } while (0) initP6Ctx(&ctx); const char *s; int len; while ((len = getNonSpaceStr(&ctx, &s)) > 0) { enum token token = matchToken(s, len); if ((int) token >= 0) { PUSH_TOKEN(token); } else if (ctx.n_tokens > 0) { possiblyMakeTag(&ctx, s, len); RESET_TOKENS(); } } deinitP6Ctx(&ctx); } parserDefinition * Perl6Parser (void) { static const char *const extensions[] = { "p6", "pm6", "pm", "pl6", "t6", "raku", "rakumod", "rakutest", NULL }; static selectLanguage selectors [] = { selectByPickingPerlVersion, NULL }; parserDefinition* def = parserNew("Perl6"); def->kindTable = perl6Kinds; def->kindCount = ARRAY_SIZE(perl6Kinds); def->extensions = extensions; def->parser = findPerl6Tags; def->selectLanguage = selectors; return def; }