xref: /Universal-ctags/parsers/rust.c (revision 648cbe273d9454a0556e871c5406db83f8968aa3)
1 /*
2 *
3 *   This source code is released for free distribution under the terms of the
4 *   GNU General Public License version 2 or (at your option) any later version.
5 *
6 *   This module contains functions for generating tags for Rust files.
7 */
8 
9 /*
10 *   INCLUDE FILES
11 */
12 #include "general.h"	/* must always come first */
13 
14 #include <string.h>
15 
16 #include "keyword.h"
17 #include "parse.h"
18 #include "entry.h"
19 #include "options.h"
20 #include "read.h"
21 #include "routines.h"
22 #include "vstring.h"
23 
24 /*
25 *   MACROS
26 */
/* Upper bound on the number of characters advanceAndStoreChar() keeps in
 * lexerState::token_str; characters beyond it are still read, but dropped. */
#define MAX_STRING_LENGTH 256
28 
29 /*
30 *   DATA DECLARATIONS
31 */
32 
/* Tag kinds produced by this parser. Every enumerator except K_NONE is used
 * as an index into rustKinds[], so both lists must stay in the same order.
 * K_NONE means "no kind": addTag() emits nothing for it and treats it as
 * "no parent scope" when passed as parent_kind. */
typedef enum {
	K_MOD,		/* mod */
	K_STRUCT,	/* struct */
	K_TRAIT,	/* trait */
	K_IMPL,		/* impl */
	K_FN,		/* fn outside a trait/impl */
	K_ENUM,		/* enum */
	K_TYPE,		/* type alias */
	K_STATIC,	/* static */
	K_MACRO,	/* macro_rules! */
	K_FIELD,	/* struct field */
	K_VARIANT,	/* enum variant */
	K_METHOD,	/* fn inside a trait/impl */
	K_CONST,	/* const */
	K_NONE
} RustKind;
49 
50 static kindDefinition rustKinds[] = {
51 	{true, 'n', "module", "module"},
52 	{true, 's', "struct", "structural type"},
53 	{true, 'i', "interface", "trait interface"},
54 	{true, 'c', "implementation", "implementation"},
55 	{true, 'f', "function", "Function"},
56 	{true, 'g', "enum", "Enum"},
57 	{true, 't', "typedef", "Type Alias"},
58 	{true, 'v', "variable", "Global variable"},
59 	{true, 'M', "macro", "Macro Definition"},
60 	{true, 'm', "field", "A struct field"},
61 	{true, 'e', "enumerator", "An enum variant"},
62 	{true, 'P', "method", "A method"},
63 	{true, 'C', "constant", "A constant"},
64 };
65 
/* Token codes stored in lexerState::cur_token. A token that is not listed
 * here is represented by the character value that started it (see the last
 * branch of advanceToken()). */
typedef enum {
	TOKEN_WHITESPACE,	/* run of blanks and/or comments */
	TOKEN_STRING,		/* string, raw string, char literal or lifetime tick */
	TOKEN_IDENT,		/* identifier or keyword */
	TOKEN_LSHIFT,		/* << */
	TOKEN_RSHIFT,		/* >> */
	TOKEN_RARROW,		/* -> */
	TOKEN_EOF		/* end of input */
} tokenType;
75 
76 typedef struct {
77 	/* Characters */
78 	int cur_c;
79 	int next_c;
80 
81 	/* Tokens */
82 	int cur_token;
83 	vString* token_str;
84 	unsigned long line;
85 	MIOPos pos;
86 } lexerState;
87 
88 /*
89 *   FUNCTION PROTOTYPES
90 */
91 
92 static void parseBlock (lexerState *lexer, bool delim, int kind, vString *scope);
93 
94 /*
95 *   FUNCTION DEFINITIONS
96 */
97 
98 /* Resets the scope string to the old length */
resetScope(vString * scope,size_t old_len)99 static void resetScope (vString *scope, size_t old_len)
100 {
101 	vStringTruncate (scope, old_len);
102 }
103 
104 /* Adds a name to the end of the scope string */
addToScope(vString * scope,vString * name)105 static void addToScope (vString *scope, vString *name)
106 {
107 	if (vStringLength(scope) > 0)
108 		vStringCatS(scope, "::");
109 	vStringCat(scope, name);
110 }
111 
112 /* Write the lexer's current token to string, taking care of special tokens */
writeCurTokenToStr(lexerState * lexer,vString * out_str)113 static void writeCurTokenToStr (lexerState *lexer, vString *out_str)
114 {
115 	switch (lexer->cur_token)
116 	{
117 		case TOKEN_IDENT:
118 			vStringCat(out_str, lexer->token_str);
119 			break;
120 		case TOKEN_STRING:
121 			vStringCat(out_str, lexer->token_str);
122 			break;
123 		case TOKEN_WHITESPACE:
124 			vStringPut(out_str, ' ');
125 			break;
126 		case TOKEN_LSHIFT:
127 			vStringCatS(out_str, "<<");
128 			break;
129 		case TOKEN_RSHIFT:
130 			vStringCatS(out_str, ">>");
131 			break;
132 		case TOKEN_RARROW:
133 			vStringCatS(out_str, "->");
134 			break;
135 		default:
136 			vStringPut(out_str, (char) lexer->cur_token);
137 	}
138 }
139 
140 /* Reads a character from the file */
advanceChar(lexerState * lexer)141 static void advanceChar (lexerState *lexer)
142 {
143 	lexer->cur_c = lexer->next_c;
144 	lexer->next_c = getcFromInputFile();
145 }
146 
147 /* Reads N characters from the file */
advanceNChar(lexerState * lexer,int n)148 static void advanceNChar (lexerState *lexer, int n)
149 {
150 	while (n--)
151 		advanceChar(lexer);
152 }
153 
154 /* Store the current character in lexerState::token_str if there is space
155  * (set by MAX_STRING_LENGTH), and then read the next character from the file */
advanceAndStoreChar(lexerState * lexer)156 static void advanceAndStoreChar (lexerState *lexer)
157 {
158 	if (vStringLength(lexer->token_str) < MAX_STRING_LENGTH)
159 		vStringPut(lexer->token_str, (char) lexer->cur_c);
160 	advanceChar(lexer);
161 }
162 
/* True for the four characters the lexer treats as blank space: space,
 * horizontal tab, carriage return and line feed. */
static bool isWhitespace (int c)
{
	switch (c)
	{
		case ' ':
		case '\t':
		case '\r':
		case '\n':
			return true;
		default:
			return false;
	}
}
167 
/* True when c lies in the 7-bit ASCII range. */
static bool isAscii (int c)
{
	return (c >= 0) && (c < 0x80);
}

/* True when c may begin an identifier: an ASCII letter, '_', or any
 * non-ASCII byte. This isn't quite right for Unicode identifiers, since every
 * byte >= 0x80 is accepted. EOF is rejected explicitly; it is not ASCII
 * either and would otherwise be mistaken for a non-ASCII byte. */
static bool isIdentifierStart (int c)
{
	if (c == EOF)
		return false;
	if (!isAscii(c))
		return true;
	return isalpha(c) || c == '_';
}

/* True when c may continue an identifier: an ASCII letter or digit, '_', or
 * any non-ASCII byte. This isn't quite right for Unicode identifiers. EOF is
 * rejected explicitly for the same reason as in isIdentifierStart(). */
static bool isIdentifierContinue (int c)
{
	if (c == EOF)
		return false;
	if (!isAscii(c))
		return true;
	return isalnum(c) || c == '_';
}
184 
/* Consume characters until the current one is no longer whitespace. */
static void scanWhitespace (lexerState *lexer)
{
	for (;;)
	{
		if (!isWhitespace(lexer->cur_c))
			return;
		advanceChar(lexer);
	}
}
190 
191 /* Normal line comments start with two /'s and continue until the next \n
192  * (potentially after a \r). Additionally, a shebang in the beginning of the
193  * file also counts as a line comment as long as it is not this sequence: #![ .
194  * Block comments start with / followed by a * and end with a * followed by a /.
195  * Unlike in C/C++ they nest. */
scanComments(lexerState * lexer)196 static void scanComments (lexerState *lexer)
197 {
198 	/* // */
199 	if (lexer->next_c == '/')
200 	{
201 		advanceNChar(lexer, 2);
202 		while (lexer->cur_c != EOF && lexer->cur_c != '\n')
203 			advanceChar(lexer);
204 	}
205 	/* #! */
206 	else if (lexer->next_c == '!')
207 	{
208 		advanceNChar(lexer, 2);
209 		/* If it is exactly #![ then it is not a comment, but an attribute */
210 		if (lexer->cur_c == '[')
211 			return;
212 		while (lexer->cur_c != EOF && lexer->cur_c != '\n')
213 			advanceChar(lexer);
214 	}
215 	/* block comment */
216 	else if (lexer->next_c == '*')
217 	{
218 		int level = 1;
219 		advanceNChar(lexer, 2);
220 		while (lexer->cur_c != EOF && level > 0)
221 		{
222 			if (lexer->cur_c == '*' && lexer->next_c == '/')
223 			{
224 				level--;
225 				advanceNChar(lexer, 2);
226 			}
227 			else if (lexer->cur_c == '/' && lexer->next_c == '*')
228 			{
229 				level++;
230 				advanceNChar(lexer, 2);
231 			}
232 			else
233 			{
234 				advanceChar(lexer);
235 			}
236 		}
237 	}
238 }
239 
/* Collect an identifier into lexerState::token_str. The current character is
 * stored unconditionally (the caller has already checked that it can start
 * an identifier); following characters are stored for as long as they can
 * continue one. */
static void scanIdentifier (lexerState *lexer)
{
	vStringClear(lexer->token_str);
	advanceAndStoreChar(lexer);
	while (lexer->cur_c != EOF && isIdentifierContinue(lexer->cur_c))
		advanceAndStoreChar(lexer);
}
248 
249 /* Double-quoted strings, we only care about the \" escape. These
250  * last past the end of the line, so be careful not too store too much
251  * of them (see MAX_STRING_LENGTH). The only place we look at their
252  * contents is in the function definitions, and there the valid strings are
253  * things like "C" and "Rust" */
scanString(lexerState * lexer)254 static void scanString (lexerState *lexer)
255 {
256 	vStringClear(lexer->token_str);
257 	advanceAndStoreChar(lexer);
258 	while (lexer->cur_c != EOF && lexer->cur_c != '"')
259 	{
260 		if (lexer->cur_c == '\\' && lexer->next_c == '"')
261 			advanceAndStoreChar(lexer);
262 		advanceAndStoreChar(lexer);
263 	}
264 	advanceAndStoreChar(lexer);
265 }
266 
267 /* Raw strings look like this: r"" or r##""## where the number of
268  * hashes must match */
scanRawString(lexerState * lexer)269 static void scanRawString (lexerState *lexer)
270 {
271 	size_t num_initial_hashes = 0;
272 	vStringClear(lexer->token_str);
273 	advanceAndStoreChar(lexer);
274 	/* Count how many leading hashes there are */
275 	while (lexer->cur_c == '#')
276 	{
277 		num_initial_hashes++;
278 		advanceAndStoreChar(lexer);
279 	}
280 	if (lexer->cur_c != '"')
281 		return;
282 	advanceAndStoreChar(lexer);
283 	while (lexer->cur_c != EOF)
284 	{
285 		/* Count how many trailing hashes there are. If the number is equal or more
286 		 * than the number of leading hashes, break. */
287 		if (lexer->cur_c == '"')
288 		{
289 			size_t num_trailing_hashes = 0;
290 			advanceAndStoreChar(lexer);
291 			while (lexer->cur_c == '#' && num_trailing_hashes < num_initial_hashes)
292 			{
293 				num_trailing_hashes++;
294 
295 				advanceAndStoreChar(lexer);
296 			}
297 			if (num_trailing_hashes == num_initial_hashes)
298 				break;
299 		}
300 		else
301 		{
302 			advanceAndStoreChar(lexer);
303 		}
304 	}
305 }
306 
307 /* This deals with character literals: 'n', '\n', '\uFFFF'; and lifetimes:
308  * 'lifetime. We'll use this approximate regexp for the literals:
309  * \' \\ [^']+ \' or \' [^'] \' or \' \\ \' \'. Either way, we'll treat this
310  * token as a string, so it gets preserved as is for function signatures with
311  * lifetimes. */
scanCharacterOrLifetime(lexerState * lexer)312 static void scanCharacterOrLifetime (lexerState *lexer)
313 {
314 	vStringClear(lexer->token_str);
315 	advanceAndStoreChar(lexer);
316 
317 	if (lexer->cur_c == '\\')
318 	{
319 		advanceAndStoreChar(lexer);
320 		/* The \' \\ \' \' (literally '\'') case */
321 		if (lexer->cur_c == '\'' && lexer->next_c == '\'')
322 		{
323 			advanceAndStoreChar(lexer);
324 			advanceAndStoreChar(lexer);
325 		}
326 		/* The \' \\ [^']+ \' case */
327 		else
328 		{
329 			while (lexer->cur_c != EOF && lexer->cur_c != '\'')
330 				advanceAndStoreChar(lexer);
331 		}
332 	}
333 	/* The \' [^'] \' case */
334 	else if (lexer->cur_c != '\'' && lexer->next_c == '\'')
335 	{
336 		advanceAndStoreChar(lexer);
337 		advanceAndStoreChar(lexer);
338 	}
339 	/* Otherwise it is malformed, or a lifetime */
340 }
341 
342 /* Advances the parser one token, optionally skipping whitespace
343  * (otherwise it is concatenated and returned as a single whitespace token).
344  * Whitespace is needed to properly render function signatures. Unrecognized
345  * token starts are stored literally, e.g. token may equal to a character '#'. */
advanceToken(lexerState * lexer,bool skip_whitspace)346 static int advanceToken (lexerState *lexer, bool skip_whitspace)
347 {
348 	bool have_whitespace = false;
349 	lexer->line = getInputLineNumber();
350 	lexer->pos = getInputFilePosition();
351 	while (lexer->cur_c != EOF)
352 	{
353 		if (isWhitespace(lexer->cur_c))
354 		{
355 			scanWhitespace(lexer);
356 			have_whitespace = true;
357 		}
358 		else if (lexer->cur_c == '/' && (lexer->next_c == '/' || lexer->next_c == '*'))
359 		{
360 			scanComments(lexer);
361 			have_whitespace = true;
362 		}
363 		else
364 		{
365 			if (have_whitespace && !skip_whitspace)
366 				return lexer->cur_token = TOKEN_WHITESPACE;
367 			break;
368 		}
369 	}
370 	lexer->line = getInputLineNumber();
371 	lexer->pos = getInputFilePosition();
372 	while (lexer->cur_c != EOF)
373 	{
374 		if (lexer->cur_c == '"')
375 		{
376 			scanString(lexer);
377 			return lexer->cur_token = TOKEN_STRING;
378 		}
379 		else if (lexer->cur_c == 'r' && (lexer->next_c == '#' || lexer->next_c == '"'))
380 		{
381 			scanRawString(lexer);
382 			return lexer->cur_token = TOKEN_STRING;
383 		}
384 		else if (lexer->cur_c == '\'')
385 		{
386 			scanCharacterOrLifetime(lexer);
387 			return lexer->cur_token = TOKEN_STRING;
388 		}
389 		else if (isIdentifierStart(lexer->cur_c))
390 		{
391 			scanIdentifier(lexer);
392 			return lexer->cur_token = TOKEN_IDENT;
393 		}
394 		/* These shift tokens aren't too important for tag-generation per se,
395 		 * but they confuse the skipUntil code which tracks the <> pairs. */
396 		else if (lexer->cur_c == '>' && lexer->next_c == '>')
397 		{
398 			advanceNChar(lexer, 2);
399 			return lexer->cur_token = TOKEN_RSHIFT;
400 		}
401 		else if (lexer->cur_c == '<' && lexer->next_c == '<')
402 		{
403 			advanceNChar(lexer, 2);
404 			return lexer->cur_token = TOKEN_LSHIFT;
405 		}
406 		else if (lexer->cur_c == '-' && lexer->next_c == '>')
407 		{
408 			advanceNChar(lexer, 2);
409 			return lexer->cur_token = TOKEN_RARROW;
410 		}
411 		else
412 		{
413 			int c = lexer->cur_c;
414 			advanceChar(lexer);
415 			return lexer->cur_token = c;
416 		}
417 	}
418 	return lexer->cur_token = TOKEN_EOF;
419 }
420 
/* Prepare a lexer for use: allocate the token buffer, fill the two-character
 * window, skip a leading "#!" line if there is one (scanComments() leaves a
 * "#![" attribute alone) and read the first token. */
static void initLexer (lexerState *lexer)
{
	lexer->token_str = vStringNew();
	advanceNChar(lexer, 2);

	if (lexer->cur_c == '#' && lexer->next_c == '!')
		scanComments(lexer);
	advanceToken(lexer, true);
}
430 
/* Release the token buffer allocated by initLexer() and clear the pointer so
 * that it cannot be used again by accident. */
static void deInitLexer (lexerState *lexer)
{
	vStringDelete(lexer->token_str);
	lexer->token_str = NULL;
}
436 
/* Emit one tag entry.
 *
 * ident        name of the tag
 * arg_list     signature text, or NULL when there is none
 * kind         RustKind of the tag; nothing is emitted for K_NONE or for a
 *              kind whose rustKinds[] entry is disabled
 * line, pos    location recorded for the tag
 * scope        name of the enclosing scope
 * parent_kind  RustKind of the enclosing scope; with K_NONE no scope
 *              information is attached to the tag
 */
static void addTag (vString* ident, const char* arg_list, int kind, unsigned long line, MIOPos pos, vString *scope, int parent_kind)
{
	tagEntryInfo tag;

	if (kind == K_NONE)
		return;
	if (! rustKinds[kind].enabled)
		return;

	initTagEntry(&tag, vStringValue(ident), kind);

	tag.lineNumber = line;
	tag.filePosition = pos;
	tag.extensionFields.signature = arg_list;
	/* FIXME: the type of the item is not recorded (map to typeRef?) */

	if (parent_kind != K_NONE)
	{
		tag.extensionFields.scopeKindIndex = parent_kind;
		tag.extensionFields.scopeName = vStringValue(scope);
	}

	makeTagEntry(&tag);
}
456 
457 /* Skip tokens until one of the goal tokens is hit. Escapes when level = 0 if there are no goal tokens.
458  * Keeps track of balanced <>'s, ()'s, []'s, and {}'s and ignores the goal tokens within those pairings */
skipUntil(lexerState * lexer,int goal_tokens[],int num_goal_tokens)459 static void skipUntil (lexerState *lexer, int goal_tokens[], int num_goal_tokens)
460 {
461 	int angle_level = 0;
462 	int paren_level = 0;
463 	int brace_level = 0;
464 	int bracket_level = 0;
465 	while (lexer->cur_token != TOKEN_EOF)
466 	{
467 		if (angle_level == 0 && paren_level == 0 && brace_level == 0
468 		    && bracket_level == 0)
469 		{
470 			int ii = 0;
471 			for(ii = 0; ii < num_goal_tokens; ii++)
472 			{
473 				if (lexer->cur_token == goal_tokens[ii])
474 				{
475 					break;
476 				}
477 			}
478 			if (ii < num_goal_tokens)
479 				break;
480 		}
481 		switch (lexer->cur_token)
482 		{
483 			case '<':
484 				angle_level++;
485 				break;
486 			case '(':
487 				paren_level++;
488 				break;
489 			case '{':
490 				brace_level++;
491 				break;
492 			case '[':
493 				bracket_level++;
494 				break;
495 			case '>':
496 				angle_level--;
497 				break;
498 			case ')':
499 				paren_level--;
500 				break;
501 			case '}':
502 				brace_level--;
503 				break;
504 			case ']':
505 				bracket_level--;
506 				break;
507 			case TOKEN_RSHIFT:
508 				if (angle_level >= 2)
509 					angle_level -= 2;
510 				break;
511 			/* TOKEN_LSHIFT is never interpreted as two <'s in valid Rust code */
512 			default:
513 				break;
514 		}
515 		/* Has to be after the token switch to catch the case when we start with the initial level token */
516 		if (num_goal_tokens == 0 && angle_level == 0 && paren_level == 0 && brace_level == 0
517 		    && bracket_level == 0)
518 			break;
519 		advanceToken(lexer, true);
520 	}
521 }
522 
523 /* Function format:
524  * "fn" <ident>[<type_bounds>] "(" [<args>] ")" ["->" <ret_type>] "{" [<body>] "}"*/
parseFn(lexerState * lexer,vString * scope,int parent_kind)525 static void parseFn (lexerState *lexer, vString *scope, int parent_kind)
526 {
527 	int kind = (parent_kind == K_TRAIT || parent_kind == K_IMPL) ? K_METHOD : K_FN;
528 	vString *name;
529 	vString *arg_list;
530 	unsigned long line;
531 	MIOPos pos;
532 	int paren_level = 0;
533 	int bracket_level = 0;
534 	bool found_paren = false;
535 	bool valid_signature = true;
536 
537 	advanceToken(lexer, true);
538 	if (lexer->cur_token != TOKEN_IDENT)
539 		return;
540 
541 	name = vStringNewCopy(lexer->token_str);
542 	arg_list = vStringNew();
543 
544 	line = lexer->line;
545 	pos = lexer->pos;
546 
547 	advanceToken(lexer, true);
548 
549 	/* HACK: This is a bit coarse as far as what tag entry means by
550 	 * 'arglist'... */
551 	while (lexer->cur_token != '{')
552 	{
553 		if (lexer->cur_token == ';' && bracket_level == 0)
554 		{
555 			break;
556 		}
557 		else if (lexer->cur_token == '}')
558 		{
559 			valid_signature = false;
560 			break;
561 		}
562 		else if (lexer->cur_token == '(')
563 		{
564 			found_paren = true;
565 			paren_level++;
566 		}
567 		else if (lexer->cur_token == ')')
568 		{
569 			paren_level--;
570 			if (paren_level < 0)
571 			{
572 				valid_signature = false;
573 				break;
574 			}
575 		}
576 		else if (lexer->cur_token == '[')
577 		{
578 			bracket_level++;
579 		}
580 		else if (lexer->cur_token == ']')
581 		{
582 			bracket_level--;
583 		}
584 		else if (lexer->cur_token == TOKEN_EOF)
585 		{
586 			valid_signature = false;
587 			break;
588 		}
589 		writeCurTokenToStr(lexer, arg_list);
590 		advanceToken(lexer, false);
591 	}
592 	if (!found_paren || paren_level != 0 || bracket_level != 0)
593 		valid_signature = false;
594 
595 	if (valid_signature)
596 	{
597 		vStringStripTrailing(arg_list);
598 		addTag(name, vStringValue(arg_list), kind, line, pos, scope, parent_kind);
599 		addToScope(scope, name);
600 		parseBlock(lexer, true, kind, scope);
601 	}
602 
603 	vStringDelete(name);
604 	vStringDelete(arg_list);
605 }
606 
607 /* Mod format:
608  * "mod" <ident> "{" [<body>] "}"
609  * "mod" <ident> ";"*/
parseMod(lexerState * lexer,vString * scope,int parent_kind)610 static void parseMod (lexerState *lexer, vString *scope, int parent_kind)
611 {
612 	advanceToken(lexer, true);
613 	if (lexer->cur_token != TOKEN_IDENT)
614 		return;
615 
616 	addTag(lexer->token_str, NULL, K_MOD, lexer->line, lexer->pos, scope, parent_kind);
617 	addToScope(scope, lexer->token_str);
618 
619 	advanceToken(lexer, true);
620 
621 	parseBlock(lexer, true, K_MOD, scope);
622 }
623 
624 /* Trait format:
625  * "trait" <ident> [<type_bounds>] "{" [<body>] "}"
626  */
parseTrait(lexerState * lexer,vString * scope,int parent_kind)627 static void parseTrait (lexerState *lexer, vString *scope, int parent_kind)
628 {
629 	int goal_tokens[] = {'{'};
630 
631 	advanceToken(lexer, true);
632 	if (lexer->cur_token != TOKEN_IDENT)
633 		return;
634 
635 	addTag(lexer->token_str, NULL, K_TRAIT, lexer->line, lexer->pos, scope, parent_kind);
636 	addToScope(scope, lexer->token_str);
637 
638 	advanceToken(lexer, true);
639 
640 	skipUntil(lexer, goal_tokens, 1);
641 
642 	parseBlock(lexer, true, K_TRAIT, scope);
643 }
644 
645 /* Skips type blocks of the form <T:T<T>, ...> */
skipTypeBlock(lexerState * lexer)646 static void skipTypeBlock (lexerState *lexer)
647 {
648 	if (lexer->cur_token == '<')
649 	{
650 		skipUntil(lexer, NULL, 0);
651 		advanceToken(lexer, true);
652 	}
653 }
654 
655 /* Essentially grabs the last ident before 'for', '<' and '{', which
656  * tends to correspond to what we want as the impl tag entry name */
parseQualifiedType(lexerState * lexer,vString * name)657 static void parseQualifiedType (lexerState *lexer, vString* name)
658 {
659 	while (lexer->cur_token != TOKEN_EOF)
660 	{
661 		if (lexer->cur_token == TOKEN_IDENT)
662 		{
663 			if (strcmp(vStringValue(lexer->token_str), "for") == 0
664 				|| strcmp(vStringValue(lexer->token_str), "where") == 0)
665 				break;
666 			vStringClear(name);
667 			vStringCat(name, lexer->token_str);
668 		}
669 		else if (lexer->cur_token == '<' || lexer->cur_token == '{')
670 		{
671 			break;
672 		}
673 		advanceToken(lexer, true);
674 	}
675 	skipTypeBlock(lexer);
676 }
677 
678 /* Impl format:
679  * "impl" [<type_bounds>] <qualified_ident>[<type_bounds>] ["for" <qualified_ident>[<type_bounds>]] "{" [<body>] "}"
680  */
parseImpl(lexerState * lexer,vString * scope,int parent_kind)681 static void parseImpl (lexerState *lexer, vString *scope, int parent_kind)
682 {
683 	unsigned long line;
684 	MIOPos pos;
685 	vString *name;
686 
687 	advanceToken(lexer, true);
688 
689 	line = lexer->line;
690 	pos = lexer->pos;
691 
692 	skipTypeBlock(lexer);
693 
694 	name = vStringNew();
695 
696 	parseQualifiedType(lexer, name);
697 
698 	if (lexer->cur_token == TOKEN_IDENT && strcmp(vStringValue(lexer->token_str), "for") == 0)
699 	{
700 		advanceToken(lexer, true);
701 		parseQualifiedType(lexer, name);
702 	}
703 
704 	addTag(name, NULL, K_IMPL, line, pos, scope, parent_kind);
705 	addToScope(scope, name);
706 
707 	parseBlock(lexer, true, K_IMPL, scope);
708 
709 	vStringDelete(name);
710 }
711 
712 /* Static format:
713  * "static" ["mut"] <ident>
714  */
parseStatic(lexerState * lexer,vString * scope,int parent_kind)715 static void parseStatic (lexerState *lexer, vString *scope, int parent_kind)
716 {
717 	advanceToken(lexer, true);
718 	if (lexer->cur_token != TOKEN_IDENT)
719 		return;
720 	if (strcmp(vStringValue(lexer->token_str), "mut") == 0)
721 	{
722 		advanceToken(lexer, true);
723 	}
724 	if (lexer->cur_token != TOKEN_IDENT)
725 		return;
726 
727 	addTag(lexer->token_str, NULL, K_STATIC, lexer->line, lexer->pos, scope, parent_kind);
728 }
729 
730 /* Const format:
731  * "const" <ident>
732  */
parseConst(lexerState * lexer,vString * scope,int parent_kind)733 static void parseConst (lexerState *lexer, vString *scope, int parent_kind)
734 {
735 	advanceToken(lexer, true);
736 	if (lexer->cur_token != TOKEN_IDENT)
737 		return;
738 
739 	addTag(lexer->token_str, NULL, K_CONST, lexer->line, lexer->pos, scope, parent_kind);
740 }
741 
742 /* Type format:
743  * "type" <ident>
744  */
parseType(lexerState * lexer,vString * scope,int parent_kind)745 static void parseType (lexerState *lexer, vString *scope, int parent_kind)
746 {
747 	advanceToken(lexer, true);
748 	if (lexer->cur_token != TOKEN_IDENT)
749 		return;
750 
751 	addTag(lexer->token_str, NULL, K_TYPE, lexer->line, lexer->pos, scope, parent_kind);
752 }
753 
754 /* Structs and enums are very similar syntax-wise.
755  * It is possible to parse variants a bit more cleverly (e.g. make tuple variants functions and
756  * struct variants structs) but it'd be too clever and the signature wouldn't make too much sense without
757  * the enum's definition (e.g. for the type bounds)
758  *
759  * Struct/Enum format:
760  * "struct/enum" <ident>[<type_bounds>] "{" [<ident>,]+ "}"
761  * "struct/enum" <ident>[<type_bounds>] ";"
762  * */
parseStructOrEnum(lexerState * lexer,vString * scope,int parent_kind,bool is_struct)763 static void parseStructOrEnum (lexerState *lexer, vString *scope, int parent_kind, bool is_struct)
764 {
765 	int kind = is_struct ? K_STRUCT : K_ENUM;
766 	int field_kind = is_struct ? K_FIELD : K_VARIANT;
767 	int goal_tokens1[] = {';', '{'};
768 
769 	advanceToken(lexer, true);
770 	if (lexer->cur_token != TOKEN_IDENT)
771 		return;
772 
773 	addTag(lexer->token_str, NULL, kind, lexer->line, lexer->pos, scope, parent_kind);
774 	addToScope(scope, lexer->token_str);
775 
776 	skipUntil(lexer, goal_tokens1, 2);
777 
778 	if (lexer->cur_token == '{')
779 	{
780 		vString *field_name = vStringNew();
781 		while (lexer->cur_token != TOKEN_EOF)
782 		{
783 			int goal_tokens2[] = {'}', ','};
784 			/* Skip attributes. Format:
785 			 * #[..] or #![..]
786 			 * */
787 			if (lexer->cur_token == '#')
788 			{
789 				advanceToken(lexer, true);
790 				if (lexer->cur_token == '!')
791 					advanceToken(lexer, true);
792 				if (lexer->cur_token == '[')
793 				{
794 					/* It's an attribute, skip it. */
795 					skipUntil(lexer, NULL, 0);
796 				}
797 				else
798 				{
799 					/* Something's up with this field, skip to the next one */
800 					skipUntil(lexer, goal_tokens2, 2);
801 					continue;
802 				}
803 			}
804 			if (lexer->cur_token == TOKEN_IDENT)
805 			{
806 				if (strcmp(vStringValue(lexer->token_str), "priv") == 0
807 				    || strcmp(vStringValue(lexer->token_str), "pub") == 0)
808 				{
809 					advanceToken(lexer, true);
810 
811 					/* Skip thevisibility specificaions.
812 					 * https://doc.rust-lang.org/reference/visibility-and-privacy.html */
813 					if (lexer->cur_token == '(')
814 					{
815 						advanceToken(lexer, true);
816 						skipUntil (lexer, (int []){')'}, 1);
817 						advanceToken(lexer, true);
818 					}
819 
820 					if (lexer->cur_token != TOKEN_IDENT)
821 					{
822 						/* Something's up with this field, skip to the next one */
823 						skipUntil(lexer, goal_tokens2, 2);
824 						continue;
825 					}
826 				}
827 
828 				vStringClear(field_name);
829 				vStringCat(field_name, lexer->token_str);
830 				addTag(field_name, NULL, field_kind, lexer->line, lexer->pos, scope, kind);
831 				skipUntil(lexer, goal_tokens2, 2);
832 			}
833 			if (lexer->cur_token == '}')
834 			{
835 				advanceToken(lexer, true);
836 				break;
837 			}
838 			advanceToken(lexer, true);
839 		}
840 		vStringDelete(field_name);
841 	}
842 }
843 
844 /* Skip the body of the macro. Can't use skipUntil here as
845  * the body of the macro may have arbitrary code which confuses it (e.g.
846  * bitshift operators/function return arrows) */
skipMacro(lexerState * lexer)847 static void skipMacro (lexerState *lexer)
848 {
849 	int level = 0;
850 	int plus_token = 0;
851 	int minus_token = 0;
852 
853 	advanceToken(lexer, true);
854 	switch (lexer->cur_token)
855 	{
856 		case '(':
857 			plus_token = '(';
858 			minus_token = ')';
859 			break;
860 		case '{':
861 			plus_token = '{';
862 			minus_token = '}';
863 			break;
864 		case '[':
865 			plus_token = '[';
866 			minus_token = ']';
867 			break;
868 		default:
869 			return;
870 	}
871 
872 	while (lexer->cur_token != TOKEN_EOF)
873 	{
874 		if (lexer->cur_token == plus_token)
875 			level++;
876 		else if (lexer->cur_token == minus_token)
877 			level--;
878 		if (level == 0)
879 			break;
880 		advanceToken(lexer, true);
881 	}
882 	advanceToken(lexer, true);
883 }
884 
885 /*
886  * Macro rules format:
887  * "macro_rules" "!" <ident> <macro_body>
888  */
parseMacroRules(lexerState * lexer,vString * scope,int parent_kind)889 static void parseMacroRules (lexerState *lexer, vString *scope, int parent_kind)
890 {
891 	advanceToken(lexer, true);
892 
893 	if (lexer->cur_token != '!')
894 		return;
895 
896 	advanceToken(lexer, true);
897 
898 	if (lexer->cur_token != TOKEN_IDENT)
899 		return;
900 
901 	addTag(lexer->token_str, NULL, K_MACRO, lexer->line, lexer->pos, scope, parent_kind);
902 
903 	skipMacro(lexer);
904 }
905 
906 /*
907  * Rust is very liberal with nesting, so this function is used pretty much for any block
908  */
parseBlock(lexerState * lexer,bool delim,int kind,vString * scope)909 static void parseBlock (lexerState *lexer, bool delim, int kind, vString *scope)
910 {
911 	int level = 1;
912 	if (delim)
913 	{
914 		if (lexer->cur_token != '{')
915 			return;
916 		advanceToken(lexer, true);
917 	}
918 	while (lexer->cur_token != TOKEN_EOF)
919 	{
920 		if (lexer->cur_token == TOKEN_IDENT)
921 		{
922 			size_t old_scope_len = vStringLength(scope);
923 			if (strcmp(vStringValue(lexer->token_str), "fn") == 0)
924 			{
925 				parseFn(lexer, scope, kind);
926 			}
927 			else if(strcmp(vStringValue(lexer->token_str), "mod") == 0)
928 			{
929 				parseMod(lexer, scope, kind);
930 			}
931 			else if(strcmp(vStringValue(lexer->token_str), "static") == 0)
932 			{
933 				parseStatic(lexer, scope, kind);
934 			}
935 			else if(strcmp(vStringValue(lexer->token_str), "const") == 0)
936 			{
937 				parseConst(lexer, scope, kind);
938 			}
939 			else if(strcmp(vStringValue(lexer->token_str), "trait") == 0)
940 			{
941 				parseTrait(lexer, scope, kind);
942 			}
943 			else if(strcmp(vStringValue(lexer->token_str), "type") == 0)
944 			{
945 				parseType(lexer, scope, kind);
946 			}
947 			else if(strcmp(vStringValue(lexer->token_str), "impl") == 0)
948 			{
949 				parseImpl(lexer, scope, kind);
950 			}
951 			else if(strcmp(vStringValue(lexer->token_str), "struct") == 0)
952 			{
953 				parseStructOrEnum(lexer, scope, kind, true);
954 			}
955 			else if(strcmp(vStringValue(lexer->token_str), "enum") == 0)
956 			{
957 				parseStructOrEnum(lexer, scope, kind, false);
958 			}
959 			else if(strcmp(vStringValue(lexer->token_str), "macro_rules") == 0)
960 			{
961 				parseMacroRules(lexer, scope, kind);
962 			}
963 			else
964 			{
965 				advanceToken(lexer, true);
966 				if (lexer->cur_token == '!')
967 				{
968 					skipMacro(lexer);
969 				}
970 			}
971 			resetScope(scope, old_scope_len);
972 		}
973 		else if (lexer->cur_token == '{')
974 		{
975 			level++;
976 			advanceToken(lexer, true);
977 		}
978 		else if (lexer->cur_token == '}')
979 		{
980 			level--;
981 			advanceToken(lexer, true);
982 		}
983 		else if (lexer->cur_token == '\'')
984 		{
985 			/* Skip over the 'static lifetime, as it confuses the static parser above */
986 			advanceToken(lexer, true);
987 			if (lexer->cur_token == TOKEN_IDENT && strcmp(vStringValue(lexer->token_str), "static") == 0)
988 				advanceToken(lexer, true);
989 		}
990 		else
991 		{
992 			advanceToken(lexer, true);
993 		}
994 		if (delim && level <= 0)
995 			break;
996 	}
997 }
998 
findRustTags(void)999 static void findRustTags (void)
1000 {
1001 	lexerState lexer = {0};
1002 	vString* scope = vStringNew();
1003 	initLexer(&lexer);
1004 
1005 	parseBlock(&lexer, false, K_NONE, scope);
1006 	vStringDelete(scope);
1007 
1008 	deInitLexer(&lexer);
1009 }
1010 
RustParser(void)1011 extern parserDefinition *RustParser (void)
1012 {
1013 	static const char *const extensions[] = { "rs", NULL };
1014 	parserDefinition *def = parserNew ("Rust");
1015 	def->kindTable = rustKinds;
1016 	def->kindCount = ARRAY_SIZE (rustKinds);
1017 	def->extensions = extensions;
1018 	def->parser = findRustTags;
1019 
1020 	return def;
1021 }
1022