/*
 * Copyright (c) 2014, Colomban Wendling <colomban@geany.org>
 *
 * This source code is released for free distribution under the terms of the
 * GNU General Public License version 2 or (at your option) any later version.
 */
/*
 * This module contains functions for generating tags for JSON files.
 *
 * http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf
 *
 * This implementation is forgiving and allows many constructs that are not
 * actually valid but that don't conflict with the format.  This is intended to
 * better support partly broken or unfinished files.
 */

#include "general.h"

#include <string.h>
#include "debug.h"
#include "entry.h"
#include "keyword.h"
#include "options.h"
#include "parse.h"
#include "read.h"
#include "routines.h"
#include "vstring.h"

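/* Characters accepted in an unquoted literal: isalnum covers the digits and
 * letters of numbers and of the keywords "true", "false" and "null"; '+',
 * '-' and '.' cover signs, decimal points and exponents. */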
#define isIdentChar(c) \
	(isalnum (c) || (c) == '+' || (c) == '-' || (c) == '.')

typedef enum {
	TOKEN_EOF,
	TOKEN_UNDEFINED,
	TOKEN_OPEN_SQUARE,
	TOKEN_CLOSE_SQUARE,
	TOKEN_OPEN_CURLY,
	TOKEN_CLOSE_CURLY,
	TOKEN_COLON,
	TOKEN_COMMA,
	TOKEN_TRUE,
	TOKEN_FALSE,
	TOKEN_NULL,
	TOKEN_NUMBER,
	TOKEN_STRING
} tokenType;

typedef enum {
	TAG_NONE = -1,
	TAG_OBJECT,
	TAG_ARRAY,
	TAG_NUMBER,
	TAG_STRING,
	TAG_BOOLEAN,
	TAG_NULL,
	TAG_COUNT
} jsonKind;

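/* A token records not only its own type and text but also the scope it was
 * read in: `scope` is the '.'-separated path of enclosing object keys and
 * array indices, and `scopeKind` is the kind of the innermost enclosing
 * value. */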
typedef struct {
	tokenType		type;
	jsonKind		scopeKind;
	vString			*string;
	vString			*scope;
	unsigned long	lineNumber;
	MIOPos			filePosition;
} tokenInfo;

typedef enum {
	KEYWORD_true,
	KEYWORD_false,
	KEYWORD_null
} keywordId;

static langType Lang_json;

static kindDefinition JsonKinds [] = {
	{ true,  'o', "object",		"objects"	},
	{ true,  'a', "array",		"arrays"	},
	{ true,  'n', "number",		"numbers"	},
	{ true,  's', "string",		"strings"	},
	{ true,  'b', "boolean",	"booleans"	},
	{ true,  'z', "null",		"nulls"		}
};

static const keywordTable JsonKeywordTable [] = {
	{"true",  KEYWORD_true },
	{"false", KEYWORD_false},
	{"null",  KEYWORD_null },
};

static tokenInfo *newToken (void)
{
	tokenInfo *const token = xMalloc (1, tokenInfo);

	token->type			= TOKEN_UNDEFINED;
	token->scopeKind	= TAG_NONE;
	token->string		= vStringNew ();
	token->scope		= vStringNew ();
	token->lineNumber	= getInputLineNumber ();
	token->filePosition	= getInputFilePosition ();

	return token;
}

static void deleteToken (tokenInfo *const token)
{
	vStringDelete (token->string);
	vStringDelete (token->scope);
	eFree (token);
}

static void copyToken (tokenInfo *const dest, tokenInfo *const src)
{
	dest->type = src->type;
	dest->scopeKind = src->scopeKind;
	vStringCopy (dest->string, src->string);
	vStringCopy (dest->scope, src->scope);
	dest->lineNumber = src->lineNumber;
	dest->filePosition = src->filePosition;
}

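/* Emit a tag of the given kind for `token`, attaching the enclosing scope
 * when the token was read inside an object or array. */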
static void makeJsonTag (tokenInfo *const token, const jsonKind kind)
{
	tagEntryInfo e;

	if (! JsonKinds[kind].enabled)
		return;

	initTagEntry (&e, vStringValue (token->string), kind);

	e.lineNumber	= token->lineNumber;
	e.filePosition	= token->filePosition;

	if (vStringLength (token->scope) > 0)
	{
		Assert (token->scopeKind > TAG_NONE && token->scopeKind < TAG_COUNT);

		e.extensionFields.scopeKindIndex = token->scopeKind;
		e.extensionFields.scopeName = vStringValue (token->scope);
	}

	makeTagEntry (&e);
}

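/* Guard against pathologically deep nesting: once more than DEPTH_LIMIT
 * brackets are open, readTokenFull () reports EOF so that parsing unwinds
 * instead of recursing further. */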
#define DEPTH_LIMIT 512
static int depth_counter;

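/* Read the next token from the input, skipping whitespace.  The text of a
 * string token is stored in token->string only when includeStringRepr is
 * true; the readToken () macro below passes false for callers that do not
 * need it. */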
static void readTokenFull (tokenInfo *const token,
						   bool includeStringRepr)
{
	int c;

	if (depth_counter > DEPTH_LIMIT)
	{
		token->type = TOKEN_EOF;

		/* Do not repeat the warning. */
		if (depth_counter == (DEPTH_LIMIT + 1))
		{
			notice ("Terminating parsing: bracket nesting too deep in %s at line %lu",
					getInputFileName(), getInputLineNumber());
			depth_counter++;
		}
		return;
	}

	token->type = TOKEN_UNDEFINED;
	vStringClear (token->string);

	do
		c = getcFromInputFile ();
	while (c == '\t' || c == ' ' || c == '\r' || c == '\n');

	token->lineNumber   = getInputLineNumber ();
	token->filePosition = getInputFilePosition ();

	switch (c)
	{
		case EOF: token->type = TOKEN_EOF;			break;
		case '[':
			depth_counter++;
			token->type = TOKEN_OPEN_SQUARE;		break;
		case ']':
			depth_counter--;
			token->type = TOKEN_CLOSE_SQUARE;		break;
		case '{':
			depth_counter++;
			token->type = TOKEN_OPEN_CURLY;			break;
		case '}':
			depth_counter--;
			token->type = TOKEN_CLOSE_CURLY;		break;
		case ':': token->type = TOKEN_COLON;		break;
		case ',': token->type = TOKEN_COMMA;		break;

		case '"':
		{
			bool escaped = false;
			token->type = TOKEN_STRING;
			while (true)
			{
				c = getcFromInputFile ();
				/* we don't handle unicode escapes but they are safe */
				if (escaped)
					escaped = false;
				else if (c == '\\')
					escaped = true;
				else if (c >= 0x00 && c <= 0x1F)
					break; /* break on invalid, unescaped, control characters */
				else if (c == '"' || c == EOF)
					break;
				if (includeStringRepr)
					vStringPut (token->string, c);
			}
			break;
		}

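		/* Anything else is an identifier-like run: classify it as one of the
		 * keywords, or fall back to treating it as a number. */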
		default:
			if (! isIdentChar (c))
				token->type = TOKEN_UNDEFINED;
			else
			{
				do
				{
					vStringPut (token->string, c);
					c = getcFromInputFile ();
				}
				while (c != EOF && isIdentChar (c));
				ungetcToInputFile (c);
				switch (lookupKeyword (vStringValue (token->string), Lang_json))
				{
					case KEYWORD_true:	token->type = TOKEN_TRUE;	break;
					case KEYWORD_false:	token->type = TOKEN_FALSE;	break;
					case KEYWORD_null:	token->type = TOKEN_NULL;	break;
					default:			token->type = TOKEN_NUMBER;	break;
				}
			}
			break;
	}
}

#define readToken(t) (readTokenFull ((t), false))

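/* pushScope () appends the parent's name to the '.'-separated scope path of
 * `token`; popScope () restores the previous path by truncating it back to
 * the parent's length. */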
static void pushScope (tokenInfo *const token,
					   const tokenInfo *const parent,
					   const jsonKind parentKind)
{
	if (vStringLength (token->scope) > 0)
		vStringPut (token->scope, '.');
	vStringCat (token->scope, parent->string);
	token->scopeKind = parentKind;
}

static void popScope (tokenInfo *const token,
					  const tokenInfo *const parent)
{
	vStringTruncate (token->scope, vStringLength (parent->scope));
	token->scopeKind = parent->scopeKind;
}

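/* Skip tokens until one of the requested types (or EOF) is found; any nested
 * object or array encountered on the way is skipped as a whole. */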
#define skipToOneOf2(token, type1, type2) \
	(skipToOneOf3 (token, type1, type2, TOKEN_EOF /* dummy */))

#define skipTo(token, type) \
	(skipToOneOf3 (token, type, /* dummies */ TOKEN_EOF, TOKEN_EOF))

static void skipToOneOf3 (tokenInfo *const token,
						  const tokenType type1,
						  const tokenType type2,
						  const tokenType type3)
{
	while (token->type != TOKEN_EOF &&
		   token->type != type1 &&
		   token->type != type2 &&
		   token->type != type3)
	{
		readToken (token);
		if (token->type == TOKEN_OPEN_CURLY)
		{
			skipTo (token, TOKEN_CLOSE_CURLY);
			readToken (token);
		}
		else if (token->type == TOKEN_OPEN_SQUARE)
		{
			skipTo (token, TOKEN_CLOSE_SQUARE);
			readToken (token);
		}
	}
}

static jsonKind tokenToKind (const tokenType type)
{
	switch (type)
	{
		case TOKEN_OPEN_CURLY:	return TAG_OBJECT;
		case TOKEN_OPEN_SQUARE:	return TAG_ARRAY;
		case TOKEN_STRING:		return TAG_STRING;
		case TOKEN_TRUE:
		case TOKEN_FALSE:		return TAG_BOOLEAN;
		case TOKEN_NUMBER:		return TAG_NUMBER;
		default:				return TAG_NULL;
	}
}

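/* Recursively parse the value starting at `token`.  Object keys and array
 * elements become tags, array elements being named after their 0-based
 * index.  For example, the input
 *
 *     { "files": [ { "name": "x" } ] }
 *
 * yields the tags "files" (array), "0" (object, scope "files") and
 * "name" (string, scope "files.0"). */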
static void parseValue (tokenInfo *const token)
{
	if (token->type == TOKEN_OPEN_CURLY)
	{
		tokenInfo *name = newToken ();

		do
		{
			readTokenFull (token, true);
			if (token->type == TOKEN_STRING)
			{
				jsonKind tagKind = TAG_NULL; /* default in case of invalid value */

				copyToken (name, token);

				/* skip any possible garbage before the value */
				skipToOneOf3 (token, TOKEN_CLOSE_CURLY, TOKEN_COLON, TOKEN_COMMA);

				if (token->type == TOKEN_COLON)
				{
					readToken (token);
					tagKind = tokenToKind (token->type);

					pushScope (token, name, tagKind);
					parseValue (token);
					popScope (token, name);
				}

				makeJsonTag (name, tagKind);
			}
			/* skip to the end of the construct */
			skipToOneOf2 (token, TOKEN_CLOSE_CURLY, TOKEN_COMMA);
		}
		while (token->type != TOKEN_EOF &&
			   token->type != TOKEN_CLOSE_CURLY);

		if (token->type == TOKEN_CLOSE_CURLY)
			readToken (token);

		deleteToken (name);
	}
	else if (token->type == TOKEN_OPEN_SQUARE)
	{
		tokenInfo *name = newToken ();
		char buf[32];
		unsigned int nth = 0;

		readToken (token);
		while (token->type != TOKEN_EOF &&
			   token->type != TOKEN_CLOSE_SQUARE)
		{
			jsonKind tagKind;

			tagKind = tokenToKind (token->type);

			copyToken (name, token);
			snprintf (buf, sizeof buf, "%u", nth++);
			vStringCopyS (name->string, buf);

			makeJsonTag (name, tagKind);
			pushScope (token, name, tagKind);
			parseValue (token);
			popScope (token, name);

			/* skip to the end of the construct */
			skipToOneOf2 (token, TOKEN_CLOSE_SQUARE, TOKEN_COMMA);
			if (token->type != TOKEN_CLOSE_SQUARE)
				readToken (token);
		}

		if (token->type == TOKEN_CLOSE_SQUARE)
			readToken (token);

		deleteToken (name);
	}
}

static void findJsonTags (void)
{
	tokenInfo *const token = newToken ();

	depth_counter = 0;

	/* We allow multiple top-level elements, although that is not actually
	 * valid JSON.  An interesting side effect of this is that we also accept
	 * a leading Unicode BOM -- even though that is OK, many JSON parsers
	 * will choke on it. */
	do
	{
		readToken (token);
		parseValue (token);
	}
	while (token->type != TOKEN_EOF);

	deleteToken (token);
}

static void initialize (const langType language)
{
	Lang_json = language;
}

/* Create parser definition structure */
extern parserDefinition* JsonParser (void)
{
	static const char *const extensions [] = { "json", NULL };
	parserDefinition *const def = parserNew ("JSON");
	def->extensions = extensions;
	def->kindTable	= JsonKinds;
	def->kindCount	= ARRAY_SIZE (JsonKinds);
	def->parser		= findJsonTags;
	def->initialize = initialize;
	def->keywordTable = JsonKeywordTable;
	def->keywordCount = ARRAY_SIZE (JsonKeywordTable);
	def->allowNullTag = true;

	return def;
}