/*
 * Copyright (c) 2014, Colomban Wendling <colomban@geany.org>
 *
 * This source code is released for free distribution under the terms of the
 * GNU General Public License version 2 or (at your option) any later version.
 */
/*
 * This module contains functions for generating tags for JSON files.
 *
 * http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf
 *
 * This implementation is forgiving and allows many constructs that are not
 * actually valid but that don't conflict with the format. This is intended to
 * better support partly broken or unfinished files.
 */
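/*
 * For illustration: given an input like
 *
 *   { "name": "value", "items": [ 1, 2 ] }
 *
 * the parser emits roughly these tags (name, kind, scope):
 *
 *   name    string
 *   items   array
 *   0       number   array:items
 *   1       number   array:items
 *
 * Array elements are tagged by their index, and nested values carry a
 * dot-separated scope built from the names of their parents.
 */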

#include "general.h"

#include <string.h>
#include "debug.h"
#include "entry.h"
#include "keyword.h"
#include "options.h"
#include "parse.h"
#include "read.h"
#include "routines.h"
#include "vstring.h"

#define isIdentChar(c) \
	(isalnum (c) || (c) == '+' || (c) == '-' || (c) == '.')

typedef enum {
	TOKEN_EOF,
	TOKEN_UNDEFINED,
	TOKEN_OPEN_SQUARE,
	TOKEN_CLOSE_SQUARE,
	TOKEN_OPEN_CURLY,
	TOKEN_CLOSE_CURLY,
	TOKEN_COLON,
	TOKEN_COMMA,
	TOKEN_TRUE,
	TOKEN_FALSE,
	TOKEN_NULL,
	TOKEN_NUMBER,
	TOKEN_STRING
} tokenType;

typedef enum {
	TAG_NONE = -1,
	TAG_OBJECT,
	TAG_ARRAY,
	TAG_NUMBER,
	TAG_STRING,
	TAG_BOOLEAN,
	TAG_NULL,
	TAG_COUNT
} jsonKind;

typedef struct {
	tokenType type;
	jsonKind scopeKind;
	vString *string;
	vString *scope;
	unsigned long lineNumber;
	MIOPos filePosition;
} tokenInfo;

typedef enum {
	KEYWORD_true,
	KEYWORD_false,
	KEYWORD_null
} keywordId;

static langType Lang_json;

static kindDefinition JsonKinds [] = {
	{ true, 'o', "object", "objects" },
	{ true, 'a', "array", "arrays" },
	{ true, 'n', "number", "numbers" },
	{ true, 's', "string", "strings" },
	{ true, 'b', "boolean", "booleans" },
	{ true, 'z', "null", "nulls" }
};

static const keywordTable JsonKeywordTable [] = {
	{"true", KEYWORD_true },
	{"false", KEYWORD_false},
	{"null", KEYWORD_null },
};

static tokenInfo *newToken (void)
{
	tokenInfo *const token = xMalloc (1, tokenInfo);

	token->type = TOKEN_UNDEFINED;
	token->scopeKind = TAG_NONE;
	token->string = vStringNew ();
	token->scope = vStringNew ();
	token->lineNumber = getInputLineNumber ();
	token->filePosition = getInputFilePosition ();

	return token;
}

static void deleteToken (tokenInfo *const token)
{
	vStringDelete (token->string);
	vStringDelete (token->scope);
	eFree (token);
}

static void copyToken (tokenInfo *const dest, tokenInfo *const src)
{
	dest->type = src->type;
	dest->scopeKind = src->scopeKind;
	vStringCopy (dest->string, src->string);
	vStringCopy (dest->scope, src->scope);
	dest->lineNumber = src->lineNumber;
	dest->filePosition = src->filePosition;
}

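/* Emits a tag for TOKEN with the given KIND, attaching TOKEN's scope
 * information when present. */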
static void makeJsonTag (tokenInfo *const token, const jsonKind kind)
{
	tagEntryInfo e;

	if (! JsonKinds[kind].enabled)
		return;

	initTagEntry (&e, vStringValue (token->string), kind);

	e.lineNumber = token->lineNumber;
	e.filePosition = token->filePosition;

	if (vStringLength (token->scope) > 0)
	{
		Assert (token->scopeKind > TAG_NONE && token->scopeKind < TAG_COUNT);

		e.extensionFields.scopeKindIndex = token->scopeKind;
		e.extensionFields.scopeName = vStringValue (token->scope);
	}

	makeTagEntry (&e);
}

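/* Maximum bracket nesting depth the parser will follow; beyond this the
 * input is treated as EOF to avoid unbounded recursion in parseValue(). */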
#define DEPTH_LIMIT 512
static int depth_counter;

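/* Reads the next token from the input into TOKEN.  The text of string tokens
 * is stored only when includeStringRepr is true; any other run of identifier
 * characters that is not a keyword is classified as a number. */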
static void readTokenFull (tokenInfo *const token,
						   bool includeStringRepr)
{
	int c;

	if (depth_counter > DEPTH_LIMIT)
	{
		token->type = TOKEN_EOF;

		/* Avoid repeating the warning. */
		if (depth_counter == (DEPTH_LIMIT + 1))
		{
			notice ("Terminate parsing: bracket nesting too deep in %s at %lu",
					getInputFileName(), getInputLineNumber());
			depth_counter++;
		}
		return;
	}

	token->type = TOKEN_UNDEFINED;
	vStringClear (token->string);

	do
		c = getcFromInputFile ();
	while (c == '\t' || c == ' ' || c == '\r' || c == '\n');

	token->lineNumber = getInputLineNumber ();
	token->filePosition = getInputFilePosition ();

	switch (c)
	{
		case EOF: token->type = TOKEN_EOF; break;
		case '[':
			depth_counter++;
			token->type = TOKEN_OPEN_SQUARE; break;
		case ']':
			depth_counter--;
			token->type = TOKEN_CLOSE_SQUARE; break;
		case '{':
			depth_counter++;
			token->type = TOKEN_OPEN_CURLY; break;
		case '}':
			depth_counter--;
			token->type = TOKEN_CLOSE_CURLY; break;
		case ':': token->type = TOKEN_COLON; break;
		case ',': token->type = TOKEN_COMMA; break;

		case '"':
		{
			bool escaped = false;
			token->type = TOKEN_STRING;
			while (true)
			{
				c = getcFromInputFile ();
				/* We don't handle Unicode escapes, but they are safe to pass through. */
				if (escaped)
					escaped = false;
				else if (c == '\\')
					escaped = true;
				else if (c >= 0x00 && c <= 0x1F)
					break; /* break on invalid, unescaped control characters */
				else if (c == '"' || c == EOF)
					break;
				if (includeStringRepr)
					vStringPut (token->string, c);
			}
			break;
		}

		default:
			if (! isIdentChar (c))
				token->type = TOKEN_UNDEFINED;
			else
			{
				do
				{
					vStringPut (token->string, c);
					c = getcFromInputFile ();
				}
				while (c != EOF && isIdentChar (c));
				ungetcToInputFile (c);
				switch (lookupKeyword (vStringValue (token->string), Lang_json))
				{
					case KEYWORD_true:  token->type = TOKEN_TRUE;   break;
					case KEYWORD_false: token->type = TOKEN_FALSE;  break;
					case KEYWORD_null:  token->type = TOKEN_NULL;   break;
					default:            token->type = TOKEN_NUMBER; break;
				}
			}
			break;
	}
}

#define readToken(t) (readTokenFull ((t), false))

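/* Appends PARENT's name to TOKEN's scope (dot-separated) and records the
 * parent's kind; popScope() below restores the previous scope. */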
static void pushScope (tokenInfo *const token,
					   const tokenInfo *const parent,
					   const jsonKind parentKind)
{
	if (vStringLength (token->scope) > 0)
		vStringPut (token->scope, '.');
	vStringCat (token->scope, parent->string);
	token->scopeKind = parentKind;
}

static void popScope (tokenInfo *const token,
					  const tokenInfo *const parent)
{
	vStringTruncate (token->scope, vStringLength (parent->scope));
	token->scopeKind = parent->scopeKind;
}

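/* Skips tokens until one of the given types (or EOF) is reached, stepping
 * over nested {...} and [...] constructs as opaque blocks. */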
#define skipToOneOf2(token, type1, type2) \
	(skipToOneOf3 (token, type1, type2, TOKEN_EOF /* dummy */))

#define skipTo(token, type) \
	(skipToOneOf3 (token, type, /* dummies */ TOKEN_EOF, TOKEN_EOF))

static void skipToOneOf3 (tokenInfo *const token,
						  const tokenType type1,
						  const tokenType type2,
						  const tokenType type3)
{
	while (token->type != TOKEN_EOF &&
		   token->type != type1 &&
		   token->type != type2 &&
		   token->type != type3)
	{
		readToken (token);
		if (token->type == TOKEN_OPEN_CURLY)
		{
			skipTo (token, TOKEN_CLOSE_CURLY);
			readToken (token);
		}
		else if (token->type == TOKEN_OPEN_SQUARE)
		{
			skipTo (token, TOKEN_CLOSE_SQUARE);
			readToken (token);
		}
	}
}

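/* Maps the first token of a value to the tag kind used for the member or
 * element holding that value; unrecognized tokens fall back to the null kind. */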
static jsonKind tokenToKind (const tokenType type)
{
	switch (type)
	{
		case TOKEN_OPEN_CURLY:  return TAG_OBJECT;
		case TOKEN_OPEN_SQUARE: return TAG_ARRAY;
		case TOKEN_STRING:      return TAG_STRING;
		case TOKEN_TRUE:
		case TOKEN_FALSE:       return TAG_BOOLEAN;
		case TOKEN_NUMBER:      return TAG_NUMBER;
		default:                return TAG_NULL;
	}
}

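/* Recursively parses the value starting at TOKEN, creating tags for object
 * members (named after their key) and array elements (named after their
 * index).  Scalar values produce no tags of their own. */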
static void parseValue (tokenInfo *const token)
{
	if (token->type == TOKEN_OPEN_CURLY)
	{
		tokenInfo *name = newToken ();

		do
		{
			readTokenFull (token, true);
			if (token->type == TOKEN_STRING)
			{
				jsonKind tagKind = TAG_NULL; /* default in case of invalid value */

				copyToken (name, token);

				/* skip any possible garbage before the value */
				skipToOneOf3 (token, TOKEN_CLOSE_CURLY, TOKEN_COLON, TOKEN_COMMA);

				if (token->type == TOKEN_COLON)
				{
					readToken (token);
					tagKind = tokenToKind (token->type);

					pushScope (token, name, tagKind);
					parseValue (token);
					popScope (token, name);
				}

				makeJsonTag (name, tagKind);
			}
			/* skip to the end of the construct */
			skipToOneOf2 (token, TOKEN_CLOSE_CURLY, TOKEN_COMMA);
		}
		while (token->type != TOKEN_EOF &&
			   token->type != TOKEN_CLOSE_CURLY);

		if (token->type == TOKEN_CLOSE_CURLY)
			readToken (token);

		deleteToken (name);
	}
	else if (token->type == TOKEN_OPEN_SQUARE)
	{
		tokenInfo *name = newToken ();
		char buf[32];
		unsigned int nth = 0;

		readToken (token);
		while (token->type != TOKEN_EOF &&
			   token->type != TOKEN_CLOSE_SQUARE)
		{
			jsonKind tagKind;

			tagKind = tokenToKind (token->type);

			copyToken (name, token);
			snprintf (buf, sizeof buf, "%u", nth++);
			vStringCopyS (name->string, buf);

			makeJsonTag (name, tagKind);
			pushScope (token, name, tagKind);
			parseValue (token);
			popScope (token, name);

			/* skip to the end of the construct */
			skipToOneOf2 (token, TOKEN_CLOSE_SQUARE, TOKEN_COMMA);
			if (token->type != TOKEN_CLOSE_SQUARE)
				readToken (token);
		}

		if (token->type == TOKEN_CLOSE_SQUARE)
			readToken (token);

		deleteToken (name);
	}
}

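/* Parser entry point: reads and parses top-level values until end of input. */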
static void findJsonTags (void)
{
	tokenInfo *const token = newToken ();

	depth_counter = 0;

	/* We allow multiple top-level elements, although that's not actually valid
	 * JSON.  An interesting side effect of this is that we also accept a
	 * leading Unicode BOM -- harmless for us, even though many JSON parsers
	 * will choke on it. */
	do
	{
		readToken (token);
		parseValue (token);
	}
	while (token->type != TOKEN_EOF);

	deleteToken (token);
}

static void initialize (const langType language)
{
	Lang_json = language;
}

/* Create parser definition structure */
extern parserDefinition* JsonParser (void)
{
	static const char *const extensions [] = { "json", NULL };
	parserDefinition *const def = parserNew ("JSON");
	def->extensions = extensions;
	def->kindTable = JsonKinds;
	def->kindCount = ARRAY_SIZE (JsonKinds);
	def->parser = findJsonTags;
	def->initialize = initialize;
	def->keywordTable = JsonKeywordTable;
	def->keywordCount = ARRAY_SIZE (JsonKeywordTable);
	def->allowNullTag = true;

	return def;
}