xref: /Universal-ctags/parsers/html.c (revision ae9f095608a8efa2b27574ac9ec8dd2465ac849c)
1 /*
2 *   Copyright (c) 2016, Jiri Techet
3 *
4 *   This source code is released for free distribution under the terms of the
5 *   GNU General Public License version 2 or (at your option) any later version.
6 *
7 *   This module contains functions for generating tags for HTML language
8 *   files.
9 */
10 
11 #include "general.h"
12 
13 #include <string.h>
14 #include <ctype.h>
15 
16 #include "entry.h"
17 #include "parse.h"
18 #include "read.h"
19 #include "routines.h"
20 #include "keyword.h"
21 #include "promise.h"
22 
/* Upper bound on element nesting.  readTag() only descends into an element's
 * content while the current depth is below this value, so malformed input
 * with a huge number of unclosed tags cannot exhaust the stack through
 * recursion. */
#define MAX_DEPTH 1000
27 
28 
/* Tag kinds emitted by this parser.  The enumerators are listed in the same
 * order as the entries of the HtmlKinds table. */
typedef enum {
	K_ANCHOR,		/* <a name=...> */
	K_CLASS,		/* names found in class="..." (reference) */
	K_TITLE,		/* <title> text */
	K_HEADING1,		/* <h1> text */
	K_HEADING2,		/* <h2> text */
	K_HEADING3,		/* <h3> text */
	K_STYELSHEET,	/* <link rel="stylesheet" href=...> (reference);
					 * the spelling is what readTag() refers to */
	K_ID,			/* id="..." */
	K_SCRIPT,		/* <script src=...> (reference) */
} htmlKind;
40 
41 
/* Role indices for the reference-only kinds.  Each enum has a single role
 * whose position matches the only entry of the corresponding role table
 * (ClassRoles, ScriptRoles, StylesheetRoles). */

/* Roles of K_CLASS. */
typedef enum {
	CLASS_KIND_ATTRIBUTE_ROLE,
} ClassRole;

/* Roles of K_SCRIPT. */
typedef enum {
	SCRIPT_KIND_EXTERNAL_FILE_ROLE,
} ScriptRole;

/* Roles of K_STYELSHEET. */
typedef enum {
	STYLESHEET_KIND_EXTERNAL_FILE_ROLE,
} StylesheetRole;
53 
54 static roleDefinition ClassRoles [] = {
55 	{ true, "attribute", "assigned as attributes" },
56 };
57 
58 static roleDefinition ScriptRoles [] = {
59 	{ true, "extFile", "referenced as external files" },
60 };
61 
62 static roleDefinition StylesheetRoles [] = {
63 	{ true, "extFile", "referenced as external files" },
64 };
65 
66 static kindDefinition HtmlKinds [] = {
67 	{ true, 'a', "anchor",		"named anchors" },
68 	{ true, 'c', "class",		"classes",
69 	  .referenceOnly = true, ATTACH_ROLES (ClassRoles)},
70 	{ true, 't', "title",		"titles" },
71 	{ true, 'h', "heading1",	"H1 headings" },
72 	{ true, 'i', "heading2",	"H2 headings" },
73 	{ true, 'j', "heading3",	"H3 headings" },
74 	{ true, 'C', "stylesheet",	"stylesheets",
75 	  .referenceOnly = true, ATTACH_ROLES (StylesheetRoles)},
76 	{ true, 'I', "id",			"identifiers" },
77 	{ true, 'J', "script",		"scripts",
78 	  .referenceOnly = true, ATTACH_ROLES (ScriptRoles)},
79 };
80 
/* Keyword identifiers for the tag and attribute names this parser cares
 * about.
 *
 * Two contiguous ranges are relied upon by readTag():
 *   - KEYWORD_heading_start .. KEYWORD_heading_end ("title" to "h3") marks
 *     elements whose text is collected; their order must not be changed.
 *   - KEYWORD_area .. KEYWORD_wbr is tested as the "void element" range.
 *     Note that the attribute keywords class, href, id, rel and src are
 *     sorted alphabetically into that same range.
 */
typedef enum {
	/* heading-like elements */
	KEYWORD_heading_start,
	KEYWORD_title = KEYWORD_heading_start,
	KEYWORD_h1,
	KEYWORD_h2,
	KEYWORD_h3,
	KEYWORD_heading_end = KEYWORD_h3,

	/* other elements and attributes handled specially */
	KEYWORD_a,
	KEYWORD_script,
	KEYWORD_style,
	KEYWORD_name,

	/* void elements (plus the attribute names noted above) */
	KEYWORD_area,
	KEYWORD_base,
	KEYWORD_br,
	KEYWORD_class,
	KEYWORD_col,
	KEYWORD_command,
	KEYWORD_embed,
	KEYWORD_hr,
	KEYWORD_href,
	KEYWORD_id,
	KEYWORD_img,
	KEYWORD_input,
	KEYWORD_keygen,
	KEYWORD_link,
	KEYWORD_meta,
	KEYWORD_param,
	KEYWORD_rel,
	KEYWORD_source,
	KEYWORD_src,
	KEYWORD_track,
	KEYWORD_wbr
} keywordId;
120 
121 static const keywordTable HtmlKeywordTable[] = {
122 	{"title", KEYWORD_title},
123 	{"h1", KEYWORD_h1},
124 	{"h2", KEYWORD_h2},
125 	{"h3", KEYWORD_h3},
126 	{"a", KEYWORD_a},
127 	{"script", KEYWORD_script},
128 	{"style", KEYWORD_style},
129 	{"name", KEYWORD_name},
130 
131 	/* void elements */
132 	{"area", KEYWORD_area},
133 	{"base", KEYWORD_base},
134 	{"br", KEYWORD_br},
135 	{"class", KEYWORD_class},
136 	{"col", KEYWORD_col},
137 	{"command", KEYWORD_command},
138 	{"embed", KEYWORD_embed},
139 	{"hr", KEYWORD_hr},
140 	{"href", KEYWORD_href},
141 	{"id", KEYWORD_id},
142 	{"img", KEYWORD_img},
143 	{"input", KEYWORD_input},
144 	{"keygen", KEYWORD_keygen},
145 	{"link", KEYWORD_link},
146 	{"meta", KEYWORD_meta},
147 	{"param", KEYWORD_param},
148 	{"rel", KEYWORD_rel},
149 	{"source", KEYWORD_source},
150 	{"src", KEYWORD_src},
151 	{"track", KEYWORD_track},
152 	{"wbr", KEYWORD_wbr},
153 };
154 
/* Lexical token classes produced by readToken() and readTokenText(). */
typedef enum {
	TOKEN_EOF,			/* end of input */
	TOKEN_NAME,			/* tag and attribute names */
	TOKEN_STRING,		/* single- or double-quoted attribute value */
	TOKEN_TEXT,			/* character data between tags */
	TOKEN_TAG_START,	/* <  */
	TOKEN_TAG_START2,	/* </ */
	TOKEN_TAG_END,		/* >  */
	TOKEN_TAG_END2,		/* /> */
	TOKEN_EQUAL,		/* =  */
	TOKEN_COMMENT,		/* <!-- ... --> */
	TOKEN_OTHER			/* anything else, e.g. "<!" or "<?" */
} tokenType;

#ifdef DEBUG
/* Printable names of the token types, indexed by tokenType (debug builds
 * only). */
const char *tokenTypes[] = {
#define E(X) [TOKEN_##X] = #X
	E(EOF),
	E(NAME),
	E(STRING),
	E(TEXT),
	E(TAG_START),
	E(TAG_START2),
	E(TAG_END),
	E(TAG_END2),
	E(EQUAL),
	E(COMMENT),
	E(OTHER),
#undef E
};
#endif
186 
187 typedef struct {
188 	tokenType type;
189 	vString *string;
190 } tokenInfo;
191 
192 
193 static int Lang_html;
194 
195 
196 static void readTag (tokenInfo *token, vString *text, int depth);
197 
#ifdef DEBUG
#if 0
/* Disabled debugging aid: prints the token's type name and text together
 * with two free-form context labels to stderr.  A NULL extra_context is
 * shown as "_". */
static void dumpToken (tokenInfo *token, const char *context, const char* extra_context)
{
	const char *const suffix = extra_context ? extra_context : "_";

	fprintf (stderr, "[%7s] %-20s@%s.%s\n",
			 tokenTypes[token->type], vStringValue(token->string),
			 context, suffix);
}
#endif
#endif
208 
/*
 * Consumes character data up to, but not including, the next '<'.
 *
 * token:       receives TOKEN_TEXT when a '<' was found (the '<' is pushed
 *              back to the input) or TOKEN_EOF when the input ended first.
 * collectText: when true, the characters read are stored in token->string
 *              with every whitespace character turned into a blank and runs
 *              of blanks collapsed to one; when false the text is discarded.
 */
static void readTokenText (tokenInfo *const token, bool collectText)
{
	int previous = 'X';  /* any non-blank seed, so a leading blank is kept */

	vStringClear (token->string);

	for (;;)
	{
		const int ch = getcFromInputFile ();

		if (ch == EOF)
		{
			token->type = TOKEN_EOF;
			return;
		}

		if (ch == '<')
		{
			ungetcToInputFile (ch);
			token->type = TOKEN_TEXT;
			return;
		}

		if (collectText)
		{
			const int normalized = isspace (ch) ? ' ' : ch;

			/* drop a blank that directly follows a stored blank */
			if (normalized != ' ' || previous != ' ')
			{
				vStringPut (token->string, normalized);
				previous = normalized;
			}
		}
	}
}
246 
/*
 * Reads the next markup token, skipping leading whitespace.
 *
 * token:        receives the token type; token->string is cleared first and
 *               filled only for TOKEN_STRING (the value without its quotes)
 *               and TOKEN_NAME (lower-cased).
 * skipComments: when true, "<!-- ... -->" comments are silently skipped and
 *               the token after them is returned; when false a comment is
 *               reported as TOKEN_COMMENT.
 *
 * "<!" that does not start a comment and "<?" are reported as TOKEN_OTHER,
 * as is a '/' that is not followed by '>'.
 */
static void readToken (tokenInfo *const token, bool skipComments)
{
	int ch;

	vStringClear (token->string);

	/* This loop only repeats after a skipped comment; everything else that
	 * starts with '<' returns from inside it. */
	for (;;)
	{
		int next;

		do
			ch = getcFromInputFile ();
		while (isspace (ch));

		if (ch != '<')
			break;

		next = getcFromInputFile ();

		if (next == '/')
		{
			token->type = TOKEN_TAG_START2;
			return;
		}

		if (next == '?')
		{
			token->type = TOKEN_OTHER;
			return;
		}

		if (next != '!')
		{
			ungetcToInputFile (next);
			token->type = TOKEN_TAG_START;
			return;
		}

		/* "<!" seen: a comment needs two dashes right after it */
		next = getcFromInputFile ();
		if (next == '-')
		{
			next = getcFromInputFile ();
			if (next == '-')
			{
				/* Slide a three character window over the input until it
				 * holds "-->" or the input ends. */
				int older = ' ';
				int prev = ' ';
				int cur = ' ';

				do
				{
					older = prev;
					prev = cur;
					cur = getcFromInputFile ();
				}
				while (cur != EOF && (older != '-' || prev != '-' || cur != '>'));

				if (!skipComments)
				{
					token->type = TOKEN_COMMENT;
					return;
				}
				continue;
			}
		}

		/* not a comment: give back the last character that was read */
		ungetcToInputFile (next);
		token->type = TOKEN_OTHER;
		return;
	}

	switch (ch)
	{
		case EOF:
			token->type = TOKEN_EOF;
			break;

		case '>':
			token->type = TOKEN_TAG_END;
			break;

		case '=':
			token->type = TOKEN_EQUAL;
			break;

		case '/':
		{
			const int next = getcFromInputFile ();

			if (next == '>')
				token->type = TOKEN_TAG_END2;
			else
			{
				ungetcToInputFile (next);
				token->type = TOKEN_OTHER;
			}
			break;
		}

		case '"':
		case '\'':
		{
			/* quoted value: copy up to the matching quote or end of input */
			const int quote = ch;

			for (ch = getcFromInputFile ();
				 ch != EOF && ch != quote;
				 ch = getcFromInputFile ())
				vStringPut (token->string, ch);

			token->type = TOKEN_STRING;
			break;
		}

		default:
		{
			/* name: runs until whitespace, a markup character, a quote or
			 * the end of input; stored lower-cased */
			do
			{
				vStringPut (token->string, tolower (ch));
				ch = getcFromInputFile ();
			}
			while (!(isspace (ch) || ch == '<' || ch == '>' || ch == '/' ||
					 ch == '=' || ch == '\'' || ch == '"' || ch == EOF));

			if (ch != EOF)
				ungetcToInputFile (ch);
			token->type = TOKEN_NAME;
			break;
		}
	}
}
360 
/*
 * Appends appendedText to text, doing nothing when text is NULL or
 * appendedText is empty.  If text ends with a blank and appendedText starts
 * with one, the trailing whitespace of text is stripped first so the joint
 * carries a single blank.
 */
static void appendText (vString *text, vString *appendedText)
{
	if (text == NULL || vStringLength (appendedText) == 0)
		return;

	/* appendedText is known to be non-empty from here on */
	if (vStringLength (text) > 0
		&& vStringLast (text) == ' '
		&& vStringChar (appendedText, 0) == ' ')
	{
		vStringStripTrailing (text);
	}

	vStringCat (text, appendedText);
}
373 
/*
 * Reads the content of an element whose start tag has just been consumed.
 *
 * Text runs are appended to text (when it is not NULL), nested start tags
 * are handed to readTag() with depth + 1, and comments are skipped.  The
 * loop stops at the first token that is neither a comment nor a start tag.
 *
 * line, lineOffset: receive the input position taken just before that last
 *                   token was read.
 *
 * Returns true when the stopping token is "</".
 */
static bool readTagContent (tokenInfo *token, vString *text, long *line, long *lineOffset, int depth)
{
	const bool collect = (text != NULL);
	tokenType seen;

	readTokenText (token, collect);
	appendText (text, token->string);

	for (;;)
	{
		*line = getInputLineNumber ();
		*lineOffset = getInputLineOffset ();

		readToken (token, false);
		/* remember the type now: readTag() overwrites the token */
		seen = token->type;

		if (seen == TOKEN_TAG_START)
			readTag (token, text, depth + 1);
		else if (seen != TOKEN_COMMENT)
			break;

		/* pick up the text that follows the nested element or comment */
		readTokenText (token, collect);
		appendText (text, token->string);
	}

	return seen == TOKEN_TAG_START2;
}
399 
/*
 * Skips the body of a <script> element by reading tokens until a "</"
 * token is directly followed by the name "script", or the input ends.
 *
 * line, lineOffset: on success receive the input position taken just before
 *                   the "</" token was read; left untouched otherwise.
 *
 * Returns true when the closing script tag was found, false on end of input.
 */
static bool skipScriptContent (tokenInfo *token, long *line, long *lineOffset)
{
	bool afterCloseStart = false;	/* previous token was "</" */
	long closeLine = 0;				/* position before the latest "</" */
	long closeOffset = 0;

	for (;;)
	{
		const long tokenLine = getInputLineNumber ();
		const long tokenOffset = getInputLineOffset ();

		readToken (token, false);

		if (token->type == TOKEN_TAG_START2)
		{
			afterCloseStart = true;
			closeLine = tokenLine;
			closeOffset = tokenOffset;
			continue;
		}

		if (afterCloseStart
			&& token->type == TOKEN_NAME
			&& lookupKeyword (vStringValue (token->string), Lang_html) == KEYWORD_script)
		{
			*line = closeLine;
			*lineOffset = closeOffset;
			return true;
		}

		afterCloseStart = false;

		if (token->type == TOKEN_EOF)
			return false;
	}
}
439 
makeClassRefTags(const char * classes)440 static void makeClassRefTags (const char *classes)
441 {
442 	vString *klass = vStringNew ();
443 
444 	do
445 	{
446 		if (*classes && !isspace (*classes))
447 			vStringPut (klass, *classes);
448 		else if (!vStringIsEmpty (klass))
449 		{
450 			makeSimpleRefTag (klass, K_CLASS,
451 							  CLASS_KIND_ATTRIBUTE_ROLE);
452 			vStringClear (klass);
453 		}
454 
455 		if (!*classes)
456 			break;
457 
458 		classes++;
459 	} while (1);
460 
461 	vStringDelete (klass);
462 }
463 
/*
 * Reads the "= value" part that may follow an attribute name.
 *
 * Reads one token; if it is '=', reads the following token as well and
 * returns true with that value token left in token.  Otherwise returns false
 * with the non-'=' token left in token, so the caller's loop can still see a
 * tag end or the end of input.
 */
static bool readAttributeValue (tokenInfo *token)
{
	readToken (token, true);
	if (token->type != TOKEN_EQUAL)
		return false;

	readToken (token, true);
	return true;
}

/*
 * Parses one element; the '<' of its start tag has already been consumed.
 *
 * token: scratch token, overwritten while parsing.
 * text:  buffer collecting the text of an enclosing heading-like element, or
 *        NULL when no such element is open.  If it is NULL and this element
 *        is title/h1/h2/h3, a private buffer is created and released here.
 * depth: current nesting depth; content is only descended into while it is
 *        below MAX_DEPTH.
 *
 * While scanning the attributes the function emits:
 *   - class="..."            reference tags for each class name,
 *   - id="..."               an id tag,
 *   - <a name=...>           an anchor tag,
 *   - <script src="...">     a script reference tag,
 *   - <link rel="...stylesheet..." href="..."> a stylesheet reference tag
 *     (rel and href may come in either order).
 * For non-void elements closed with '>' it then processes the content:
 * script bodies are handed over as a "JavaScript" promise, style bodies as a
 * "CSS" promise, and the collected text of title/h1/h2/h3 becomes a tag of
 * the matching kind when the end tag names the same element.
 */
static void readTag (tokenInfo *token, vString *text, int depth)
{
	bool textCreated = false;

	readToken (token, true);
	if (token->type == TOKEN_NAME)
	{
		keywordId startTag;
		bool isHeading;
		bool isVoid;
		vString *stylesheet = NULL;
		bool stylesheet_expectation = false;

		startTag = lookupKeyword (vStringValue (token->string), Lang_html);
		isHeading = (KEYWORD_heading_start <= startTag && startTag <= KEYWORD_heading_end);
		isVoid = (startTag >= KEYWORD_area && startTag <= KEYWORD_wbr);
		if (text == NULL && isHeading)
		{
			text = vStringNew ();
			textCreated = true;
		}

		/* attribute list, up to '>', '/>' or the end of input */
		do
		{
			keywordId attribute = KEYWORD_NONE;

			readToken (token, true);
			if (token->type == TOKEN_NAME)
				attribute = lookupKeyword (vStringValue (token->string), Lang_html);

			if (attribute == KEYWORD_class)
			{
				if (readAttributeValue (token) && token->type == TOKEN_STRING)
					makeClassRefTags (vStringValue (token->string));
			}
			else if (attribute == KEYWORD_id)
			{
				if (readAttributeValue (token) && token->type == TOKEN_STRING)
					makeSimpleTag (token->string, K_ID);
			}
			else if (startTag == KEYWORD_a && attribute == KEYWORD_name)
			{
				/* anchor names are accepted quoted or unquoted */
				if (readAttributeValue (token)
					&& (token->type == TOKEN_STRING || token->type == TOKEN_NAME))
					makeSimpleTag (token->string, K_ANCHOR);
			}
			else if (startTag == KEYWORD_script && attribute == KEYWORD_src)
			{
				if (readAttributeValue (token) && token->type == TOKEN_STRING)
					makeSimpleRefTag (token->string, K_SCRIPT,
									  SCRIPT_KIND_EXTERNAL_FILE_ROLE);
			}
			else if (startTag == KEYWORD_link)
			{
				if (attribute == KEYWORD_rel)
				{
					/* strcmp is not enough:
					 * e.g. <link href="fancy.css"
					 *            rel="alternate stylesheet" title="Fancy"> */
					if (readAttributeValue (token)
						&& token->type == TOKEN_STRING
						&& vStringLength (token->string) >= 10
						&& strstr (vStringValue (token->string), "stylesheet"))
						stylesheet_expectation = true;
				}
				else if (attribute == KEYWORD_href)
				{
					if (readAttributeValue (token) && token->type == TOKEN_STRING)
					{
						if (stylesheet == NULL)
							stylesheet = vStringNewCopy (token->string);
						else
							vStringCopy (stylesheet, token->string);
					}
				}

				/* emit as soon as both rel="...stylesheet..." and a
				 * non-empty href have been seen */
				if (stylesheet_expectation && stylesheet && !vStringIsEmpty (stylesheet))
				{
					makeSimpleRefTag (stylesheet, K_STYELSHEET,
									  STYLESHEET_KIND_EXTERNAL_FILE_ROLE);
					stylesheet_expectation = false;
					vStringClear (stylesheet);
				}
			}
		}
		while (token->type != TOKEN_TAG_END && token->type != TOKEN_TAG_END2 &&
			   token->type != TOKEN_EOF);

		vStringDelete (stylesheet);
		stylesheet = NULL;

		if (!isVoid && token->type == TOKEN_TAG_END && depth < MAX_DEPTH)
		{
			long startSourceLineNumber = getSourceLineNumber ();
			long startLineNumber = getInputLineNumber ();
			long startLineOffset = getInputLineOffset ();
			long endLineNumber;
			long endLineOffset;
			bool tag_start2;

			if (startTag == KEYWORD_script)
			{
				/* script bodies are not HTML: skip to "</script" and let the
				 * JavaScript parser handle the region in between */
				bool script = skipScriptContent (token, &endLineNumber, &endLineOffset);
				if (script)
					makePromise ("JavaScript", startLineNumber, startLineOffset,
								 endLineNumber, endLineOffset, startSourceLineNumber);
				readToken (token, true);
				goto out;
			}

			tag_start2 = readTagContent (token, text, &endLineNumber, &endLineOffset, depth);
			if (tag_start2)
			{
				/* name of the end tag */
				readToken (token, true);
				if (isHeading && textCreated && vStringLength (text) > 0)
				{
					keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
					if (startTag == endTag)
					{
						htmlKind headingKind;

						/* One exclusive chain: without the "else" after the
						 * title test, <title> fell through to the final
						 * branch and was tagged as an H3 heading. */
						if (startTag == KEYWORD_title)
							headingKind = K_TITLE;
						else if (startTag == KEYWORD_h1)
							headingKind = K_HEADING1;
						else if (startTag == KEYWORD_h2)
							headingKind = K_HEADING2;
						else
							headingKind = K_HEADING3;

						vStringStripLeading (text);
						vStringStripTrailing (text);
						makeSimpleTag (text, headingKind);
					}
				}
				else if (startTag == KEYWORD_style)
				{
					keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
					if (startTag == endTag)
						makePromise ("CSS", startLineNumber, startLineOffset,
									 endLineNumber, endLineOffset, startSourceLineNumber);
				}

				/* the '>' that closes the end tag */
				readToken (token, true);
			}
		}
	}

 out:
	/* only release the buffer if this call created it */
	if (textCreated)
		vStringDelete (text);
}
644 
findHtmlTags(void)645 static void findHtmlTags (void)
646 {
647 	tokenInfo token;
648 
649 	token.string = vStringNew ();
650 
651 	do
652 	{
653 		readToken (&token, true);
654 		if (token.type == TOKEN_TAG_START)
655 			readTag (&token, NULL, 0);
656 	}
657 	while (token.type != TOKEN_EOF);
658 
659 	vStringDelete (token.string);
660 }
661 
initialize(const langType language)662 static void initialize (const langType language)
663 {
664 	Lang_html = language;
665 }
666 
667 /* parser definition */
HtmlParser(void)668 extern parserDefinition* HtmlParser (void)
669 {
670 	static const char *const extensions [] = { "htm", "html", NULL };
671 	parserDefinition* def = parserNew ("HTML");
672 	def->kindTable        = HtmlKinds;
673 	def->kindCount    = ARRAY_SIZE (HtmlKinds);
674 	def->extensions   = extensions;
675 	def->parser       = findHtmlTags;
676 	def->initialize   = initialize;
677 	def->keywordTable = HtmlKeywordTable;
678 	def->keywordCount = ARRAY_SIZE (HtmlKeywordTable);
679 	return def;
680 }
681