1 /*
2 * Copyright (c) 2016, Jiri Techet
3 *
4 * This source code is released for free distribution under the terms of the
5 * GNU General Public License version 2 or (at your option) any later version.
6 *
7 * This module contains functions for generating tags for HTML language
8 * files.
9 */
10
11 #include "general.h"
12
13 #include <string.h>
14 #include <ctype.h>
15
16 #include "entry.h"
17 #include "parse.h"
18 #include "read.h"
19 #include "routines.h"
20 #include "keyword.h"
21 #include "promise.h"
22
/* Upper bound on element nesting.  readTag() stops descending into element
 * content once this depth is reached, so malformed input consisting of a very
 * long run of unclosed tags cannot exhaust the stack through recursion. */
#define MAX_DEPTH 1000
27
28
/* Tag kinds emitted by this parser.  The enumerators are listed in the same
 * order as the entries of the HtmlKinds table. */
typedef enum {
	K_ANCHOR,		/* <a name="..."> */
	K_CLASS,		/* class="..." attribute values (reference only) */
	K_TITLE,		/* <title> */
	K_HEADING1,		/* <h1> */
	K_HEADING2,		/* <h2> */
	K_HEADING3,		/* <h3> */
	K_STYELSHEET,	/* <link rel="stylesheet" href="..."> (reference only);
					 * the spelling (sic) is what readTag() refers to */
	K_ID,			/* id="..." attribute values */
	K_SCRIPT,		/* <script src="..."> (reference only) */
} htmlKind;
40
41
/* Role of a "class" reference tag; listed in the order of ClassRoles. */
typedef enum {
	CLASS_KIND_ATTRIBUTE_ROLE,
} ClassRole;

/* Role of a "script" reference tag; listed in the order of ScriptRoles. */
typedef enum {
	SCRIPT_KIND_EXTERNAL_FILE_ROLE,
} ScriptRole;

/* Role of a "stylesheet" reference tag; listed in the order of
 * StylesheetRoles. */
typedef enum {
	STYLESHEET_KIND_EXTERNAL_FILE_ROLE,
} StylesheetRole;
53
54 static roleDefinition ClassRoles [] = {
55 { true, "attribute", "assigned as attributes" },
56 };
57
58 static roleDefinition ScriptRoles [] = {
59 { true, "extFile", "referenced as external files" },
60 };
61
62 static roleDefinition StylesheetRoles [] = {
63 { true, "extFile", "referenced as external files" },
64 };
65
66 static kindDefinition HtmlKinds [] = {
67 { true, 'a', "anchor", "named anchors" },
68 { true, 'c', "class", "classes",
69 .referenceOnly = true, ATTACH_ROLES (ClassRoles)},
70 { true, 't', "title", "titles" },
71 { true, 'h', "heading1", "H1 headings" },
72 { true, 'i', "heading2", "H2 headings" },
73 { true, 'j', "heading3", "H3 headings" },
74 { true, 'C', "stylesheet", "stylesheets",
75 .referenceOnly = true, ATTACH_ROLES (StylesheetRoles)},
76 { true, 'I', "id", "identifiers" },
77 { true, 'J', "script", "scripts",
78 .referenceOnly = true, ATTACH_ROLES (ScriptRoles)},
79 };
80
/* Identifiers for the element and attribute names the parser recognizes.
 *
 * readTag() classifies a start tag with two range checks on these values, so
 * the relative order of the enumerators is significant:
 *   - KEYWORD_heading_start .. KEYWORD_heading_end  => heading element
 *   - KEYWORD_area .. KEYWORD_wbr                   => void element
 */
typedef enum {
	/* Heading range: keep "title" through "h3" contiguous and in this
	 * order. */
	KEYWORD_heading_start,
	KEYWORD_title = KEYWORD_heading_start,
	KEYWORD_h1,
	KEYWORD_h2,
	KEYWORD_h3,
	KEYWORD_heading_end = KEYWORD_h3,

	/* Other names that get special treatment. */
	KEYWORD_a,
	KEYWORD_script,
	KEYWORD_style,
	KEYWORD_name,

	/* Void-element range.  NOTE: the attribute names class, href, id, rel
	 * and src are sorted in among the void elements, so an element that
	 * happens to carry one of those names is treated as void as well. */
	KEYWORD_area,
	KEYWORD_base,
	KEYWORD_br,
	KEYWORD_class,		/* attribute name */
	KEYWORD_col,
	KEYWORD_command,
	KEYWORD_embed,
	KEYWORD_hr,
	KEYWORD_href,		/* attribute name */
	KEYWORD_id,			/* attribute name */
	KEYWORD_img,
	KEYWORD_input,
	KEYWORD_keygen,
	KEYWORD_link,
	KEYWORD_meta,
	KEYWORD_param,
	KEYWORD_rel,		/* attribute name */
	KEYWORD_source,
	KEYWORD_src,		/* attribute name */
	KEYWORD_track,
	KEYWORD_wbr
} keywordId;
120
121 static const keywordTable HtmlKeywordTable[] = {
122 {"title", KEYWORD_title},
123 {"h1", KEYWORD_h1},
124 {"h2", KEYWORD_h2},
125 {"h3", KEYWORD_h3},
126 {"a", KEYWORD_a},
127 {"script", KEYWORD_script},
128 {"style", KEYWORD_style},
129 {"name", KEYWORD_name},
130
131 /* void elements */
132 {"area", KEYWORD_area},
133 {"base", KEYWORD_base},
134 {"br", KEYWORD_br},
135 {"class", KEYWORD_class},
136 {"col", KEYWORD_col},
137 {"command", KEYWORD_command},
138 {"embed", KEYWORD_embed},
139 {"hr", KEYWORD_hr},
140 {"href", KEYWORD_href},
141 {"id", KEYWORD_id},
142 {"img", KEYWORD_img},
143 {"input", KEYWORD_input},
144 {"keygen", KEYWORD_keygen},
145 {"link", KEYWORD_link},
146 {"meta", KEYWORD_meta},
147 {"param", KEYWORD_param},
148 {"rel", KEYWORD_rel},
149 {"source", KEYWORD_source},
150 {"src", KEYWORD_src},
151 {"track", KEYWORD_track},
152 {"wbr", KEYWORD_wbr},
153 };
154
/* Lexical token classes produced by readToken() and readTokenText(). */
typedef enum {
	TOKEN_EOF,			/* end of input */
	TOKEN_NAME,			/* tag and attribute names */
	TOKEN_STRING,		/* single- or double-quoted attribute value */
	TOKEN_TEXT,			/* character data up to the next '<' */
	TOKEN_TAG_START,	/* < */
	TOKEN_TAG_START2,	/* </ */
	TOKEN_TAG_END,		/* > */
	TOKEN_TAG_END2,		/* /> */
	TOKEN_EQUAL,		/* = */
	TOKEN_COMMENT,		/* <!-- ... --> */
	TOKEN_OTHER			/* "<?", "<!" not starting a comment, lone '/' */
} tokenType;
168
#ifdef DEBUG
/* Printable names of the tokenType values, indexed by token type.  Only
 * built in DEBUG builds; E(X) yields the entry  [TOKEN_X] = "X". */
const char *tokenTypes[] = {
#define E(X) [TOKEN_##X] = #X
	E(EOF),
	E(NAME),
	E(STRING),
	E(TEXT),
	E(TAG_START),
	E(TAG_START2),
	E(TAG_END),
	E(TAG_END2),
	E(EQUAL),
	E(COMMENT),
	E(OTHER),
#undef E
};
#endif
186
187 typedef struct {
188 tokenType type;
189 vString *string;
190 } tokenInfo;
191
192
193 static int Lang_html;
194
195
196 static void readTag (tokenInfo *token, vString *text, int depth);
197
#ifdef DEBUG
#if 0
/* Debugging aid, currently compiled out: prints the type name and text of
 * `token' to stderr together with two caller-supplied context labels; "_"
 * is printed when `extra_context' is NULL. */
static void dumpToken (tokenInfo *token, const char *context, const char* extra_context)
{
	fprintf (stderr, "[%7s] %-20s@%s.%s\n",
			 tokenTypes[token->type], vStringValue(token->string),
			 context, extra_context? extra_context: "_");
}
#endif
#endif
208
/*
 * Consumes character data up to, but not including, the next '<', or up to
 * the end of input.
 *
 * token->string is cleared first.  When `collectText' is true the characters
 * read are stored in it with every whitespace character turned into a blank
 * and runs of blanks collapsed into one; when false the data is discarded.
 *
 * On return token->type is TOKEN_TEXT if a '<' stopped the scan (the '<' is
 * pushed back for the next reader) or TOKEN_EOF if the input ran out.
 */
static void readTokenText (tokenInfo *const token, bool collectText)
{
	int prev = 'X';	/* any non-blank value, so a leading blank is kept */
	int ch;

	vStringClear (token->string);

	for (;;)
	{
		ch = getcFromInputFile ();

		if (ch == EOF)
		{
			token->type = TOKEN_EOF;
			return;
		}
		if (ch == '<')
		{
			ungetcToInputFile (ch);
			token->type = TOKEN_TEXT;
			return;
		}
		if (!collectText)
			continue;

		if (isspace (ch))
			ch = ' ';
		if (ch == ' ' && prev == ' ')
			continue;	/* second blank in a row: drop it */

		vStringPut (token->string, ch);
		prev = ch;
	}
}
246
/*
 * Reads the next markup token from the input into `token'.
 *
 * Leading whitespace is skipped.  token->string is cleared first and is only
 * filled for TOKEN_STRING (the value without its quotes) and TOKEN_NAME (the
 * name converted to lower case); for all other token types it stays empty.
 *
 * skipComments: when true, "<!-- ... -->" comments are consumed silently and
 * the token that follows is returned instead; when false every comment is
 * reported as a TOKEN_COMMENT.
 */
static void readToken (tokenInfo *const token, bool skipComments)
{
	int ch;

	vStringClear (token->string);

	for (;;)
	{
		do
			ch = getcFromInputFile ();
		while (isspace (ch));

		switch (ch)
		{
			case EOF:
				token->type = TOKEN_EOF;
				return;

			case '>':
				token->type = TOKEN_TAG_END;
				return;

			case '=':
				token->type = TOKEN_EQUAL;
				return;

			case '/':
			{
				int next = getcFromInputFile ();

				if (next == '>')
				{
					token->type = TOKEN_TAG_END2;
					return;
				}
				ungetcToInputFile (next);
				token->type = TOKEN_OTHER;
				return;
			}

			case '"':
			case '\'':
			{
				/* Quoted value: everything up to the matching quote, or up
				 * to the end of input if the quote is never closed. */
				const int quote = ch;

				for (ch = getcFromInputFile ();
					 ch != EOF && ch != quote;
					 ch = getcFromInputFile ())
					vStringPut (token->string, ch);

				token->type = TOKEN_STRING;
				return;
			}

			case '<':
			{
				int next = getcFromInputFile ();

				if (next == '?')
				{
					token->type = TOKEN_OTHER;
					return;
				}
				if (next == '/')
				{
					token->type = TOKEN_TAG_START2;
					return;
				}
				if (next != '!')
				{
					ungetcToInputFile (next);
					token->type = TOKEN_TAG_START;
					return;
				}

				/* "<!" seen: a comment if "--" follows, something else
				 * (e.g. a doctype) otherwise. */
				next = getcFromInputFile ();
				if (next == '-')
				{
					next = getcFromInputFile ();
					if (next == '-')
					{
						/* Slide a three-character window over the input
						 * until it holds "-->".  The two older slots start
						 * out as blanks, so the dashes of the opening
						 * "<!--" never count towards the terminator. */
						int older = ' ';
						int newer = ' ';
						int cur;

						for (;;)
						{
							cur = getcFromInputFile ();
							if (cur == EOF)
								break;
							if (older == '-' && newer == '-' && cur == '>')
								break;
							older = newer;
							newer = cur;
						}

						if (!skipComments)
						{
							token->type = TOKEN_COMMENT;
							return;
						}
						continue;	/* read the token after the comment */
					}
				}
				ungetcToInputFile (next);
				token->type = TOKEN_OTHER;
				return;
			}

			default:
				/* A name: runs until whitespace, one of < > / = ' " or the
				 * end of input.  The terminator is left in the input. */
				do
				{
					vStringPut (token->string, tolower (ch));
					ch = getcFromInputFile ();
				}
				while (! (isspace (ch) || ch == '<' || ch == '>' || ch == '/' ||
						  ch == '=' || ch == '\'' || ch == '"' || ch == EOF));

				if (ch != EOF)
					ungetcToInputFile (ch);
				token->type = TOKEN_NAME;
				return;
		}
	}
}
360
/*
 * Appends `appendedText' to `text'.  Nothing happens when `text' is NULL
 * (the caller is not collecting text) or when there is nothing to append.
 *
 * If `text' ends with a blank and `appendedText' starts with one,
 * vStringStripTrailing() is applied to `text' first so that the two pieces
 * are not joined by a doubled blank.
 */
static void appendText (vString *text, vString *appendedText)
{
	if (text == NULL || vStringLength (appendedText) == 0)
		return;

	if (vStringLength (text) > 0
		&& vStringLast (text) == ' '
		&& vStringChar (appendedText, 0) == ' ')
		vStringStripTrailing (text);

	vStringCat (text, appendedText);
}
373
/*
 * Reads the content of an element: character data, comments and nested
 * elements (handled by readTag() at depth + 1), until some other token
 * shows up.
 *
 * text:        when not NULL, the character data found is appended to it;
 *              nested elements append to the same string.
 * line,
 * lineOffset:  receive the input position sampled just before the last token
 *              was read, i.e. the position where the content ends.
 *
 * Returns true when the token that ended the content is "</".
 */
static bool readTagContent (tokenInfo *token, vString *text, long *line, long *lineOffset, int depth)
{
	const bool wantText = (text != NULL);

	readTokenText (token, wantText);
	appendText (text, token->string);

	for (;;)
	{
		tokenType kind;

		*line = getInputLineNumber ();
		*lineOffset = getInputLineOffset ();
		readToken (token, false);
		kind = token->type;

		if (kind == TOKEN_TAG_START)
			readTag (token, text, depth + 1);
		else if (kind != TOKEN_COMMENT)
			return kind == TOKEN_TAG_START2;

		/* After a nested element or a comment more character data may
		 * follow. */
		readTokenText (token, wantText);
		appendText (text, token->string);
	}
}
399
/*
 * Skips the body of a <script> element by reading tokens until a "</"
 * immediately followed by the name "script" is found, or until the input
 * ends.
 *
 * line, lineOffset: on success receive the input position sampled just
 * before the "</" of the closing tag; they are left untouched on failure.
 *
 * Returns true if the closing script tag was found, false if the end of the
 * input was reached first.
 */
static bool skipScriptContent (tokenInfo *token, long *line, long *lineOffset)
{
	bool afterCloser = false;	/* previous token was "</" */
	long closerLine = 0;		/* position of the most recent "</" */
	long closerOffset = 0;

	for (;;)
	{
		const long tokenLine = getInputLineNumber ();
		const long tokenOffset = getInputLineOffset ();

		readToken (token, false);

		if (token->type == TOKEN_TAG_START2)
		{
			afterCloser = true;
			closerLine = tokenLine;
			closerOffset = tokenOffset;
		}
		else if (afterCloser
				 && token->type == TOKEN_NAME
				 && lookupKeyword (vStringValue (token->string), Lang_html) == KEYWORD_script)
		{
			*line = closerLine;
			*lineOffset = closerOffset;
			return true;
		}
		else
			afterCloser = false;

		if (token->type == TOKEN_EOF)
			return false;
	}
}
439
makeClassRefTags(const char * classes)440 static void makeClassRefTags (const char *classes)
441 {
442 vString *klass = vStringNew ();
443
444 do
445 {
446 if (*classes && !isspace (*classes))
447 vStringPut (klass, *classes);
448 else if (!vStringIsEmpty (klass))
449 {
450 makeSimpleRefTag (klass, K_CLASS,
451 CLASS_KIND_ATTRIBUTE_ROLE);
452 vStringClear (klass);
453 }
454
455 if (!*classes)
456 break;
457
458 classes++;
459 } while (1);
460
461 vStringDelete (klass);
462 }
463
/*
 * Reads the token that is expected to follow an attribute name.  If it is
 * '=', the token after it (the attribute value) is read too and true is
 * returned.  Otherwise false is returned and `token' holds whatever token
 * was found in place of the '='.
 */
static bool readAttributeValue (tokenInfo *token)
{
	readToken (token, true);
	if (token->type != TOKEN_EQUAL)
		return false;

	readToken (token, true);
	return true;
}

/*
 * Parses one element.  Called right after a '<' (TOKEN_TAG_START) was read.
 *
 * While scanning the start tag the following tags are made:
 *   - class="..."                    one K_CLASS reference per class name
 *   - id="..."                       a K_ID tag
 *   - <a name=...>                   a K_ANCHOR tag
 *   - <script src="...">             a K_SCRIPT reference
 *   - <link rel="...stylesheet..." href="...">
 *                                    a K_STYELSHEET reference
 *
 * Unless the element is a void element, is self-closed with "/>", or
 * MAX_DEPTH has been reached, its content is processed as well:
 *   - <script>: the body is skipped and handed to the JavaScript parser as
 *     a promise;
 *   - <style>: the body is handed to the CSS parser as a promise;
 *   - <title>, <h1>, <h2>, <h3>: the collected text becomes a tag of the
 *     matching kind, provided the closing tag has the same name.
 *
 * text:   string collecting the text of an enclosing heading, or NULL when
 *         no heading is open.  A heading found while `text' is NULL
 *         allocates its own string, which is released before returning.
 * depth:  current nesting level, 0 for top-level elements.
 */
static void readTag (tokenInfo *token, vString *text, int depth)
{
	bool textCreated = false;

	readToken (token, true);
	if (token->type == TOKEN_NAME)
	{
		keywordId startTag;
		bool isHeading;
		bool isVoid;
		vString *stylesheet = NULL;			/* last href seen on a <link> */
		bool stylesheet_expectation = false;	/* rel names a stylesheet */

		startTag = lookupKeyword (vStringValue (token->string), Lang_html);
		isHeading = (KEYWORD_heading_start <= startTag && startTag <= KEYWORD_heading_end);
		isVoid = (startTag >= KEYWORD_area && startTag <= KEYWORD_wbr);
		if (text == NULL && isHeading)
		{
			text = vStringNew ();
			textCreated = true;
		}

		/* Attributes, up to the '>' or "/>" closing the start tag. */
		do
		{
			keywordId attribute = KEYWORD_NONE;

			readToken (token, true);
			if (token->type == TOKEN_NAME)
				attribute = lookupKeyword (vStringValue (token->string), Lang_html);

			if (attribute == KEYWORD_class)
			{
				if (readAttributeValue (token) && token->type == TOKEN_STRING)
					makeClassRefTags (vStringValue (token->string));
			}
			else if (attribute == KEYWORD_id)
			{
				if (readAttributeValue (token) && token->type == TOKEN_STRING)
					makeSimpleTag (token->string, K_ID);
			}
			else if (startTag == KEYWORD_a && attribute == KEYWORD_name)
			{
				/* Anchor names are accepted unquoted as well. */
				if (readAttributeValue (token)
					&& (token->type == TOKEN_STRING || token->type == TOKEN_NAME))
					makeSimpleTag (token->string, K_ANCHOR);
			}
			else if (startTag == KEYWORD_script && attribute == KEYWORD_src)
			{
				if (readAttributeValue (token) && token->type == TOKEN_STRING)
					makeSimpleRefTag (token->string, K_SCRIPT,
									  SCRIPT_KIND_EXTERNAL_FILE_ROLE);
			}
			else if (startTag == KEYWORD_link)
			{
				if (attribute == KEYWORD_rel)
				{
					if (readAttributeValue (token)
						&& token->type == TOKEN_STRING
						/* strcmp is not enough:
						 * e.g. <link href="fancy.css"
						 *            rel="alternate stylesheet" title="Fancy"> */
						&& vStringLength (token->string) >= 10
						&& strstr (vStringValue (token->string), "stylesheet"))
						stylesheet_expectation = true;
				}
				else if (attribute == KEYWORD_href)
				{
					if (readAttributeValue (token) && token->type == TOKEN_STRING)
					{
						if (stylesheet == NULL)
							stylesheet = vStringNewCopy (token->string);
						else
							vStringCopy (stylesheet, token->string);
					}
				}

				/* rel and href may come in either order; the reference is
				 * made as soon as both have been seen. */
				if (stylesheet_expectation && stylesheet && !vStringIsEmpty (stylesheet))
				{
					makeSimpleRefTag (stylesheet, K_STYELSHEET,
									  STYLESHEET_KIND_EXTERNAL_FILE_ROLE);
					stylesheet_expectation = false;
					vStringClear (stylesheet);
				}
			}
		}
		while (token->type != TOKEN_TAG_END && token->type != TOKEN_TAG_END2 &&
			   token->type != TOKEN_EOF);

		vStringDelete (stylesheet);
		stylesheet = NULL;

		if (!isVoid && token->type == TOKEN_TAG_END && depth < MAX_DEPTH)
		{
			long startSourceLineNumber = getSourceLineNumber ();
			long startLineNumber = getInputLineNumber ();
			long startLineOffset = getInputLineOffset ();
			long endLineNumber;
			long endLineOffset;
			bool tag_start2;

			if (startTag == KEYWORD_script)
			{
				/* Script bodies are not markup: skip to the closing tag and
				 * leave the body to the JavaScript parser. */
				bool script = skipScriptContent (token, &endLineNumber, &endLineOffset);
				if (script)
					makePromise ("JavaScript", startLineNumber, startLineOffset,
								 endLineNumber, endLineOffset, startSourceLineNumber);
				readToken (token, true);
				goto out;
			}

			tag_start2 = readTagContent (token, text, &endLineNumber, &endLineOffset, depth);
			if (tag_start2)
			{
				/* Name of the closing tag. */
				readToken (token, true);
				if (isHeading && textCreated && vStringLength (text) > 0)
				{
					keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
					if (startTag == endTag)
					{
						htmlKind headingKind;

						/* One chain: a separate `if' for the title case
						 * would let the final `else' overwrite K_TITLE
						 * with K_HEADING3. */
						if (startTag == KEYWORD_title)
							headingKind = K_TITLE;
						else if (startTag == KEYWORD_h1)
							headingKind = K_HEADING1;
						else if (startTag == KEYWORD_h2)
							headingKind = K_HEADING2;
						else
							headingKind = K_HEADING3;

						vStringStripLeading (text);
						vStringStripTrailing (text);
						makeSimpleTag (text, headingKind);
					}
				}
				else if (startTag == KEYWORD_style)
				{
					keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
					if (startTag == endTag)
						makePromise ("CSS", startLineNumber, startLineOffset,
									 endLineNumber, endLineOffset, startSourceLineNumber);
				}

				/* Normally the '>' of the closing tag. */
				readToken (token, true);
			}
		}
	}

 out:
	if (textCreated)
		vStringDelete (text);
}
644
findHtmlTags(void)645 static void findHtmlTags (void)
646 {
647 tokenInfo token;
648
649 token.string = vStringNew ();
650
651 do
652 {
653 readToken (&token, true);
654 if (token.type == TOKEN_TAG_START)
655 readTag (&token, NULL, 0);
656 }
657 while (token.type != TOKEN_EOF);
658
659 vStringDelete (token.string);
660 }
661
initialize(const langType language)662 static void initialize (const langType language)
663 {
664 Lang_html = language;
665 }
666
667 /* parser definition */
HtmlParser(void)668 extern parserDefinition* HtmlParser (void)
669 {
670 static const char *const extensions [] = { "htm", "html", NULL };
671 parserDefinition* def = parserNew ("HTML");
672 def->kindTable = HtmlKinds;
673 def->kindCount = ARRAY_SIZE (HtmlKinds);
674 def->extensions = extensions;
675 def->parser = findHtmlTags;
676 def->initialize = initialize;
677 def->keywordTable = HtmlKeywordTable;
678 def->keywordCount = ARRAY_SIZE (HtmlKeywordTable);
679 return def;
680 }
681