13ae02089SMasatake YAMATO /*
23b7fe603SJiří Techet * Copyright (c) 2016, Jiri Techet
33ae02089SMasatake YAMATO *
43ae02089SMasatake YAMATO * This source code is released for free distribution under the terms of the
50ce38835Sviccuad * GNU General Public License version 2 or (at your option) any later version.
63ae02089SMasatake YAMATO *
73ae02089SMasatake YAMATO * This module contains functions for generating tags for HTML language
83ae02089SMasatake YAMATO * files.
93ae02089SMasatake YAMATO */
103ae02089SMasatake YAMATO
113b7fe603SJiří Techet #include "general.h"
123ae02089SMasatake YAMATO
133b7fe603SJiří Techet #include <string.h>
143b7fe603SJiří Techet #include <ctype.h>
153b7fe603SJiří Techet
163b7fe603SJiří Techet #include "entry.h"
173b7fe603SJiří Techet #include "parse.h"
183b7fe603SJiří Techet #include "read.h"
193b7fe603SJiří Techet #include "routines.h"
203b7fe603SJiří Techet #include "keyword.h"
213b7fe603SJiří Techet #include "promise.h"
223b7fe603SJiří Techet
/* Upper bound on element nesting.  readTag() stops descending into the
 * content of an element once this depth is reached, so invalid input with
 * a huge number of unclosed tags cannot exhaust the stack through
 * recursion. */
#define MAX_DEPTH 1000
27b119ea03SJiří Techet
283b7fe603SJiří Techet
/* Tag kinds produced by this parser.  The enumerators are listed in the
 * same order as the entries of HtmlKinds[]. */
typedef enum {
	K_ANCHOR     = 0,
	K_CLASS      = 1,
	K_TITLE      = 2,
	K_HEADING1   = 3,
	K_HEADING2   = 4,
	K_HEADING3   = 5,
	K_STYELSHEET = 6,	/* (sic) spelling kept: referenced under this name */
	K_ID         = 7,
	K_SCRIPT     = 8,
} htmlKind;
403b7fe603SJiří Techet
413b7fe603SJiří Techet
/* Role indices for reference tags of the "class" kind. */
typedef enum {
	CLASS_KIND_ATTRIBUTE_ROLE = 0,
} ClassRole;

/* Role indices for reference tags of the "script" kind. */
typedef enum {
	SCRIPT_KIND_EXTERNAL_FILE_ROLE = 0,
} ScriptRole;

/* Role indices for reference tags of the "stylesheet" kind. */
typedef enum {
	STYLESHEET_KIND_EXTERNAL_FILE_ROLE = 0,
} StylesheetRole;
5375b31c74SMasatake YAMATO
54dfae3021SMasatake YAMATO static roleDefinition ClassRoles [] = {
55dfae3021SMasatake YAMATO { true, "attribute", "assigned as attributes" },
56dfae3021SMasatake YAMATO };
57dfae3021SMasatake YAMATO
5830ee9227SMasatake YAMATO static roleDefinition ScriptRoles [] = {
5930ee9227SMasatake YAMATO { true, "extFile", "referenced as external files" },
6030ee9227SMasatake YAMATO };
6130ee9227SMasatake YAMATO
6275b31c74SMasatake YAMATO static roleDefinition StylesheetRoles [] = {
6375b31c74SMasatake YAMATO { true, "extFile", "referenced as external files" },
6475b31c74SMasatake YAMATO };
6575b31c74SMasatake YAMATO
66e112e8abSMasatake YAMATO static kindDefinition HtmlKinds [] = {
67ce990805SThomas Braun { true, 'a', "anchor", "named anchors" },
68dfae3021SMasatake YAMATO { true, 'c', "class", "classes",
69dfae3021SMasatake YAMATO .referenceOnly = true, ATTACH_ROLES (ClassRoles)},
70*ae9f0956SMasatake YAMATO { true, 't', "title", "titles" },
71ce990805SThomas Braun { true, 'h', "heading1", "H1 headings" },
72ce990805SThomas Braun { true, 'i', "heading2", "H2 headings" },
7330ee9227SMasatake YAMATO { true, 'j', "heading3", "H3 headings" },
7475b31c74SMasatake YAMATO { true, 'C', "stylesheet", "stylesheets",
7575b31c74SMasatake YAMATO .referenceOnly = true, ATTACH_ROLES (StylesheetRoles)},
769b02eb06SMasatake YAMATO { true, 'I', "id", "identifiers" },
7730ee9227SMasatake YAMATO { true, 'J', "script", "scripts",
7830ee9227SMasatake YAMATO .referenceOnly = true, ATTACH_ROLES (ScriptRoles)},
79dc0f490fSMasatake YAMATO };
80dc0f490fSMasatake YAMATO
/* Keyword identifiers for the element and attribute names this parser
 * cares about.
 *
 * Two contiguous ranges are tested with <= / >= comparisons in readTag(),
 * so the relative order inside them matters:
 *
 *   KEYWORD_heading_start .. KEYWORD_heading_end  - elements whose text
 *                                                   becomes a tag name
 *   KEYWORD_area .. KEYWORD_wbr                   - void elements
 *
 * Attribute names (name, class, href, id, rel, src) are deliberately kept
 * outside the void-element range; otherwise an element that happens to be
 * called e.g. <id> or <src> would be mistaken for a void element and its
 * content would not be parsed as belonging to it.
 */
typedef enum {
	/* The order starting from "title" to "h3" should
	 * not be changed.
	 */
	KEYWORD_heading_start,
	KEYWORD_title = KEYWORD_heading_start,
	KEYWORD_h1,
	KEYWORD_h2,
	KEYWORD_h3,
	KEYWORD_heading_end = KEYWORD_h3,

	/* other non-void elements */
	KEYWORD_a,
	KEYWORD_script,
	KEYWORD_style,

	/* attribute names */
	KEYWORD_name,
	KEYWORD_class,
	KEYWORD_href,
	KEYWORD_id,
	KEYWORD_rel,
	KEYWORD_src,

	/* void elements: must stay contiguous from "area" to "wbr" */
	KEYWORD_area,
	KEYWORD_base,
	KEYWORD_br,
	KEYWORD_col,
	KEYWORD_command,
	KEYWORD_embed,
	KEYWORD_hr,
	KEYWORD_img,
	KEYWORD_input,
	KEYWORD_keygen,
	KEYWORD_link,
	KEYWORD_meta,
	KEYWORD_param,
	KEYWORD_source,
	KEYWORD_track,
	KEYWORD_wbr
} keywordId;
1203ae02089SMasatake YAMATO
1213b7fe603SJiří Techet static const keywordTable HtmlKeywordTable[] = {
122*ae9f0956SMasatake YAMATO {"title", KEYWORD_title},
1233b7fe603SJiří Techet {"h1", KEYWORD_h1},
1243b7fe603SJiří Techet {"h2", KEYWORD_h2},
1253b7fe603SJiří Techet {"h3", KEYWORD_h3},
1263b7fe603SJiří Techet {"a", KEYWORD_a},
1273b7fe603SJiří Techet {"script", KEYWORD_script},
1283b7fe603SJiří Techet {"style", KEYWORD_style},
1293b7fe603SJiří Techet {"name", KEYWORD_name},
130d7c12985SJiří Techet
131d7c12985SJiří Techet /* void elements */
132d7c12985SJiří Techet {"area", KEYWORD_area},
133d7c12985SJiří Techet {"base", KEYWORD_base},
134d7c12985SJiří Techet {"br", KEYWORD_br},
135dfae3021SMasatake YAMATO {"class", KEYWORD_class},
136d7c12985SJiří Techet {"col", KEYWORD_col},
137d7c12985SJiří Techet {"command", KEYWORD_command},
138d7c12985SJiří Techet {"embed", KEYWORD_embed},
139d7c12985SJiří Techet {"hr", KEYWORD_hr},
14075b31c74SMasatake YAMATO {"href", KEYWORD_href},
1419b02eb06SMasatake YAMATO {"id", KEYWORD_id},
142d7c12985SJiří Techet {"img", KEYWORD_img},
143d7c12985SJiří Techet {"input", KEYWORD_input},
144d7c12985SJiří Techet {"keygen", KEYWORD_keygen},
145d7c12985SJiří Techet {"link", KEYWORD_link},
146d7c12985SJiří Techet {"meta", KEYWORD_meta},
147d7c12985SJiří Techet {"param", KEYWORD_param},
14875b31c74SMasatake YAMATO {"rel", KEYWORD_rel},
149d7c12985SJiří Techet {"source", KEYWORD_source},
15030ee9227SMasatake YAMATO {"src", KEYWORD_src},
151d7c12985SJiří Techet {"track", KEYWORD_track},
152d7c12985SJiří Techet {"wbr", KEYWORD_wbr},
1533b7fe603SJiří Techet };
1543b7fe603SJiří Techet
/* Token classes produced by readToken() and readTokenText(). */
typedef enum {
	TOKEN_EOF        = 0,
	TOKEN_NAME       = 1,	/* tag and attribute names */
	TOKEN_STRING     = 2,	/* single- or double-quoted attribute value */
	TOKEN_TEXT       = 3,	/* character data up to the next '<' */
	TOKEN_TAG_START  = 4,	/* < */
	TOKEN_TAG_START2 = 5,	/* </ */
	TOKEN_TAG_END    = 6,	/* > */
	TOKEN_TAG_END2   = 7,	/* /> */
	TOKEN_EQUAL      = 8,	/* = */
	TOKEN_COMMENT    = 9,	/* <!-- ... --> */
	TOKEN_OTHER      = 10
} tokenType;
1683b7fe603SJiří Techet
#ifdef DEBUG
/* Printable names of the tokenType values, indexed by token type.
 * Only built in DEBUG builds. */
const char *tokenTypes[] = {
	[TOKEN_EOF]        = "EOF",
	[TOKEN_NAME]       = "NAME",
	[TOKEN_STRING]     = "STRING",
	[TOKEN_TEXT]       = "TEXT",
	[TOKEN_TAG_START]  = "TAG_START",
	[TOKEN_TAG_START2] = "TAG_START2",
	[TOKEN_TAG_END]    = "TAG_END",
	[TOKEN_TAG_END2]   = "TAG_END2",
	[TOKEN_EQUAL]      = "EQUAL",
	[TOKEN_COMMENT]    = "COMMENT",
	[TOKEN_OTHER]      = "OTHER",
};
#endif
186b0a80065SMasatake YAMATO
1873b7fe603SJiří Techet typedef struct {
1883b7fe603SJiří Techet tokenType type;
1893b7fe603SJiří Techet vString *string;
1903b7fe603SJiří Techet } tokenInfo;
1913b7fe603SJiří Techet
1923b7fe603SJiří Techet
1933b7fe603SJiří Techet static int Lang_html;
1943b7fe603SJiří Techet
1953b7fe603SJiří Techet
196b119ea03SJiří Techet static void readTag (tokenInfo *token, vString *text, int depth);
1973b7fe603SJiří Techet
#ifdef DEBUG
#if 0
/* Debugging aid (currently compiled out): prints the token's type name and
 * text to stderr together with a caller-supplied context label.  A NULL
 * extra_context is printed as "_". */
static void dumpToken (tokenInfo *token, const char *context, const char* extra_context)
{
	const char *const suffix = extra_context ? extra_context : "_";

	fprintf (stderr, "[%7s] %-20s@%s.%s\n",
			 tokenTypes[token->type], vStringValue(token->string),
			 context, suffix);
}
#endif
#endif
2083b7fe603SJiří Techet
/* Consumes character data up to (not including) the next '<' or up to the
 * end of input.
 *
 * token:       receives the result; its string is cleared first.
 * collectText: when true, the characters are stored in token->string with
 *              every whitespace character turned into a blank and runs of
 *              blanks collapsed into one; when false the text is discarded.
 *
 * On return token->type is TOKEN_TEXT if a '<' was found (the '<' is pushed
 * back to the input) or TOKEN_EOF if the input ended first.
 */
static void readTokenText (tokenInfo *const token, bool collectText)
{
	int previous = 'X';  /* any non-space character will do */

	vStringClear (token->string);

	for (;;)
	{
		int ch = getcFromInputFile ();

		if (ch == EOF)
		{
			token->type = TOKEN_EOF;
			return;
		}
		if (ch == '<')
		{
			ungetcToInputFile (ch);
			token->type = TOKEN_TEXT;
			return;
		}

		if (!collectText)
			continue;

		if (isspace (ch))
			ch = ' ';
		if (ch == ' ' && previous == ' ')
			continue;  /* squeeze consecutive blanks */

		vStringPut (token->string, ch);
		previous = ch;
	}
}
2463b7fe603SJiří Techet
/* Reads the next markup token from the input, skipping leading whitespace.
 *
 * token:        receives the token class; token->string is cleared first and
 *               afterwards holds the lower-cased name (TOKEN_NAME) or the
 *               unquoted value (TOKEN_STRING); it stays empty otherwise.
 * skipComments: when true, "<!-- ... -->" comments are consumed silently and
 *               scanning continues; when false a comment is reported as
 *               TOKEN_COMMENT.
 */
static void readToken (tokenInfo *const token, bool skipComments)
{
	int c;

	vStringClear (token->string);

	for (;;)
	{
		do
			c = getcFromInputFile ();
		while (isspace (c));

		if (c == EOF)
		{
			token->type = TOKEN_EOF;
			return;
		}

		if (c == '>')
		{
			token->type = TOKEN_TAG_END;
			return;
		}

		if (c == '=')
		{
			token->type = TOKEN_EQUAL;
			return;
		}

		if (c == '/')
		{
			/* "/>" closes a tag; a lone '/' is passed on as OTHER */
			const int next = getcFromInputFile ();
			if (next == '>')
				token->type = TOKEN_TAG_END2;
			else
			{
				ungetcToInputFile (next);
				token->type = TOKEN_OTHER;
			}
			return;
		}

		if (c == '"' || c == '\'')
		{
			/* quoted value: everything up to the matching quote or EOF */
			const int quote = c;
			for (c = getcFromInputFile ();
				 c != EOF && c != quote;
				 c = getcFromInputFile ())
				vStringPut (token->string, c);
			token->type = TOKEN_STRING;
			return;
		}

		if (c == '<')
		{
			int d = getcFromInputFile ();

			if (d == '?')
				token->type = TOKEN_OTHER;
			else if (d == '/')
				token->type = TOKEN_TAG_START2;
			else if (d != '!')
			{
				ungetcToInputFile (d);
				token->type = TOKEN_TAG_START;
			}
			else
			{
				d = getcFromInputFile ();
				if (d == '-')
				{
					d = getcFromInputFile ();
					if (d == '-')
					{
						/* Inside a comment: slide a three character
						 * window over the input until it reads "-->"
						 * or the input ends. */
						int e = ' ';
						int f = ' ';
						do
						{
							d = e;
							e = f;
							f = getcFromInputFile ();
						}
						while (f != EOF && ! (d == '-' && e == '-' && f == '>'));

						if (skipComments)
							continue;  /* look for the next real token */

						token->type = TOKEN_COMMENT;
						return;
					}
				}
				/* "<!" not followed by "--", e.g. a doctype */
				ungetcToInputFile (d);
				token->type = TOKEN_OTHER;
			}
			return;
		}

		/* Anything else starts a name that runs up to the next delimiter. */
		do
		{
			vStringPut (token->string, tolower (c));
			c = getcFromInputFile ();
		}
		while (c != EOF && !isspace (c) && c != '<' && c != '>' &&
			   c != '/' && c != '=' && c != '\'' && c != '"');
		if (c != EOF)
			ungetcToInputFile (c);
		token->type = TOKEN_NAME;
		return;
	}
}
3603b7fe603SJiří Techet
/* Appends appendedText to text.  Does nothing when text is NULL or
 * appendedText is empty.  If text ends with a blank and appendedText
 * starts with one, trailing whitespace is stripped from text first so the
 * join does not produce a double blank. */
static void appendText (vString *text, vString *appendedText)
{
	if (text == NULL || vStringLength (appendedText) == 0)
		return;

	/* appendedText is known to be non-empty here */
	if (vStringLength (text) > 0
		&& vStringLast (text) == ' '
		&& vStringChar (appendedText, 0) == ' ')
		vStringStripTrailing (text);

	vStringCat (text, appendedText);
}
3733b7fe603SJiří Techet
/* Reads the content of an element: text, comments and nested elements, up
 * to the first token that is none of these.
 *
 * token:      scratch token; on return it holds the terminating token.
 * text:       if not NULL, collected character data is appended to it.
 * line,
 * lineOffset: receive the input position recorded just before the
 *             terminating token was read.
 * depth:      nesting depth of the enclosing element; nested elements are
 *             parsed with depth + 1.
 *
 * Returns true when the terminating token is "</" (TOKEN_TAG_START2).
 */
static bool readTagContent (tokenInfo *token, vString *text, long *line, long *lineOffset, int depth)
{
	const bool collect = (text != NULL);
	tokenType last;

	readTokenText (token, collect);
	appendText (text, token->string);

	for (;;)
	{
		*line = getInputLineNumber ();
		*lineOffset = getInputLineOffset ();
		readToken (token, false);
		last = token->type;

		if (last != TOKEN_COMMENT && last != TOKEN_TAG_START)
			break;

		if (last == TOKEN_TAG_START)
			readTag (token, text, depth + 1);

		/* pick up the text following the comment or nested element */
		readTokenText (token, collect);
		appendText (text, token->string);
	}

	return last == TOKEN_TAG_START2;
}
3993b7fe603SJiří Techet
/* Skips over the body of a <script> element by reading tokens until a "</"
 * that is immediately followed by the name "script", or until end of input.
 *
 * token:      scratch token; on return it holds the last token read.
 * line,
 * lineOffset: on success receive the input position recorded just before
 *             the "</" of the closing tag; untouched on failure.
 *
 * Returns true if the closing tag was found, false if the input ended.
 */
static bool skipScriptContent (tokenInfo *token, long *line, long *lineOffset)
{
	bool afterCloseStart = false;  /* previous token was "</" */
	long closeLine = 0;
	long closeOffset = 0;

	for (;;)
	{
		const long tokenLine = getInputLineNumber ();
		const long tokenOffset = getInputLineOffset ();

		readToken (token, false);

		if (token->type == TOKEN_TAG_START2)
		{
			/* remember where this candidate closing tag begins */
			afterCloseStart = true;
			closeLine = tokenLine;
			closeOffset = tokenOffset;
			continue;
		}

		if (afterCloseStart
			&& token->type == TOKEN_NAME
			&& lookupKeyword (vStringValue (token->string), Lang_html) == KEYWORD_script)
		{
			*line = closeLine;
			*lineOffset = closeOffset;
			return true;
		}

		afterCloseStart = false;
		if (token->type == TOKEN_EOF)
			return false;
	}
}
439fd3a8075SMasatake YAMATO
makeClassRefTags(const char * classes)440dfae3021SMasatake YAMATO static void makeClassRefTags (const char *classes)
441dfae3021SMasatake YAMATO {
442dfae3021SMasatake YAMATO vString *klass = vStringNew ();
443dfae3021SMasatake YAMATO
444dfae3021SMasatake YAMATO do
445dfae3021SMasatake YAMATO {
446dfae3021SMasatake YAMATO if (*classes && !isspace (*classes))
447dfae3021SMasatake YAMATO vStringPut (klass, *classes);
448dfae3021SMasatake YAMATO else if (!vStringIsEmpty (klass))
449dfae3021SMasatake YAMATO {
450dfae3021SMasatake YAMATO makeSimpleRefTag (klass, K_CLASS,
451dfae3021SMasatake YAMATO CLASS_KIND_ATTRIBUTE_ROLE);
452dfae3021SMasatake YAMATO vStringClear (klass);
453dfae3021SMasatake YAMATO }
454dfae3021SMasatake YAMATO
455dfae3021SMasatake YAMATO if (!*classes)
456dfae3021SMasatake YAMATO break;
457dfae3021SMasatake YAMATO
458dfae3021SMasatake YAMATO classes++;
459dfae3021SMasatake YAMATO } while (1);
460dfae3021SMasatake YAMATO
461dfae3021SMasatake YAMATO vStringDelete (klass);
462dfae3021SMasatake YAMATO }
463dfae3021SMasatake YAMATO
/* Reads the optional "= value" that follows an attribute name.
 * Returns true if an '=' was present, in which case token holds the token
 * after the '='.  Otherwise token holds whatever was read instead of '='. */
static bool readAttributeValue (tokenInfo *token)
{
	readToken (token, true);
	if (token->type != TOKEN_EQUAL)
		return false;
	readToken (token, true);
	return true;
}

/* Parses one element, starting right after its opening '<'.
 *
 * While scanning the start tag it emits:
 *   - class reference tags for class="...",
 *   - K_ID tags for id="...",
 *   - K_ANCHOR tags for <a name=...>,
 *   - script reference tags for <script src="...">,
 *   - stylesheet reference tags for <link> elements that have both a rel
 *     value containing "stylesheet" and an href value.
 * For non-void elements closed with '>' (and while depth < MAX_DEPTH) it
 * then processes the content: <script> bodies are handed to the JavaScript
 * parser and <style> bodies to the CSS parser via makePromise(), and the
 * text of <title>, <h1>, <h2> and <h3> becomes a tag of the matching kind.
 *
 * token: scratch token shared with the caller; holds the last token read
 *        on return.
 * text:  accumulator for the text of an enclosing heading-like element, or
 *        NULL if there is none.  When NULL and this element is heading-like,
 *        a private accumulator is created and released here.
 * depth: current nesting depth (0 for top-level elements).
 */
static void readTag (tokenInfo *token, vString *text, int depth)
{
	bool textCreated = false;

	readToken (token, true);
	if (token->type == TOKEN_NAME)
	{
		keywordId startTag;
		bool isHeading;
		bool isVoid;
		vString *stylesheet = NULL;
		bool stylesheet_expectation = false;

		startTag = lookupKeyword (vStringValue (token->string), Lang_html);
		isHeading = (KEYWORD_heading_start <= startTag && startTag <= KEYWORD_heading_end);
		isVoid = (startTag >= KEYWORD_area && startTag <= KEYWORD_wbr);
		if (text == NULL && isHeading)
		{
			text = vStringNew ();
			textCreated = true;
		}

		/* Scan the attributes up to the end of the start tag. */
		do
		{
			keywordId attribute = KEYWORD_NONE;

			readToken (token, true);
			if (token->type == TOKEN_NAME)
				attribute = lookupKeyword (vStringValue (token->string), Lang_html);

			if (attribute == KEYWORD_class)
			{
				if (readAttributeValue (token) && token->type == TOKEN_STRING)
					makeClassRefTags (vStringValue (token->string));
			}
			else if (attribute == KEYWORD_id)
			{
				if (readAttributeValue (token) && token->type == TOKEN_STRING)
					makeSimpleTag (token->string, K_ID);
			}
			else if (startTag == KEYWORD_a && attribute == KEYWORD_name)
			{
				/* anchor names are accepted quoted or unquoted */
				if (readAttributeValue (token)
					&& (token->type == TOKEN_STRING || token->type == TOKEN_NAME))
					makeSimpleTag (token->string, K_ANCHOR);
			}
			else if (startTag == KEYWORD_script && attribute == KEYWORD_src)
			{
				if (readAttributeValue (token) && token->type == TOKEN_STRING)
					makeSimpleRefTag (token->string, K_SCRIPT,
									  SCRIPT_KIND_EXTERNAL_FILE_ROLE);
			}
			else if (startTag == KEYWORD_link)
			{
				if (attribute == KEYWORD_rel)
				{
					if (readAttributeValue (token)
						&& token->type == TOKEN_STRING
						/* strcmp is not enough:
						 * e.g. <link href="fancy.css"
						 *            rel="alternate stylesheet" title="Fancy"> */
						&& vStringLength(token->string) >= 10
						&& strstr (vStringValue (token->string), "stylesheet"))
						stylesheet_expectation = true;
				}
				else if (attribute == KEYWORD_href)
				{
					if (readAttributeValue (token) && token->type == TOKEN_STRING)
					{
						if (stylesheet == NULL)
							stylesheet = vStringNewCopy (token->string);
						else
							vStringCopy (stylesheet, token->string);
					}
				}

				/* rel and href may come in either order; emit the tag as
				 * soon as both have been seen. */
				if (stylesheet_expectation && stylesheet && !vStringIsEmpty (stylesheet))
				{
					makeSimpleRefTag (stylesheet, K_STYELSHEET,
									  STYLESHEET_KIND_EXTERNAL_FILE_ROLE);
					stylesheet_expectation = false;
					vStringClear (stylesheet);
				}
			}
		}
		while (token->type != TOKEN_TAG_END && token->type != TOKEN_TAG_END2 &&
			   token->type != TOKEN_EOF);

		vStringDelete (stylesheet);
		stylesheet = NULL;

		if (!isVoid && token->type == TOKEN_TAG_END && depth < MAX_DEPTH)
		{
			long startSourceLineNumber = getSourceLineNumber ();
			long startLineNumber = getInputLineNumber ();
			long startLineOffset = getInputLineOffset ();
			long endLineNumber;
			long endLineOffset;
			bool tag_start2;

			if (startTag == KEYWORD_script)
			{
				bool script = skipScriptContent (token, &endLineNumber, &endLineOffset);
				if (script)
					makePromise ("JavaScript", startLineNumber, startLineOffset,
								 endLineNumber, endLineOffset, startSourceLineNumber);
				readToken (token, true);
				goto out;
			}

			tag_start2 = readTagContent (token, text, &endLineNumber, &endLineOffset, depth);
			if (tag_start2)
			{
				/* name of the closing tag */
				readToken (token, true);
				if (isHeading && textCreated && vStringLength (text) > 0)
				{
					keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
					if (startTag == endTag)
					{
						htmlKind headingKind;

						/* One chain: without the "else" after the title
						 * test, <title> fell through to K_HEADING3. */
						if (startTag == KEYWORD_title)
							headingKind = K_TITLE;
						else if (startTag == KEYWORD_h1)
							headingKind = K_HEADING1;
						else if (startTag == KEYWORD_h2)
							headingKind = K_HEADING2;
						else
							headingKind = K_HEADING3;

						vStringStripLeading (text);
						vStringStripTrailing (text);
						makeSimpleTag (text, headingKind);
					}
				}
				else if (startTag == KEYWORD_style)
				{
					keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
					if (startTag == endTag)
						makePromise ("CSS", startLineNumber, startLineOffset,
									 endLineNumber, endLineOffset, startSourceLineNumber);
				}

				/* the '>' of the closing tag */
				readToken (token, true);
			}
		}
	}

 out:
	if (textCreated)
		vStringDelete (text);
}
6443b7fe603SJiří Techet
findHtmlTags(void)6453b7fe603SJiří Techet static void findHtmlTags (void)
6463b7fe603SJiří Techet {
6473b7fe603SJiří Techet tokenInfo token;
6483b7fe603SJiří Techet
6493b7fe603SJiří Techet token.string = vStringNew ();
6503b7fe603SJiří Techet
6513b7fe603SJiří Techet do
6523b7fe603SJiří Techet {
653ce990805SThomas Braun readToken (&token, true);
6543b7fe603SJiří Techet if (token.type == TOKEN_TAG_START)
655b119ea03SJiří Techet readTag (&token, NULL, 0);
6563b7fe603SJiří Techet }
6573b7fe603SJiří Techet while (token.type != TOKEN_EOF);
6583b7fe603SJiří Techet
6593b7fe603SJiří Techet vStringDelete (token.string);
6603b7fe603SJiří Techet }
6613b7fe603SJiří Techet
initialize(const langType language)6623b7fe603SJiří Techet static void initialize (const langType language)
6633b7fe603SJiří Techet {
6643b7fe603SJiří Techet Lang_html = language;
6653b7fe603SJiří Techet }
6663b7fe603SJiří Techet
6673b7fe603SJiří Techet /* parser definition */
HtmlParser(void)6683ae02089SMasatake YAMATO extern parserDefinition* HtmlParser (void)
6693ae02089SMasatake YAMATO {
6703ae02089SMasatake YAMATO static const char *const extensions [] = { "htm", "html", NULL };
6713b7fe603SJiří Techet parserDefinition* def = parserNew ("HTML");
67209ae690fSMasatake YAMATO def->kindTable = HtmlKinds;
6733b7fe603SJiří Techet def->kindCount = ARRAY_SIZE (HtmlKinds);
6743ae02089SMasatake YAMATO def->extensions = extensions;
6753b7fe603SJiří Techet def->parser = findHtmlTags;
6763b7fe603SJiří Techet def->initialize = initialize;
6773b7fe603SJiří Techet def->keywordTable = HtmlKeywordTable;
6783b7fe603SJiří Techet def->keywordCount = ARRAY_SIZE (HtmlKeywordTable);
6793ae02089SMasatake YAMATO return def;
6803ae02089SMasatake YAMATO }
681