xref: /Universal-ctags/parsers/html.c (revision ae9f095608a8efa2b27574ac9ec8dd2465ac849c)
13ae02089SMasatake YAMATO /*
23b7fe603SJiří Techet *   Copyright (c) 2016, Jiri Techet
33ae02089SMasatake YAMATO *
43ae02089SMasatake YAMATO *   This source code is released for free distribution under the terms of the
50ce38835Sviccuad *   GNU General Public License version 2 or (at your option) any later version.
63ae02089SMasatake YAMATO *
73ae02089SMasatake YAMATO *   This module contains functions for generating tags for HTML language
83ae02089SMasatake YAMATO *   files.
93ae02089SMasatake YAMATO */
103ae02089SMasatake YAMATO 
113b7fe603SJiří Techet #include "general.h"
123ae02089SMasatake YAMATO 
133b7fe603SJiří Techet #include <string.h>
143b7fe603SJiří Techet #include <ctype.h>
153b7fe603SJiří Techet 
163b7fe603SJiří Techet #include "entry.h"
173b7fe603SJiří Techet #include "parse.h"
183b7fe603SJiří Techet #include "read.h"
193b7fe603SJiří Techet #include "routines.h"
203b7fe603SJiří Techet #include "keyword.h"
213b7fe603SJiří Techet #include "promise.h"
223b7fe603SJiří Techet 
23b119ea03SJiří Techet /* The max. number of nested elements - prevents further recursion if the limit
24b119ea03SJiří Techet  * is exceeded and avoids stack overflow for invalid input containing too many
25b119ea03SJiří Techet  * open tags */
26b119ea03SJiří Techet #define MAX_DEPTH 1000
27b119ea03SJiří Techet 
283b7fe603SJiří Techet 
/* Tag kinds emitted by the HTML parser.
 * The enumerators are listed in the same order as the entries of the
 * HtmlKinds table; keep the two in sync when adding a kind. */
typedef enum {
	K_ANCHOR,
	K_CLASS,
	K_TITLE,
	K_HEADING1,
	K_HEADING2,
	K_HEADING3,
	K_STYELSHEET,	/* sic: this spelling is what readTag refers to */
	K_ID,
	K_SCRIPT,
} htmlKind;
403b7fe603SJiří Techet 
413b7fe603SJiří Techet 
/* Role passed to makeSimpleRefTag for K_CLASS reference tags. */
typedef enum {
	CLASS_KIND_ATTRIBUTE_ROLE,
} ClassRole;

/* Role passed to makeSimpleRefTag for K_SCRIPT reference tags. */
typedef enum {
	SCRIPT_KIND_EXTERNAL_FILE_ROLE,
} ScriptRole;

/* Role passed to makeSimpleRefTag for K_STYELSHEET reference tags. */
typedef enum {
	STYLESHEET_KIND_EXTERNAL_FILE_ROLE,
} StylesheetRole;
5375b31c74SMasatake YAMATO 
/* Role table attached to the "class" kind; one entry per ClassRole value. */
static roleDefinition ClassRoles [] = {
	{ true, "attribute", "assigned as attributes" },
};

/* Role table attached to the "script" kind; one entry per ScriptRole value. */
static roleDefinition ScriptRoles [] = {
	{ true, "extFile", "referenced as external files" },
};

/* Role table attached to the "stylesheet" kind; one entry per
 * StylesheetRole value. */
static roleDefinition StylesheetRoles [] = {
	{ true, "extFile", "referenced as external files" },
};
6575b31c74SMasatake YAMATO 
/* Kind table registered by HtmlParser. The entries appear in the same
 * order as the htmlKind enumerators. The kinds marked referenceOnly
 * (class, stylesheet, script) are only produced through makeSimpleRefTag
 * in this file. */
static kindDefinition HtmlKinds [] = {
	{ true, 'a', "anchor",		"named anchors" },
	{ true, 'c', "class",		"classes",
	  .referenceOnly = true, ATTACH_ROLES (ClassRoles)},
	{ true, 't', "title",		"titles" },
	{ true, 'h', "heading1",	"H1 headings" },
	{ true, 'i', "heading2",	"H2 headings" },
	{ true, 'j', "heading3",	"H3 headings" },
	{ true, 'C', "stylesheet",	"stylesheets",
	  .referenceOnly = true, ATTACH_ROLES (StylesheetRoles)},
	{ true, 'I', "id",			"identifiers" },
	{ true, 'J', "script",		"scripts",
	  .referenceOnly = true, ATTACH_ROLES (ScriptRoles)},
};
80dc0f490fSMasatake YAMATO 
/* Keyword ids for the element and attribute names the parser cares about.
 * readTag relies on two contiguous ranges of this enum, so the relative
 * order of the enumerators matters. */
typedef enum {
	/* The order starting from "title" to "h3" should not be changed:
	 * readTag classifies a tag as a heading when its id lies between
	 * KEYWORD_heading_start and KEYWORD_heading_end (inclusive).
	 */
	KEYWORD_heading_start,
	KEYWORD_title = KEYWORD_heading_start,
	KEYWORD_h1,
	KEYWORD_h2,
	KEYWORD_h3,
	KEYWORD_heading_end = KEYWORD_h3,
	KEYWORD_a,
	KEYWORD_script,
	KEYWORD_style,
	KEYWORD_name,

	/* void elements: readTag treats every id from KEYWORD_area to
	 * KEYWORD_wbr (inclusive) as a void element.
	 * NOTE(review): the attribute names class, href, id, rel and src are
	 * also listed inside this range, so a tag with one of those names
	 * would be treated as void too. */
	KEYWORD_area,
	KEYWORD_base,
	KEYWORD_br,
	KEYWORD_class,
	KEYWORD_col,
	KEYWORD_command,
	KEYWORD_embed,
	KEYWORD_hr,
	KEYWORD_href,
	KEYWORD_id,
	KEYWORD_img,
	KEYWORD_input,
	KEYWORD_keygen,
	KEYWORD_link,
	KEYWORD_meta,
	KEYWORD_param,
	KEYWORD_rel,
	KEYWORD_source,
	KEYWORD_src,
	KEYWORD_track,
	KEYWORD_wbr
} keywordId;
1203ae02089SMasatake YAMATO 
/* Maps lower-case element/attribute names to keywordId values.
 * readToken lower-cases names before they are looked up, so the spellings
 * here are all lower case. Registered through HtmlParser. */
static const keywordTable HtmlKeywordTable[] = {
	{"title", KEYWORD_title},
	{"h1", KEYWORD_h1},
	{"h2", KEYWORD_h2},
	{"h3", KEYWORD_h3},
	{"a", KEYWORD_a},
	{"script", KEYWORD_script},
	{"style", KEYWORD_style},
	{"name", KEYWORD_name},

	/* void elements (this group also holds the attribute names
	 * class, href, id, rel and src) */
	{"area", KEYWORD_area},
	{"base", KEYWORD_base},
	{"br", KEYWORD_br},
	{"class", KEYWORD_class},
	{"col", KEYWORD_col},
	{"command", KEYWORD_command},
	{"embed", KEYWORD_embed},
	{"hr", KEYWORD_hr},
	{"href", KEYWORD_href},
	{"id", KEYWORD_id},
	{"img", KEYWORD_img},
	{"input", KEYWORD_input},
	{"keygen", KEYWORD_keygen},
	{"link", KEYWORD_link},
	{"meta", KEYWORD_meta},
	{"param", KEYWORD_param},
	{"rel", KEYWORD_rel},
	{"source", KEYWORD_source},
	{"src", KEYWORD_src},
	{"track", KEYWORD_track},
	{"wbr", KEYWORD_wbr},
};
1543b7fe603SJiří Techet 
/* Token classes produced by readToken and readTokenText. */
typedef enum {
	TOKEN_EOF,
	TOKEN_NAME,			/* tag and attribute names */
	TOKEN_STRING,		/* single- or double-quoted attribute value */
	TOKEN_TEXT,			/* text up to the next '<' (readTokenText only) */
	TOKEN_TAG_START,	/* <  */
	TOKEN_TAG_START2,	/* </ */
	TOKEN_TAG_END,		/* >  */
	TOKEN_TAG_END2,		/* /> */
	TOKEN_EQUAL,		/* =  */
	TOKEN_COMMENT,		/* <!-- ... --> (only when comments are not skipped) */
	TOKEN_OTHER			/* <!..., <?..., or a '/' not followed by '>' */
} tokenType;
1683b7fe603SJiří Techet 
#ifdef DEBUG
/* Printable names of the tokenType values, indexed by token type.
 * Only compiled in DEBUG builds; used by dumpToken. */
const char *tokenTypes[] = {
#define E(X) [TOKEN_##X] = #X
	E(EOF),
	E(NAME),
	E(STRING),
	E(TEXT),
	E(TAG_START),
	E(TAG_START2),
	E(TAG_END),
	E(TAG_END2),
	E(EQUAL),
	E(COMMENT),
	E(OTHER),
#undef E
};
#endif
186b0a80065SMasatake YAMATO 
/* The current token: its class plus the text collected for it
 * (name, attribute value or element text; cleared at the start of each read). */
typedef struct {
	tokenType type;
	vString *string;
} tokenInfo;
1913b7fe603SJiří Techet 
1923b7fe603SJiří Techet 
/* Language id of the HTML parser; set by initialize and used for keyword lookups. */
static int Lang_html;


/* Forward declaration: readTagContent and readTag call each other recursively. */
static void readTag (tokenInfo *token, vString *text, int depth);
1973b7fe603SJiří Techet 
#ifdef DEBUG
#if 0
/* Debugging aid (currently disabled by the "#if 0"): prints the token type
 * name, the token text and the given context strings to stderr.
 * extra_context may be NULL, in which case "_" is printed instead. */
static void dumpToken (tokenInfo *token, const char *context, const char* extra_context)
{
	fprintf (stderr, "[%7s] %-20s@%s.%s\n",
			 tokenTypes[token->type], vStringValue(token->string),
			 context, extra_context? extra_context: "_");
}
#endif
#endif
2083b7fe603SJiří Techet 
/* Reads raw element text up to (but not including) the next '<'.
 *
 * On return token->type is TOKEN_TEXT when a '<' was found (the '<' is
 * pushed back to the input) or TOKEN_EOF when the input ended first.
 *
 * When collectText is true the characters are stored in token->string with
 * every whitespace character turned into a blank and runs of blanks
 * collapsed into one; otherwise the text is consumed and discarded. */
static void readTokenText (tokenInfo *const token, bool collectText)
{
	int previous = 'X';  /* any non-space value: nothing stored yet */

	vStringClear (token->string);

	for (;;)
	{
		int ch = getcFromInputFile ();

		if (ch == EOF)
		{
			token->type = TOKEN_EOF;
			return;
		}

		if (ch == '<')
		{
			/* leave the tag opener for readToken */
			ungetcToInputFile (ch);
			token->type = TOKEN_TEXT;
			return;
		}

		if (!collectText)
			continue;

		if (isspace (ch))
			ch = ' ';

		/* drop a blank that directly follows a stored blank */
		if (ch == ' ' && previous == ' ')
			continue;

		vStringPut (token->string, ch);
		previous = ch;
	}
}
2463b7fe603SJiří Techet 
/* Reads the next markup token from the input into *token.
 *
 * Leading whitespace is skipped. token->string is cleared first and is only
 * filled for TOKEN_STRING (attribute value without its quotes) and
 * TOKEN_NAME (lower-cased name); for all other token types it stays empty.
 *
 * skipComments: when true, "<!-- ... -->" comments are consumed silently and
 * scanning continues with the following token; when false a TOKEN_COMMENT is
 * returned for each comment. */
static void readToken (tokenInfo *const token, bool skipComments)
{
	int c;

	vStringClear (token->string);

getNextChar:

	c = getcFromInputFile ();
	while (isspace (c))
		c = getcFromInputFile ();

	switch (c)
	{
		case EOF:
			token->type = TOKEN_EOF;
			break;

		case '<':
		{
			int d = getcFromInputFile ();

			if (d == '!')
			{
				d = getcFromInputFile ();
				if (d == '-')
				{
					d = getcFromInputFile ();
					if (d == '-')
					{
						/* Inside "<!--": d, e, f hold the last three
						 * characters read; stop at "-->" or at EOF. */
						int e = ' ';
						int f = ' ';
						do
						{
							d = e;
							e = f;
							f = getcFromInputFile ();
						}
						while (f != EOF && ! (d == '-' && e == '-' && f == '>'));

						if (skipComments)
							goto getNextChar;
						else
						{
							token->type = TOKEN_COMMENT;
							break;	/* leaves the switch */
						}
					}
				}
				/* "<!" not starting a comment: only the last character
				 * read is pushed back */
				ungetcToInputFile (d);
				token->type = TOKEN_OTHER;
			}
			else if (d == '?')
				token->type = TOKEN_OTHER;
			else if (d == '/')
				token->type = TOKEN_TAG_START2;
			else
			{
				ungetcToInputFile (d);
				token->type = TOKEN_TAG_START;
			}
			break;
		}
		case '/':
		{
			int d = getcFromInputFile ();
			if (d == '>')
				token->type = TOKEN_TAG_END2;
			else
			{
				ungetcToInputFile (d);
				token->type = TOKEN_OTHER;
			}
			break;
		}
		case '>':
			token->type = TOKEN_TAG_END;
			break;

		case '=':
			token->type = TOKEN_EQUAL;
			break;

		case '"':
		case '\'':
		{
			/* quoted value: collect everything up to the matching quote
			 * (or EOF); the quotes themselves are not stored */
			const int delimiter = c;
			c = getcFromInputFile ();
			while (c != EOF && c != delimiter)
			{
				vStringPut (token->string, c);
				c = getcFromInputFile ();
			}
			token->type = TOKEN_STRING;
			break;
		}

		default:
		{
			/* name: runs until whitespace, one of < > / = ' " or EOF;
			 * stored lower-cased so keyword lookups are case-insensitive */
			do
			{
				vStringPut (token->string, tolower (c));
				c = getcFromInputFile ();
			}
			while (!isspace (c) && c != '<' && c != '>' && c != '/' &&
				   c != '=' && c != '\'' && c != '"' && c != EOF);
			if (c != EOF)
				ungetcToInputFile (c);
			token->type = TOKEN_NAME;
			break;
		}
	}
}
3603b7fe603SJiří Techet 
/* Appends appendedText to text.
 *
 * Nothing happens when text is NULL or appendedText is empty. If text ends
 * with a blank and appendedText begins with one, vStringStripTrailing is
 * applied to text first so the junction does not hold two blanks. */
static void appendText (vString *text, vString *appendedText)
{
	bool endsWithBlank;

	if (text == NULL || vStringLength (appendedText) == 0)
		return;

	endsWithBlank = vStringLength (text) > 0 && vStringLast (text) == ' ';
	if (endsWithBlank && vStringChar (appendedText, 0) == ' ')
		vStringStripTrailing (text);

	vStringCat (text, appendedText);
}
3733b7fe603SJiří Techet 
/* Reads the content of an element whose start tag has just been consumed.
 *
 * Text pieces are appended to text (when text is not NULL), comments are
 * skipped and nested start tags are handled by readTag with depth + 1.
 * Reading stops at the first token that is neither a comment nor a start
 * tag; that token is left in *token.
 *
 * line / lineOffset: receive the input position recorded just before the
 * last token was read, i.e. where the terminating token begins.
 *
 * Returns true when the terminating token is "</" (TOKEN_TAG_START2). */
static bool readTagContent (tokenInfo *token, vString *text, long *line, long *lineOffset, int depth)
{
	const bool wantText = (text != NULL);
	tokenType kind;

	for (;;)
	{
		/* text preceding the next piece of markup */
		readTokenText (token, wantText);
		appendText (text, token->string);

		*line = getInputLineNumber ();
		*lineOffset = getInputLineOffset ();
		readToken (token, false);
		kind = token->type;

		if (kind == TOKEN_TAG_START)
			readTag (token, text, depth + 1);
		else if (kind != TOKEN_COMMENT)
			break;
	}

	return kind == TOKEN_TAG_START2;
}
3993b7fe603SJiří Techet 
/* Skips the body of a <script> element by scanning for "</" immediately
 * followed by the name "script".
 *
 * On success *line and *lineOffset receive the input position recorded
 * just before the "</" token was read, and true is returned with the
 * "script" name left in *token. When the input ends first, false is
 * returned and *line / *lineOffset are not written. */
static bool skipScriptContent (tokenInfo *token, long *line, long *lineOffset)
{
	bool afterCloseStart = false;	/* previous token was "</" */
	long closeLine = 0;
	long closeOffset = 0;

	for (;;)
	{
		const long tokenLine = getInputLineNumber ();
		const long tokenOffset = getInputLineOffset ();

		readToken (token, false);

		if (token->type == TOKEN_TAG_START2)
		{
			/* remember where this candidate end tag starts */
			afterCloseStart = true;
			closeLine = tokenLine;
			closeOffset = tokenOffset;
			continue;
		}

		if (afterCloseStart
			&& token->type == TOKEN_NAME
			&& lookupKeyword (vStringValue (token->string), Lang_html) == KEYWORD_script)
		{
			*line = closeLine;
			*lineOffset = closeOffset;
			return true;
		}

		if (token->type == TOKEN_EOF)
			return false;

		afterCloseStart = false;
	}
}
439fd3a8075SMasatake YAMATO 
makeClassRefTags(const char * classes)440dfae3021SMasatake YAMATO static void makeClassRefTags (const char *classes)
441dfae3021SMasatake YAMATO {
442dfae3021SMasatake YAMATO 	vString *klass = vStringNew ();
443dfae3021SMasatake YAMATO 
444dfae3021SMasatake YAMATO 	do
445dfae3021SMasatake YAMATO 	{
446dfae3021SMasatake YAMATO 		if (*classes && !isspace (*classes))
447dfae3021SMasatake YAMATO 			vStringPut (klass, *classes);
448dfae3021SMasatake YAMATO 		else if (!vStringIsEmpty (klass))
449dfae3021SMasatake YAMATO 		{
450dfae3021SMasatake YAMATO 			makeSimpleRefTag (klass, K_CLASS,
451dfae3021SMasatake YAMATO 							  CLASS_KIND_ATTRIBUTE_ROLE);
452dfae3021SMasatake YAMATO 			vStringClear (klass);
453dfae3021SMasatake YAMATO 		}
454dfae3021SMasatake YAMATO 
455dfae3021SMasatake YAMATO 		if (!*classes)
456dfae3021SMasatake YAMATO 			break;
457dfae3021SMasatake YAMATO 
458dfae3021SMasatake YAMATO 		classes++;
459dfae3021SMasatake YAMATO 	} while (1);
460dfae3021SMasatake YAMATO 
461dfae3021SMasatake YAMATO 	vStringDelete (klass);
462dfae3021SMasatake YAMATO }
463dfae3021SMasatake YAMATO 
/* Reads the "= value" part that may follow an attribute name.
 *
 * Returns true when an '=' followed by a quoted string (or, if acceptName
 * is true, an unquoted name) was read; the value is then in token->string.
 * In every case *token holds the last token that was read. */
static bool readAttributeValue (tokenInfo *token, bool acceptName)
{
	readToken (token, true);
	if (token->type != TOKEN_EQUAL)
		return false;

	readToken (token, true);
	return token->type == TOKEN_STRING
		|| (acceptName && token->type == TOKEN_NAME);
}

/* Parses one element; called right after its '<' has been read.
 *
 * Attributes are scanned for class, id, <a name>, <script src> and
 * <link rel/href>, emitting the corresponding tags. For non-void elements
 * closed with '>' (and while depth < MAX_DEPTH) the content is read too:
 * - <script> bodies are skipped and handed to the JavaScript parser,
 * - <style> bodies are handed to the CSS parser,
 * - the text of <title>, <h1>, <h2> and <h3> becomes a tag of the
 *   matching kind.
 *
 * text:  buffer collecting the text of an enclosing heading element, or
 *        NULL when there is none. When NULL and this element is itself a
 *        heading, a buffer is created here and freed before returning.
 * depth: current nesting level, used to bound the recursion. */
static void readTag (tokenInfo *token, vString *text, int depth)
{
	bool textCreated = false;

	readToken (token, true);
	if (token->type == TOKEN_NAME)
	{
		keywordId startTag;
		bool isHeading;
		bool isVoid;
		vString *stylesheet = NULL;
		bool stylesheet_expectation = false;

		startTag = lookupKeyword (vStringValue (token->string), Lang_html);
		isHeading = (KEYWORD_heading_start <= startTag && startTag <= KEYWORD_heading_end);
		isVoid = (startTag >= KEYWORD_area && startTag <= KEYWORD_wbr);
		if (text == NULL && isHeading)
		{
			text = vStringNew ();
			textCreated = true;
		}

		/* attribute list, up to '>', '/>' or EOF */
		do
		{
			keywordId attribute = KEYWORD_NONE;

			readToken (token, true);
			if (token->type == TOKEN_NAME)
				attribute = lookupKeyword (vStringValue (token->string), Lang_html);

			if (attribute == KEYWORD_class)
			{
				if (readAttributeValue (token, false))
					makeClassRefTags (vStringValue (token->string));
			}
			else if (attribute == KEYWORD_id)
			{
				if (readAttributeValue (token, false))
					makeSimpleTag (token->string, K_ID);
			}
			else if (startTag == KEYWORD_a && attribute == KEYWORD_name)
			{
				/* anchors also accept an unquoted name */
				if (readAttributeValue (token, true))
					makeSimpleTag (token->string, K_ANCHOR);
			}
			else if (startTag == KEYWORD_script && attribute == KEYWORD_src)
			{
				if (readAttributeValue (token, false))
					makeSimpleRefTag (token->string, K_SCRIPT,
									  SCRIPT_KIND_EXTERNAL_FILE_ROLE);
			}
			else if (startTag == KEYWORD_link)
			{
				if (attribute == KEYWORD_rel)
				{
					/* strcmp is not enough:
					 * e.g. <link href="fancy.css"
					 *            rel="alternate stylesheet" title="Fancy"> */
					if (readAttributeValue (token, false) &&
						vStringLength (token->string) >= 10 &&
						strstr (vStringValue (token->string), "stylesheet"))
						stylesheet_expectation = true;
				}
				else if (attribute == KEYWORD_href)
				{
					if (readAttributeValue (token, false))
					{
						if (stylesheet == NULL)
							stylesheet = vStringNewCopy (token->string);
						else
							vStringCopy (stylesheet, token->string);
					}
				}

				/* rel and href may come in either order: emit the tag
				 * once both have been seen */
				if (stylesheet_expectation && stylesheet && !vStringIsEmpty (stylesheet))
				{
					makeSimpleRefTag (stylesheet, K_STYELSHEET,
									  STYLESHEET_KIND_EXTERNAL_FILE_ROLE);
					stylesheet_expectation = false;
					vStringClear (stylesheet);
				}
			}
		}
		while (token->type != TOKEN_TAG_END && token->type != TOKEN_TAG_END2 &&
			   token->type != TOKEN_EOF);

		vStringDelete (stylesheet);
		stylesheet = NULL;

		if (!isVoid && token->type == TOKEN_TAG_END && depth < MAX_DEPTH)
		{
			long startSourceLineNumber = getSourceLineNumber ();
			long startLineNumber = getInputLineNumber ();
			long startLineOffset = getInputLineOffset ();
			long endLineNumber;
			long endLineOffset;
			bool tag_start2;

			if (startTag == KEYWORD_script)
			{
				bool script = skipScriptContent (token, &endLineNumber, &endLineOffset);
				if (script)
					makePromise ("JavaScript", startLineNumber, startLineOffset,
								 endLineNumber, endLineOffset, startSourceLineNumber);
				readToken (token, true);
				goto out;
			}

			tag_start2 = readTagContent (token, text, &endLineNumber, &endLineOffset, depth);
			if (tag_start2)
			{
				/* name of the end tag */
				readToken (token, true);
				if (isHeading && textCreated && vStringLength (text) > 0)
				{
					keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
					if (startTag == endTag)
					{
						htmlKind headingKind;

						/* isHeading guarantees startTag is one of
						 * title, h1, h2 or h3 */
						switch (startTag)
						{
							case KEYWORD_title:
								headingKind = K_TITLE;
								break;
							case KEYWORD_h1:
								headingKind = K_HEADING1;
								break;
							case KEYWORD_h2:
								headingKind = K_HEADING2;
								break;
							default:
								headingKind = K_HEADING3;
								break;
						}

						vStringStripLeading (text);
						vStringStripTrailing (text);
						makeSimpleTag (text, headingKind);
					}
				}
				else if (startTag == KEYWORD_style)
				{
					keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
					if (startTag == endTag)
						makePromise ("CSS", startLineNumber, startLineOffset,
									 endLineNumber, endLineOffset, startSourceLineNumber);
				}

				readToken (token, true);
			}
		}
	}

 out:
	if (textCreated)
		vStringDelete (text);
}
6443b7fe603SJiří Techet 
findHtmlTags(void)6453b7fe603SJiří Techet static void findHtmlTags (void)
6463b7fe603SJiří Techet {
6473b7fe603SJiří Techet 	tokenInfo token;
6483b7fe603SJiří Techet 
6493b7fe603SJiří Techet 	token.string = vStringNew ();
6503b7fe603SJiří Techet 
6513b7fe603SJiří Techet 	do
6523b7fe603SJiří Techet 	{
653ce990805SThomas Braun 		readToken (&token, true);
6543b7fe603SJiří Techet 		if (token.type == TOKEN_TAG_START)
655b119ea03SJiří Techet 			readTag (&token, NULL, 0);
6563b7fe603SJiří Techet 	}
6573b7fe603SJiří Techet 	while (token.type != TOKEN_EOF);
6583b7fe603SJiří Techet 
6593b7fe603SJiří Techet 	vStringDelete (token.string);
6603b7fe603SJiří Techet }
6613b7fe603SJiří Techet 
initialize(const langType language)6623b7fe603SJiří Techet static void initialize (const langType language)
6633b7fe603SJiří Techet {
6643b7fe603SJiří Techet 	Lang_html = language;
6653b7fe603SJiří Techet }
6663b7fe603SJiří Techet 
6673b7fe603SJiří Techet /* parser definition */
HtmlParser(void)6683ae02089SMasatake YAMATO extern parserDefinition* HtmlParser (void)
6693ae02089SMasatake YAMATO {
6703ae02089SMasatake YAMATO 	static const char *const extensions [] = { "htm", "html", NULL };
6713b7fe603SJiří Techet 	parserDefinition* def = parserNew ("HTML");
67209ae690fSMasatake YAMATO 	def->kindTable        = HtmlKinds;
6733b7fe603SJiří Techet 	def->kindCount    = ARRAY_SIZE (HtmlKinds);
6743ae02089SMasatake YAMATO 	def->extensions   = extensions;
6753b7fe603SJiří Techet 	def->parser       = findHtmlTags;
6763b7fe603SJiří Techet 	def->initialize   = initialize;
6773b7fe603SJiří Techet 	def->keywordTable = HtmlKeywordTable;
6783b7fe603SJiří Techet 	def->keywordCount = ARRAY_SIZE (HtmlKeywordTable);
6793ae02089SMasatake YAMATO 	return def;
6803ae02089SMasatake YAMATO }
681