xref: /Universal-ctags/parsers/markdown.c (revision 532866040b26c111e1e2a75f9383272afb02abfe)
1 /*
2  *
3  *  Copyright (c) 2007-2011, Nick Treleaven
4  *  Copyright (c) 2012, Lex Trotman
5  *  Copyright (c) 2021, Jiri Techet
6  *
7  *   This source code is released for free distribution under the terms of the
8  *   GNU General Public License version 2 or (at your option) any later version.
9  *
10  * This module contains functions for generating tags for markdown files.
11  *
12  * This parser was based on the asciidoc parser.
13  *
14  * Extended syntax like footnotes is described in
15  * https://www.markdownguide.org/extended-syntax/
16  */
17 
18 /*
19  *   INCLUDE FILES
20  */
21 #include "general.h"	/* must always come first */
22 
23 #include <ctype.h>
24 #include <string.h>
25 
26 #include "debug.h"
27 #include "entry.h"
28 #include "parse.h"
29 #include "read.h"
30 #include "vstring.h"
31 #include "nestlevel.h"
32 #include "routines.h"
33 #include "promise.h"
34 #include "htable.h"
35 
36 #include "markdown.h"
37 
38 /*
39  *   DATA DEFINITIONS
40  */
/*
 * Tag kinds emitted by this parser.
 *
 * The first K_SECTION_COUNT values are heading levels ordered from the
 * outermost (K_CHAPTER) to the innermost (K_LEVEL5SECTION); the parser
 * relies on that ordering when it compares kind indexes to decide nesting.
 * K_FOOTNOTE deliberately shares the value of K_SECTION_COUNT so that it
 * is the first kind after the heading levels.
 */
typedef enum {
	K_CHAPTER = 0,
	K_SECTION,
	K_SUBSECTION,
	K_SUBSUBSECTION,
	K_LEVEL4SECTION,
	K_LEVEL5SECTION,
	K_SECTION_COUNT,	/* number of heading kinds, not a kind itself */
	K_FOOTNOTE = K_SECTION_COUNT,
} markdownKind;
51 
/*
 * Kind table registered in MarkdownParser(). Entries are listed in the same
 * order as the markdownKind enumerators, which are used as kind indexes when
 * tags are created.
 * NOTE(review): the positional initializers look like
 * { enabled, letter, name, description } -- confirm against kindDefinition.
 */
static kindDefinition MarkdownKinds[] = {
	{ true, 'c', "chapter",       "chapters"},
	{ true, 's', "section",       "sections" },
	{ true, 'S', "subsection",    "level 2 sections" },
	{ true, 't', "subsubsection", "level 3 sections" },
	{ true, 'T', "l4subsection",  "level 4 sections" },
	{ true, 'u', "l5subsection",  "level 5 sections" },
	{ true, 'n', "footnote",      "footnotes" },
};
61 
/*
 * Parser-specific fields registered in MarkdownParser(), indexed by
 * markdownField. "sectionMarker" is initialised as disabled; its value is
 * attached to each section tag by makeSectionMarkdownTag().
 */
static fieldDefinition MarkdownFields [] = {
	{
	  .enabled     = false,
	  .name        = "sectionMarker",
	  .description = "character used for declaring section(#, ##, =, or -)",
	},
};
69 
/* Indexes into MarkdownFields. */
typedef enum {
	F_MARKER,	/* the "sectionMarker" field */
} markdownField;
73 
/*
 * Stack of the sections that are currently open. It is created at the start
 * of findMarkdownTags() and freed at its end; between runs it is not valid.
 */
static NestingLevels *nestingLevels = NULL;
75 
76 /*
77 *   FUNCTION DEFINITIONS
78 */
79 
getNestingLevel(const int kind,unsigned long adjustmentWhenPop)80 static NestingLevel *getNestingLevel (const int kind, unsigned long adjustmentWhenPop)
81 {
82 	NestingLevel *nl;
83 	tagEntryInfo *e;
84 	unsigned long line = getInputLineNumber ();
85 
86 	line = (line > adjustmentWhenPop)? (line - adjustmentWhenPop): 0;
87 
88 	while (1)
89 	{
90 		nl = nestingLevelsGetCurrent (nestingLevels);
91 		e = getEntryOfNestingLevel (nl);
92 		if ((nl && (e == NULL)) || (e && (e->kindIndex >= kind)))
93 			nestingLevelsPopFull (nestingLevels, HT_UINT_TO_PTR ((unsigned int)line));
94 		else
95 			break;
96 	}
97 	return nl;
98 }
99 
100 
makeMarkdownTag(const vString * const name,const int kind,const bool twoLine)101 static int makeMarkdownTag (const vString* const name, const int kind, const bool twoLine)
102 {
103 	int r = CORK_NIL;
104 
105 	if (vStringLength (name) > 0)
106 	{
107 		const NestingLevel *const nl = getNestingLevel (kind, twoLine? 2: 1);
108 		tagEntryInfo *parent = getEntryOfNestingLevel (nl);
109 		tagEntryInfo e;
110 
111 		initTagEntry (&e, vStringValue (name), kind);
112 
113 		if (twoLine)
114 		{
115 			/* we want the line before the '---' underline chars */
116 			const unsigned long line = getInputLineNumber ();
117 			Assert (line > 0);
118 			if (line > 0)
119 			{
120 				e.lineNumber--;
121 				e.filePosition = getInputFilePositionForLine (line - 1);
122 			}
123 		}
124 
125 		if (parent && (parent->kindIndex < kind))
126 			e.extensionFields.scopeIndex = nl->corkIndex;
127 
128 		r = makeTagEntry (&e);
129 	}
130 	return r;
131 }
132 
133 
makeSectionMarkdownTag(const vString * const name,const int kind,const char * marker)134 static int makeSectionMarkdownTag (const vString* const name, const int kind, const char *marker)
135 {
136 	int r = makeMarkdownTag (name, kind, marker[0] != '#');
137 	attachParserFieldToCorkEntry (r, MarkdownFields [F_MARKER].ftype, marker);
138 
139 	nestingLevelsPush (nestingLevels, r);
140 	return r;
141 }
142 
143 
getHeading(const int kind,const unsigned char * line,const int lineLen,bool * delimited)144 static vString *getHeading (const int kind, const unsigned char *line,
145 	const int lineLen, bool *delimited)
146 {
147 	int pos = 0;
148 	int start = kind + 1;
149 	int end = lineLen - 1;
150 	vString *name = vStringNew ();
151 
152 	Assert (kind >= 0 && kind < K_SECTION_COUNT);
153 	Assert (lineLen > start);
154 
155 	*delimited = false;
156 	while (isspace (line[pos])) ++pos;
157 	while (line[end] == line[pos] && end - 1 >= 0 && line[end - 1] != '\\')
158 	{
159 		--end;
160 		*delimited = true;
161 	}
162 	while (isspace (line[start])) ++start;
163 	while (isspace (line[end])) --end;
164 
165 	if (start <= end)
166 		vStringNCatS (name, (const char*)(&(line[start])), end - start + 1);
167 
168 	return name;
169 }
170 
171 
/*
 * Finds the first non-whitespace character of a line.
 *
 * line:     input line.
 * lineLen:  number of bytes of `line` that may be inspected.
 * indented: out parameter; set to true when the leading whitespace is at
 *           least 4 columns wide, counting a tab as 4 columns and every other
 *           whitespace character as 1.
 *
 * Returns the index of the first non-whitespace character, or lineLen when
 * the inspected part of the line consists of whitespace only.
 */
static int getFirstCharPos (const unsigned char *line, int lineLen, bool *indented)
{
	int width = 0;
	int pos = 0;

	while (pos < lineLen && isspace (line[pos]))
	{
		if (line[pos] == '\t')
			width += 4;
		else
			width += 1;
		++pos;
	}

	*indented = (width >= 4);
	return pos;
}
181 
182 
/*
 * Callback registered with nestingLevelsNewFull() in findMarkdownTags().
 * Stores the line number carried in ctxData as the end line of the tag entry
 * that belongs to the nesting level; does nothing when the level has no
 * entry.
 *
 * nl:      nesting level being processed.
 * ctxData: line number packed into a pointer with HT_UINT_TO_PTR().
 */
static void fillEndField (NestingLevel *nl, void *ctxData)
{
	tagEntryInfo *const entry = getEntryOfNestingLevel (nl);

	if (entry == NULL)
		return;

	entry->extensionFields.endLine = (unsigned long)(HT_PTR_TO_UINT (ctxData));
}
192 
getFootnoteMaybe(const char * line)193 static void getFootnoteMaybe (const char *line)
194 {
195 	const char *start = strstr (line, "[^");
196 	const char *end = start? strstr(start + 2, "]:"): NULL;
197 
198 	if (! (start && end))
199 		return;
200 	if (! (end > (start + 2)))
201 		return;
202 
203 	vString * footnote = vStringNewNInit (start + 2, end - (start + 2));
204 	const NestingLevel *const nl = nestingLevelsGetCurrent (nestingLevels);
205 	tagEntryInfo e;
206 
207 	initTagEntry (&e, vStringValue (footnote), K_FOOTNOTE);
208 	if (nl)
209 		e.extensionFields.scopeIndex = nl->corkIndex;
210 	makeTagEntry (&e);
211 
212 	vStringDelete (footnote);
213 }
214 
/*
 * Asks the subparsers to work out the language of a fenced code block.
 *
 * langMarker: text that follows the opening fence characters.
 * codeLang:   string handed to the subparser callback, which is expected to
 *             store the language name in it.
 *
 * Subparsers are visited in turn; those without an
 * extractLanguageForCodeBlock callback are skipped. The loop stops at the
 * first callback that returns true.
 *
 * Returns true when some subparser reported success, false otherwise.
 */
static bool extractLanguageForCodeBlock (const char *langMarker,
										 vString *codeLang)
{
	subparser *sub;
	bool found = false;

	foreachSubparser (sub, false)
	{
		markdownSubparser *md = (markdownSubparser *)sub;

		enterSubparser(sub);
		if (md->extractLanguageForCodeBlock)
			found = md->extractLanguageForCodeBlock (md, langMarker, codeLang);
		leaveSubparser();

		if (found)
			break;
	}

	return found;
}
234 
findMarkdownTags(void)235 static void findMarkdownTags (void)
236 {
237 	vString *prevLine = vStringNew ();
238 	vString *codeLang = vStringNew ();
239 	const unsigned char *line;
240 	char inCodeChar = 0;
241 	long startSourceLineNumber = 0;
242 	long startLineNumber = 0;
243 	bool inPreambule = false;
244 	bool inComment = false;
245 
246 	subparser *sub = getSubparserRunningBaseparser();
247 	if (sub)
248 		chooseExclusiveSubparser (sub, NULL);
249 
250 	nestingLevels = nestingLevelsNewFull (0, fillEndField);
251 
252 	while ((line = readLineFromInputFile ()) != NULL)
253 	{
254 		int lineLen = strlen ((const char*) line);
255 		bool lineProcessed = false;
256 		bool indented;
257 		int pos = getFirstCharPos (line, lineLen, &indented);
258 		const int lineNum = getInputLineNumber ();
259 
260 		if (lineNum == 1 || inPreambule)
261 		{
262 			if (line[pos] == '-' && line[pos + 1] == '-' && line[pos + 2] == '-')
263 			{
264 				if (inPreambule)
265 				{
266 					long endLineNumber = lineNum;
267 					if (startLineNumber < endLineNumber)
268 						makePromise ("FrontMatter", startLineNumber, 0,
269 									 endLineNumber, 0, startSourceLineNumber);
270 				}
271 				else
272 					startSourceLineNumber = startLineNumber = lineNum;
273 				inPreambule = !inPreambule;
274 			}
275 		}
276 
277 		if (inPreambule)
278 			continue;
279 
280 		/* fenced code block */
281 		if (line[pos] == '`' || line[pos] == '~')
282 		{
283 			char c = line[pos];
284 			char otherC = c == '`' ? '~' : '`';
285 			int nSame;
286 			for (nSame = 1; line[nSame] == line[pos]; ++nSame);
287 
288 			if (inCodeChar != otherC && nSame >= 3)
289 			{
290 				inCodeChar = inCodeChar ? 0 : c;
291 				if (inCodeChar == c && strstr ((const char *)(line + pos + nSame), "```") != NULL)
292 					inCodeChar = 0;
293 				else if (inCodeChar)
294 				{
295 					const char *langMarker = (const char *)(line + pos + nSame);
296 					startLineNumber = startSourceLineNumber = lineNum + 1;
297 
298 					vStringClear (codeLang);
299 					if (! extractLanguageForCodeBlock (langMarker, codeLang))
300 					{
301 						vStringCopyS (codeLang, langMarker);
302 						vStringStripLeading (codeLang);
303 						vStringStripTrailing (codeLang);
304 					}
305 				}
306 				else
307 				{
308 					long endLineNumber = lineNum;
309 					if (vStringLength (codeLang) > 0
310 						&& startLineNumber < endLineNumber)
311 						makePromise (vStringValue (codeLang), startLineNumber, 0,
312 							endLineNumber, 0, startSourceLineNumber);
313 				}
314 
315 				lineProcessed = true;
316 			}
317 		}
318 		/* XML comment start */
319 		else if (lineLen >= pos + 4 && line[pos] == '<' && line[pos + 1] == '!' &&
320 			line[pos + 2] == '-' && line[pos + 3] == '-')
321 		{
322 			if (strstr ((const char *)(line + pos + 4), "-->") == NULL)
323 				inComment = true;
324 			lineProcessed = true;
325 		}
326 		/* XML comment end */
327 		else if (inComment && strstr ((const char *)(line + pos), "-->"))
328 		{
329 			inComment = false;
330 			lineProcessed = true;
331 		}
332 
333 		/* code block or comment */
334 		if (inCodeChar || inComment)
335 			lineProcessed = true;
336 
337 		/* code block using indent */
338 		else if (indented)
339 			lineProcessed = true;
340 
341 		/* if it's a title underline, or a delimited block marking character */
342 		else if (line[pos] == '=' || line[pos] == '-' || line[pos] == '#' || line[pos] == '>')
343 		{
344 			int nSame;
345 			for (nSame = 1; line[nSame] == line[pos]; ++nSame);
346 
347 			/* quote */
348 			if (line[pos] == '>')
349 				;  /* just to make sure lineProcessed = true so it won't be in a heading */
350 			/* is it a two line title */
351 			else if (line[pos] == '=' || line[pos] == '-')
352 			{
353 				char marker[2] = { line[pos], '\0' };
354 				int kind = line[pos] == '=' ? K_CHAPTER : K_SECTION;
355 				bool whitespaceTerminated = true;
356 
357 				for (int i = pos + nSame; i < lineLen; i++)
358 				{
359 					if (!isspace (line[i]))
360 					{
361 						whitespaceTerminated = false;
362 						break;
363 					}
364 				}
365 
366 				vStringStripLeading (prevLine);
367 				vStringStripTrailing (prevLine);
368 				if (whitespaceTerminated && vStringLength (prevLine) > 0)
369 					makeSectionMarkdownTag (prevLine, kind, marker);
370 			}
371 			/* otherwise is it a one line title */
372 			else if (line[pos] == '#' && nSame <= K_SECTION_COUNT && isspace (line[nSame]))
373 			{
374 				int kind = nSame - 1;
375 				bool delimited = false;
376 				vString *name = getHeading (kind, line, lineLen, &delimited);
377 				if (vStringLength (name) > 0)
378 					makeSectionMarkdownTag (name, kind, delimited ? "##" : "#");
379 				vStringDelete (name);
380 			}
381 
382 			lineProcessed = true;
383 		}
384 
385 		vStringClear (prevLine);
386 		if (!lineProcessed)
387 		{
388 			getFootnoteMaybe ((const char *)line);
389 			vStringCatS (prevLine, (const char*) line);
390 		}
391 	}
392 	vStringDelete (prevLine);
393 	vStringDelete (codeLang);
394 	{
395 		unsigned int line = (unsigned int)getInputLineNumber ();
396 		nestingLevelsFreeFull (nestingLevels, HT_UINT_TO_PTR (line));
397 	}
398 }
399 
MarkdownParser(void)400 extern parserDefinition* MarkdownParser (void)
401 {
402 	parserDefinition* const def = parserNew ("Markdown");
403 	static const char *const extensions [] = { "md", "markdown", NULL };
404 
405 	def->enabled  = true;
406 	def->extensions = extensions;
407 	def->useCork = CORK_QUEUE;
408 	def->kindTable = MarkdownKinds;
409 	def->kindCount = ARRAY_SIZE (MarkdownKinds);
410 	def->fieldTable = MarkdownFields;
411 	def->fieldCount = ARRAY_SIZE (MarkdownFields);
412 	def->defaultScopeSeparator = "\"\"";
413 	def->parser = findMarkdownTags;
414 
415 	/*
416 	 * This setting (useMemoryStreamInput) is for running
417 	 * Yaml parser from YamlFrontMatter as subparser.
418 	 * YamlFrontMatter is run from FrontMatter as a gust parser.
419 	 * FrontMatter is run from Markdown as a guest parser.
420 	 * This stacked structure hits the limitation of the main
421 	 * part: subparser's requirement for memory based input stream
422 	 * is not propagated to the main part.
423 	 *
424 	 * TODO: instead of setting useMemoryStreamInput here, we
425 	 * should remove the limitation.
426 	 */
427 	def->useMemoryStreamInput = true;
428 
429 	return def;
430 }
431