1 /*
2 *
3 * Copyright (c) 2007-2011, Nick Treleaven
4 * Copyright (c) 2012, Lex Trotman
5 * Copyright (c) 2021, Jiri Techet
6 *
7 * This source code is released for free distribution under the terms of the
8 * GNU General Public License version 2 or (at your option) any later version.
9 *
10 * This module contains functions for generating tags for markdown files.
11 *
12 * This parser was based on the asciidoc parser.
13 *
14 * Extended syntax like footnotes is described in
15 * https://www.markdownguide.org/extended-syntax/
16 */
17
18 /*
19 * INCLUDE FILES
20 */
21 #include "general.h" /* must always come first */
22
23 #include <ctype.h>
24 #include <string.h>
25
26 #include "debug.h"
27 #include "entry.h"
28 #include "parse.h"
29 #include "read.h"
30 #include "vstring.h"
31 #include "nestlevel.h"
32 #include "routines.h"
33 #include "promise.h"
34 #include "htable.h"
35
36 #include "markdown.h"
37
38 /*
39 * DATA DEFINITIONS
40 */
/*
 * Tag kinds produced by this parser.
 *
 * The first K_SECTION_COUNT enumerators are heading depths: for one-line
 * headings the kind is (number of '#' characters - 1); for underlined
 * headings '=' maps to K_CHAPTER and '-' maps to K_SECTION.
 * K_FOOTNOTE deliberately shares its value with K_SECTION_COUNT so that it is
 * the first kind after the heading kinds.
 * The enumerators are used as kind indexes, so their order is expected to
 * follow the rows of MarkdownKinds.
 */
typedef enum {
	K_CHAPTER = 0,
	K_SECTION,
	K_SUBSECTION,
	K_SUBSUBSECTION,
	K_LEVEL4SECTION,
	K_LEVEL5SECTION,
	K_SECTION_COUNT,	/* number of heading kinds, not a kind itself */
	K_FOOTNOTE = K_SECTION_COUNT,
} markdownKind;
51
/*
 * Kind table registered through MarkdownParser (): one row per markdownKind
 * value, in the same order (six heading depths followed by footnotes).
 */
static kindDefinition MarkdownKinds[] = {
	{ true, 'c', "chapter",       "chapters"},
	{ true, 's', "section",       "sections" },
	{ true, 'S', "subsection",    "level 2 sections" },
	{ true, 't', "subsubsection", "level 3 sections" },
	{ true, 'T', "l4subsection",  "level 4 sections" },
	{ true, 'u', "l5subsection",  "level 5 sections" },
	{ true, 'n', "footnote",      "footnotes" },
};
61
/*
 * Parser-specific fields, indexed by markdownField.
 * "sectionMarker" is disabled by default; when attached it carries the marker
 * string passed to makeSectionMarkdownTag () ("#", "##", "=" or "-").
 */
static fieldDefinition MarkdownFields [] = {
	{
	  .enabled     = false,
	  .name        = "sectionMarker",
	  .description = "character used for declaring section(#, ##, =, or -)",
	},
};
69
/* Indexes into MarkdownFields. */
typedef enum {
	F_MARKER,	/* the "sectionMarker" field */
} markdownField;
73
/*
 * Stack of the headings currently open while a file is being parsed.
 * Created at the start of findMarkdownTags () and freed at its end.
 */
static NestingLevels *nestingLevels = NULL;
75
76 /*
77 * FUNCTION DEFINITIONS
78 */
79
getNestingLevel(const int kind,unsigned long adjustmentWhenPop)80 static NestingLevel *getNestingLevel (const int kind, unsigned long adjustmentWhenPop)
81 {
82 NestingLevel *nl;
83 tagEntryInfo *e;
84 unsigned long line = getInputLineNumber ();
85
86 line = (line > adjustmentWhenPop)? (line - adjustmentWhenPop): 0;
87
88 while (1)
89 {
90 nl = nestingLevelsGetCurrent (nestingLevels);
91 e = getEntryOfNestingLevel (nl);
92 if ((nl && (e == NULL)) || (e && (e->kindIndex >= kind)))
93 nestingLevelsPopFull (nestingLevels, HT_UINT_TO_PTR ((unsigned int)line));
94 else
95 break;
96 }
97 return nl;
98 }
99
100
makeMarkdownTag(const vString * const name,const int kind,const bool twoLine)101 static int makeMarkdownTag (const vString* const name, const int kind, const bool twoLine)
102 {
103 int r = CORK_NIL;
104
105 if (vStringLength (name) > 0)
106 {
107 const NestingLevel *const nl = getNestingLevel (kind, twoLine? 2: 1);
108 tagEntryInfo *parent = getEntryOfNestingLevel (nl);
109 tagEntryInfo e;
110
111 initTagEntry (&e, vStringValue (name), kind);
112
113 if (twoLine)
114 {
115 /* we want the line before the '---' underline chars */
116 const unsigned long line = getInputLineNumber ();
117 Assert (line > 0);
118 if (line > 0)
119 {
120 e.lineNumber--;
121 e.filePosition = getInputFilePositionForLine (line - 1);
122 }
123 }
124
125 if (parent && (parent->kindIndex < kind))
126 e.extensionFields.scopeIndex = nl->corkIndex;
127
128 r = makeTagEntry (&e);
129 }
130 return r;
131 }
132
133
makeSectionMarkdownTag(const vString * const name,const int kind,const char * marker)134 static int makeSectionMarkdownTag (const vString* const name, const int kind, const char *marker)
135 {
136 int r = makeMarkdownTag (name, kind, marker[0] != '#');
137 attachParserFieldToCorkEntry (r, MarkdownFields [F_MARKER].ftype, marker);
138
139 nestingLevelsPush (nestingLevels, r);
140 return r;
141 }
142
143
getHeading(const int kind,const unsigned char * line,const int lineLen,bool * delimited)144 static vString *getHeading (const int kind, const unsigned char *line,
145 const int lineLen, bool *delimited)
146 {
147 int pos = 0;
148 int start = kind + 1;
149 int end = lineLen - 1;
150 vString *name = vStringNew ();
151
152 Assert (kind >= 0 && kind < K_SECTION_COUNT);
153 Assert (lineLen > start);
154
155 *delimited = false;
156 while (isspace (line[pos])) ++pos;
157 while (line[end] == line[pos] && end - 1 >= 0 && line[end - 1] != '\\')
158 {
159 --end;
160 *delimited = true;
161 }
162 while (isspace (line[start])) ++start;
163 while (isspace (line[end])) --end;
164
165 if (start <= end)
166 vStringNCatS (name, (const char*)(&(line[start])), end - start + 1);
167
168 return name;
169 }
170
171
/*
 * Return the index of the first non-whitespace byte of `line` (lineLen when
 * there is none).
 *
 * `*indented` is set to true when the leading whitespace is at least four
 * columns wide, counting a tab as four columns and any other whitespace byte
 * as one.
 */
static int getFirstCharPos (const unsigned char *line, int lineLen, bool *indented)
{
	int width = 0;
	int idx = 0;

	while (idx < lineLen && isspace (line[idx]))
	{
		if (line[idx] == '\t')
			width += 4;
		else
			width += 1;
		idx++;
	}

	*indented = (width >= 4);
	return idx;
}
181
182
/*
 * Callback given to nestingLevelsNewFull (): stores the line number carried
 * in `ctxData` as the end line of the tag attached to `nl`.  Levels without
 * a tag entry are left alone.
 */
static void fillEndField (NestingLevel *nl, void *ctxData)
{
	tagEntryInfo *const entry = getEntryOfNestingLevel (nl);

	if (entry == NULL)
		return;

	entry->extensionFields.endLine = (unsigned long)(HT_PTR_TO_UINT (ctxData));
}
192
getFootnoteMaybe(const char * line)193 static void getFootnoteMaybe (const char *line)
194 {
195 const char *start = strstr (line, "[^");
196 const char *end = start? strstr(start + 2, "]:"): NULL;
197
198 if (! (start && end))
199 return;
200 if (! (end > (start + 2)))
201 return;
202
203 vString * footnote = vStringNewNInit (start + 2, end - (start + 2));
204 const NestingLevel *const nl = nestingLevelsGetCurrent (nestingLevels);
205 tagEntryInfo e;
206
207 initTagEntry (&e, vStringValue (footnote), K_FOOTNOTE);
208 if (nl)
209 e.extensionFields.scopeIndex = nl->corkIndex;
210 makeTagEntry (&e);
211
212 vStringDelete (footnote);
213 }
214
/*
 * Offer the text following a code-fence marker to each subparser, in turn,
 * that provides an extractLanguageForCodeBlock callback.  Stops at the first
 * callback that returns true and returns true; returns false when none does.
 * `codeLang` is handed to the callbacks unchanged.
 */
static bool extractLanguageForCodeBlock (const char *langMarker,
	vString *codeLang)
{
	subparser *sp;
	bool found = false;

	foreachSubparser (sp, false)
	{
		markdownSubparser *const md = (markdownSubparser *)sp;

		enterSubparser (sp);
		if (md->extractLanguageForCodeBlock != NULL)
			found = md->extractLanguageForCodeBlock (md, langMarker, codeLang);
		leaveSubparser ();

		if (found)
			break;
	}

	return found;
}
234
findMarkdownTags(void)235 static void findMarkdownTags (void)
236 {
237 vString *prevLine = vStringNew ();
238 vString *codeLang = vStringNew ();
239 const unsigned char *line;
240 char inCodeChar = 0;
241 long startSourceLineNumber = 0;
242 long startLineNumber = 0;
243 bool inPreambule = false;
244 bool inComment = false;
245
246 subparser *sub = getSubparserRunningBaseparser();
247 if (sub)
248 chooseExclusiveSubparser (sub, NULL);
249
250 nestingLevels = nestingLevelsNewFull (0, fillEndField);
251
252 while ((line = readLineFromInputFile ()) != NULL)
253 {
254 int lineLen = strlen ((const char*) line);
255 bool lineProcessed = false;
256 bool indented;
257 int pos = getFirstCharPos (line, lineLen, &indented);
258 const int lineNum = getInputLineNumber ();
259
260 if (lineNum == 1 || inPreambule)
261 {
262 if (line[pos] == '-' && line[pos + 1] == '-' && line[pos + 2] == '-')
263 {
264 if (inPreambule)
265 {
266 long endLineNumber = lineNum;
267 if (startLineNumber < endLineNumber)
268 makePromise ("FrontMatter", startLineNumber, 0,
269 endLineNumber, 0, startSourceLineNumber);
270 }
271 else
272 startSourceLineNumber = startLineNumber = lineNum;
273 inPreambule = !inPreambule;
274 }
275 }
276
277 if (inPreambule)
278 continue;
279
280 /* fenced code block */
281 if (line[pos] == '`' || line[pos] == '~')
282 {
283 char c = line[pos];
284 char otherC = c == '`' ? '~' : '`';
285 int nSame;
286 for (nSame = 1; line[nSame] == line[pos]; ++nSame);
287
288 if (inCodeChar != otherC && nSame >= 3)
289 {
290 inCodeChar = inCodeChar ? 0 : c;
291 if (inCodeChar == c && strstr ((const char *)(line + pos + nSame), "```") != NULL)
292 inCodeChar = 0;
293 else if (inCodeChar)
294 {
295 const char *langMarker = (const char *)(line + pos + nSame);
296 startLineNumber = startSourceLineNumber = lineNum + 1;
297
298 vStringClear (codeLang);
299 if (! extractLanguageForCodeBlock (langMarker, codeLang))
300 {
301 vStringCopyS (codeLang, langMarker);
302 vStringStripLeading (codeLang);
303 vStringStripTrailing (codeLang);
304 }
305 }
306 else
307 {
308 long endLineNumber = lineNum;
309 if (vStringLength (codeLang) > 0
310 && startLineNumber < endLineNumber)
311 makePromise (vStringValue (codeLang), startLineNumber, 0,
312 endLineNumber, 0, startSourceLineNumber);
313 }
314
315 lineProcessed = true;
316 }
317 }
318 /* XML comment start */
319 else if (lineLen >= pos + 4 && line[pos] == '<' && line[pos + 1] == '!' &&
320 line[pos + 2] == '-' && line[pos + 3] == '-')
321 {
322 if (strstr ((const char *)(line + pos + 4), "-->") == NULL)
323 inComment = true;
324 lineProcessed = true;
325 }
326 /* XML comment end */
327 else if (inComment && strstr ((const char *)(line + pos), "-->"))
328 {
329 inComment = false;
330 lineProcessed = true;
331 }
332
333 /* code block or comment */
334 if (inCodeChar || inComment)
335 lineProcessed = true;
336
337 /* code block using indent */
338 else if (indented)
339 lineProcessed = true;
340
341 /* if it's a title underline, or a delimited block marking character */
342 else if (line[pos] == '=' || line[pos] == '-' || line[pos] == '#' || line[pos] == '>')
343 {
344 int nSame;
345 for (nSame = 1; line[nSame] == line[pos]; ++nSame);
346
347 /* quote */
348 if (line[pos] == '>')
349 ; /* just to make sure lineProcessed = true so it won't be in a heading */
350 /* is it a two line title */
351 else if (line[pos] == '=' || line[pos] == '-')
352 {
353 char marker[2] = { line[pos], '\0' };
354 int kind = line[pos] == '=' ? K_CHAPTER : K_SECTION;
355 bool whitespaceTerminated = true;
356
357 for (int i = pos + nSame; i < lineLen; i++)
358 {
359 if (!isspace (line[i]))
360 {
361 whitespaceTerminated = false;
362 break;
363 }
364 }
365
366 vStringStripLeading (prevLine);
367 vStringStripTrailing (prevLine);
368 if (whitespaceTerminated && vStringLength (prevLine) > 0)
369 makeSectionMarkdownTag (prevLine, kind, marker);
370 }
371 /* otherwise is it a one line title */
372 else if (line[pos] == '#' && nSame <= K_SECTION_COUNT && isspace (line[nSame]))
373 {
374 int kind = nSame - 1;
375 bool delimited = false;
376 vString *name = getHeading (kind, line, lineLen, &delimited);
377 if (vStringLength (name) > 0)
378 makeSectionMarkdownTag (name, kind, delimited ? "##" : "#");
379 vStringDelete (name);
380 }
381
382 lineProcessed = true;
383 }
384
385 vStringClear (prevLine);
386 if (!lineProcessed)
387 {
388 getFootnoteMaybe ((const char *)line);
389 vStringCatS (prevLine, (const char*) line);
390 }
391 }
392 vStringDelete (prevLine);
393 vStringDelete (codeLang);
394 {
395 unsigned int line = (unsigned int)getInputLineNumber ();
396 nestingLevelsFreeFull (nestingLevels, HT_UINT_TO_PTR (line));
397 }
398 }
399
MarkdownParser(void)400 extern parserDefinition* MarkdownParser (void)
401 {
402 parserDefinition* const def = parserNew ("Markdown");
403 static const char *const extensions [] = { "md", "markdown", NULL };
404
405 def->enabled = true;
406 def->extensions = extensions;
407 def->useCork = CORK_QUEUE;
408 def->kindTable = MarkdownKinds;
409 def->kindCount = ARRAY_SIZE (MarkdownKinds);
410 def->fieldTable = MarkdownFields;
411 def->fieldCount = ARRAY_SIZE (MarkdownFields);
412 def->defaultScopeSeparator = "\"\"";
413 def->parser = findMarkdownTags;
414
415 /*
416 * This setting (useMemoryStreamInput) is for running
417 * Yaml parser from YamlFrontMatter as subparser.
418 * YamlFrontMatter is run from FrontMatter as a gust parser.
419 * FrontMatter is run from Markdown as a guest parser.
420 * This stacked structure hits the limitation of the main
421 * part: subparser's requirement for memory based input stream
422 * is not propagated to the main part.
423 *
424 * TODO: instead of setting useMemoryStreamInput here, we
425 * should remove the limitation.
426 */
427 def->useMemoryStreamInput = true;
428
429 return def;
430 }
431