xref: /Universal-ctags/parsers/rst.c (revision 507b32e97e96a442c945a0ac39f22a297b16052e)
1 /*
2 *
3 *   Copyright (c) 2007-2011, Nick Treleaven
4 *
5 *   This source code is released for free distribution under the terms of the
6 *   GNU General Public License version 2 or (at your option) any later version.
7 *
8 *   This module contains functions for generating tags for reStructuredText (reST) files.
9 *
10 *   This module was ported from geany.
11 *
12 *   References:
13 *      https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html
14 */
15 
16 /*
17 *   INCLUDE FILES
18 */
19 #include "general.h"	/* must always come first */
20 
21 #include <ctype.h>
22 #include <string.h>
23 
24 #include "parse.h"
25 #include "read.h"
26 #include "vstring.h"
27 #include "nestlevel.h"
28 #include "entry.h"
29 #include "routines.h"
30 #include "field.h"
31 #include "htable.h"
32 #include "debug.h"
33 
34 /*
35 *   DATA DEFINITIONS
36 */
/*
 * Tag kinds emitted by this parser.
 *
 * The first SECTION_COUNT values are the section levels, ordered from the
 * outermost (K_TITLE) to the innermost (K_SUBSUBSECTION); their numeric
 * values are used directly as indices into RstKinds and into the
 * sectionTracker array.  The remaining kinds are non-section markup.
 */
typedef enum {
	K_EOF = -1,			/* pseudo kind: passed to getNestingLevel() to pop every level */

	/* section levels */
	K_TITLE = 0,
	K_SUBTITLE,
	K_CHAPTER,
	K_SECTION,
	K_SUBSECTION,
	K_SUBSUBSECTION,

	SECTION_COUNT,		/* number of section levels above */

	/* explicit markup kinds, numbered right after the section levels */
	K_CITATION = SECTION_COUNT,
	K_TARGET,
	K_SUBSTDEF,
} rstKind;
50 
51 static kindDefinition RstKinds[] = {
52 	{ true, 'H', "title",         "titles"},
53 	{ true, 'h', "subtitle",      "sub titles" },
54 	{ true, 'c', "chapter",       "chapters"},
55 	{ true, 's', "section",       "sections" },
56 	{ true, 'S', "subsection",    "subsections" },
57 	{ true, 't', "subsubsection", "subsubsections" },
58 	{ true, 'C', "citation",      "citations"},
59 	{ true, 'T', "target",        "targets" },
60 	{ true, 'd', "substdef",      "substitute definitions" },
61 };
62 
/* Indices into RstFields for the parser-specific fields. */
typedef enum {
	F_SECTION_MARKER,		/* "sectionMarker" */
	F_SECTION_OVERLINE,		/* "overline" */
} rstField;
67 
68 static fieldDefinition RstFields [] = {
69 	{
70 		.name = "sectionMarker",
71 		.description = "character used for declaring section",
72 		.enabled = false,
73 	},
74 	{
75 		.name = "overline",
76 		.description = "whether using overline & underline for declaring section",
77 		.enabled = false,
78 		.dataType = FIELDTYPE_BOOL
79 	},
80 };
81 
82 static NestingLevels *nestingLevels = NULL;
83 
/*
 * Book-keeping for one section level (see get_kind()): which adornment
 * style was bound to the level and how often that style has been seen.
 */
struct sectionTracker {
	char kindchar;	/* adornment character bound to this level */
	bool overline;	/* true if the style bound to this level uses an overline */
	int count;		/* number of sections seen with this style; 0 means unused */
};
89 
/*
 * Remembers the most recent overline candidate.
 * c == 0 means that no overline is being tracked (see has_overline()).
 */
struct olineTracker
{
	char c;			/* adornment character of the overline, or 0 */
	size_t len;		/* length of the overline in bytes */
};
95 
96 /*
97 *   FUNCTION DEFINITIONS
98 */
99 
getNestingLevel(const int kind)100 static NestingLevel *getNestingLevel(const int kind)
101 {
102 	NestingLevel *nl;
103 	tagEntryInfo *e;
104 
105 	int d = 0;
106 
107 	if (kind > K_EOF)
108 	{
109 		d++;
110 		/* 1. we want the line before the '---' underline chars */
111 		d++;
112 		/* 2. we want the line before the next section/chapter title. */
113 	}
114 
115 	while (1)
116 	{
117 		nl = nestingLevelsGetCurrent(nestingLevels);
118 		e = getEntryOfNestingLevel (nl);
119 		if ((nl && (e == NULL)) || (e && e->kindIndex >= kind))
120 		{
121 			if (e)
122 				e->extensionFields.endLine = (getInputLineNumber() - d);
123 			nestingLevelsPop(nestingLevels);
124 		}
125 		else
126 			break;
127 	}
128 	return nl;
129 }
130 
/*
 * Emits a tag of kind `kindex` named `name` for non-section markup.
 * If a section is currently open, the tag is scoped to it.
 * Returns whatever makeTagEntry() returns for the new tag.
 */
static int makeTargetRstTag(const vString* const name, rstKind kindex)
{
	tagEntryInfo tag;
	const NestingLevel *enclosing;

	initTagEntry (&tag, vStringValue (name), kindex);

	enclosing = nestingLevelsGetCurrent(nestingLevels);
	if (enclosing != NULL)
		tag.extensionFields.scopeIndex = enclosing->corkIndex;

	return makeTagEntry (&tag);
}
143 
makeSectionRstTag(const vString * const name,const int kind,const MIOPos filepos,char marker,bool overline)144 static void makeSectionRstTag(const vString* const name, const int kind, const MIOPos filepos,
145 		       char marker, bool overline)
146 {
147 	const NestingLevel *const nl = getNestingLevel(kind);
148 	tagEntryInfo *parent;
149 
150 	int r = CORK_NIL;
151 
152 	if (vStringLength (name) > 0)
153 	{
154 		tagEntryInfo e;
155 		char m [2] = { [1] = '\0' };
156 
157 		initTagEntry (&e, vStringValue (name), kind);
158 
159 		e.lineNumber--;	/* we want the line before the '---' underline chars */
160 		e.filePosition = filepos;
161 
162 		parent = getEntryOfNestingLevel (nl);
163 		if (parent && (parent->kindIndex < kind))
164 			e.extensionFields.scopeIndex = nl->corkIndex;
165 
166 		m[0] = marker;
167 		attachParserField (&e, false, RstFields [F_SECTION_MARKER].ftype, m);
168 
169 		if (overline)
170 			attachParserField (&e, false, RstFields [F_SECTION_OVERLINE].ftype, "");
171 
172 		r = makeTagEntry (&e);
173 	}
174 	nestingLevelsPush(nestingLevels, r);
175 }
176 
177 
/*
 * Returns true if every character of `str` equals its first character.
 * The empty string and one-character strings yield true.
 */
static bool issame(const char *str)
{
	const char head = *str;
	const char *p;

	for (p = str; *p != '\0'; p++)
	{
		if (*p != head)
			return false;
	}
	return true;
}
194 
195 
get_kind(char c,bool overline,struct sectionTracker tracker[])196 static int get_kind(char c, bool overline, struct sectionTracker tracker[])
197 {
198 	int i;
199 
200 	for (i = 0; i < SECTION_COUNT; i++)
201 	{
202 		if (tracker[i].kindchar == c && tracker[i].overline == overline)
203 		{
204 			tracker[i].count++;
205 			return i;
206 		}
207 
208 		if (tracker[i].count == 0)
209 		{
210 			tracker[i].count = 1;
211 			tracker[i].kindchar = c;
212 			tracker[i].overline = overline;
213 			return i;
214 		}
215 	}
216 	return -1;
217 }
218 
219 
/*
 * Computes the length, in code points, of the UTF-8 string made of the
 * first `buf_len` bytes of `buf`.
 *
 * Only the leading byte of each sequence is inspected (quick and naive
 * validation, no continuation-byte checking).  Returns -1 if a byte that
 * cannot start a sequence is found, or if the last sequence is truncated.
 * A `buf_len` of zero or less yields 0.
 *
 * The remaining byte count is checked before the cursor is advanced, so
 * the pointer never moves past the end of the buffer.
 */
static int utf8_strlen(const char *buf, int buf_len)
{
	int len = 0;
	int remaining = buf_len;

	while (remaining > 0)
	{
		const unsigned char lead = (unsigned char) *buf;
		int seq_len;

		if (! (lead & 0x80))
			seq_len = 1;
		else if ((lead & 0xe0) == 0xc0)
			seq_len = 2;
		else if ((lead & 0xf0) == 0xe0)
			seq_len = 3;
		else if ((lead & 0xf8) == 0xf0)
			seq_len = 4;
		else /* not a valid leading UTF-8 byte, abort */
			return -1;

		if (seq_len > remaining) /* incomplete last sequence */
			return -1;

		buf += seq_len;
		remaining -= seq_len;
		len++;
	}

	return len;
}
247 
248 
/*
 * Tests whether `line` starts with ".. " immediately followed by `reftype`.
 * Returns a pointer to the character after `reftype` on a match, NULL
 * otherwise.  Characters are examined left to right and the scan stops at
 * the first mismatch, so the terminating NUL is never passed.
 */
static const unsigned char *is_markup_line (const unsigned char *line, char reftype)
{
	if (line [0] != '.')
		return NULL;
	if (line [1] != '.')
		return NULL;
	if (line [2] != ' ')
		return NULL;
	if (line [3] != reftype)
		return NULL;

	return line + 4;
}
256 
/*
 * Extracts a reference name from `target_line` (the text following the
 * markup introducer) and emits a tag of kind `kindex` for it.
 *
 * If the text starts with a backquote, the name runs up to the next
 * unescaped backquote.  Otherwise the text must start with a character
 * that is neither whitespace nor NUL, and the name runs up to the next
 * unescaped `defaultTerminator`.  In both cases the name ends at the end
 * of the line if no terminator is found.  A backslash and the character
 * after it are copied verbatim into the name.
 *
 * Returns the value of makeTargetRstTag() for the tag, or CORK_NIL when no
 * name could be captured.
 */
static int capture_markup (const unsigned char *target_line, char defaultTerminator, rstKind kindex)
{
	vString *name = vStringNew ();
	const unsigned char first = *target_line;
	unsigned char terminator = '\0';
	bool usable = true;
	int r = CORK_NIL;

	if (first == '`')
		terminator = '`';
	else if (first != '\0' && !isspace (first))
	{
		/* "Simple reference names are single words consisting of
		 * alphanumerics plus isolated (no two adjacent) internal
		 * hyphens, underscores, periods, colons and plus signs; no
		 * whitespace or other characters are allowed."
		 * -- http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#reference-names
		 */
		vStringPut (name, first);
		terminator = defaultTerminator;
	}
	else
		usable = false;

	if (usable)
	{
		const unsigned char *p;
		bool escaped = false;

		for (p = target_line + 1; *p != '\0'; p++)
		{
			/* a backslash is tested before the terminator, as an
			 * escape takes precedence */
			if (!escaped && *p != '\\' && *p == terminator)
				break;

			vStringPut (name, *p);
			escaped = (!escaped && *p == '\\');
		}

		if (vStringLength (name) > 0)
			r = makeTargetRstTag (name, kindex);
	}

	vStringDelete (name);
	return r;
}
314 
overline_clear(struct olineTracker * ol)315 static void overline_clear(struct olineTracker *ol)
316 {
317 	ol->c = 0;
318 	ol->len = 0;
319 }
320 
overline_set(struct olineTracker * ol,char c,size_t len)321 static void overline_set(struct olineTracker *ol, char c, size_t len)
322 {
323 	ol->c = c;
324 	ol->len = len;
325 }
326 
has_overline(struct olineTracker * ol)327 static bool has_overline(struct olineTracker *ol)
328 {
329 	return (ol->c != 0);
330 }
331 
/*
 * Walks `shift` steps up the scope chain of `e` and returns the cork index
 * of the ancestor reached.  Returns CORK_NIL when `shift` is not positive.
 *
 * The chain is expected to be at least `shift` entries long, which the
 * assertions check.  As a safety net for builds where Assert() expands to
 * nothing (NOTE(review): presumably non-debug builds, confirm against
 * debug.h), a missing ancestor also ends the walk explicitly and yields
 * CORK_NIL instead of dereferencing a NULL entry on the next step.
 */
static int getFosterEntry(tagEntryInfo *e, int shift)
{
	int r = CORK_NIL;

	while (shift-- > 0)
	{
		r = e->extensionFields.scopeIndex;
		Assert(r != CORK_NIL);
		e = getEntryInCorkQueue(r);
		Assert(e);
		if (e == NULL)
			return CORK_NIL;	/* scope chain shorter than requested */
	}
	return r;
}
345 
/*
 * Moves every section tag whose kind lies in [baseKind, SECTION_COUNT)
 * `shift` levels deeper.
 *
 * A tag pushed beyond the deepest section level is turned into a
 * placeholder, and tags that were scoped to it are re-parented to its
 * "foster" ancestor (see getFosterEntry()).
 */
static void shiftKinds(int shift, rstKind baseKind)
{
	const size_t n_entries = countEntryInCorkQueue();
	/* maps the cork index of a dropped tag to the index of its foster parent */
	hashTable *foster_map = hashTableNew (n_entries,
										  hashPtrhash,
										  hashPtreq, NULL, NULL);
	/* a lookup of an index that was not remapped yields CORK_NIL */
	hashTableSetValueForUnknownKey(foster_map, HT_INT_TO_PTR(CORK_NIL), NULL);

	/* Pass 1: shift the kinds and collect the tags that fall off the end. */
	for (int i = 0; i < n_entries; i++)
	{
		tagEntryInfo *tag = getEntryInCorkQueue(i);
		bool is_shifted_section;

		if (tag == NULL)
			continue;

		is_shifted_section = (baseKind <= tag->kindIndex
							  && tag->kindIndex < SECTION_COUNT);
		if (!is_shifted_section)
			continue;

		tag->kindIndex += shift;
		if (tag->kindIndex < SECTION_COUNT)
			continue;

		markTagPlaceholder(tag, true);

		int foster_parent = getFosterEntry(tag, shift);
		Assert (foster_parent != CORK_NIL);
		hashTablePutItem(foster_map, HT_INT_TO_PTR(i),
						 HT_INT_TO_PTR(foster_parent));
	}

	/* Pass 2: redirect the scopes that pointed at a dropped tag. */
	for (int i = 0; i < n_entries; i++)
	{
		tagEntryInfo *tag = getEntryInCorkQueue(i);

		if (tag == NULL || tag->extensionFields.scopeIndex == CORK_NIL)
			continue;

		void *foster = hashTableGetItem (foster_map,
										 HT_INT_TO_PTR(tag->extensionFields.scopeIndex));
		if (HT_PTR_TO_INT(foster) != CORK_NIL)
			tag->extensionFields.scopeIndex = HT_PTR_TO_INT(foster);
	}

	hashTableDelete(foster_map);
}
385 
adjustSectionKinds(struct sectionTracker section_tracker[])386 static void adjustSectionKinds(struct sectionTracker section_tracker[])
387 {
388 	if (section_tracker[K_TITLE].count > 1)
389 	{
390 		shiftKinds(2, K_TITLE);
391 		return;
392 	}
393 
394 	if (section_tracker[K_TITLE].count == 1
395 		&& section_tracker[K_SUBTITLE].count > 1)
396 	{
397 		shiftKinds(1, K_SUBTITLE);
398 		return;
399 	}
400 }
401 
/*
 * Replaces the cork-index scope reference of `e` by an inline copy of the
 * kind and name of the entry at `parent_index`.  Does nothing when that
 * entry cannot be found.
 */
static void inlineTagScope(tagEntryInfo *e, int parent_index)
{
	const tagEntryInfo *scope = getEntryInCorkQueue (parent_index);

	if (scope == NULL)
		return;

	e->extensionFields.scopeKindIndex = scope->kindIndex;
	e->extensionFields.scopeName = eStrdup(scope->name);
	e->extensionFields.scopeIndex = CORK_NIL;
}
412 
inlineScopes(void)413 static void inlineScopes (void)
414 {
415 	/* TODO
416 	   Following code makes the scope information full qualified form.
417 	   Do users want the full qualified form?
418 	   --- ./Units/rst.simple.d/expected.tags	2015-12-18 01:32:35.574255617 +0900
419 	   +++ /home/yamato/var/ctags-github/Units/rst.simple.d/FILTERED.tmp	2016-05-05 03:05:38.165604756 +0900
420 	   @@ -5,2 +5,2 @@
421 	   -Subsection 1.1.1	input.rst	/^Subsection 1.1.1$/;"	S	section:Section 1.1
422 	   -Subsubsection 1.1.1.1	input.rst	/^Subsubsection 1.1.1.1$/;"	t	subsection:Subsection 1.1.1
423 	   +Subsection 1.1.1	input.rst	/^Subsection 1.1.1$/;"	S	section:Chapter 1.Section 1.1
424 	   +Subsubsection 1.1.1.1	input.rst	/^Subsubsection 1.1.1.1$/;"	t	subsection:Chapter 1.Section 1.1.Subsection 1.1.1
425 	*/
426 	size_t count = countEntryInCorkQueue();
427 	for (int index = 0; index < count; index++)
428 	{
429 		tagEntryInfo *e = getEntryInCorkQueue(index);
430 
431 		if (e && e->extensionFields.scopeIndex != CORK_NIL)
432 			inlineTagScope(e, e->extensionFields.scopeIndex);
433 	}
434 }
435 
findRstTags(void)436 static void findRstTags (void)
437 {
438 	vString *name = vStringNew ();
439 	MIOPos filepos;
440 	const unsigned char *line;
441 	const unsigned char *markup_line;
442 	struct sectionTracker section_tracker[SECTION_COUNT];
443 	struct olineTracker overline;
444 
445 	memset(&filepos, 0, sizeof(filepos));
446 	memset(section_tracker, 0, sizeof section_tracker);
447 	overline_clear(&overline);
448 	nestingLevels = nestingLevelsNew(0);
449 
450 	while ((line = readLineFromInputFile ()) != NULL)
451 	{
452 		if ((markup_line = is_markup_line (line, '_')) != NULL)
453 		{
454 			overline_clear(&overline);
455 			/* Handle .. _target:
456 			 * http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#hyperlink-targets
457 			 */
458 			if (capture_markup (markup_line, ':', K_TARGET) != CORK_NIL)
459 			{
460 				vStringClear (name);
461 				continue;
462 			}
463 		}
464 		else if ((markup_line = is_markup_line (line, '[')) != NULL)
465 		{
466 			overline_clear(&overline);
467 			/* Handle .. [citation]
468 			 * https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#citations
469 			 */
470 			if (capture_markup (markup_line, ']', K_CITATION) != CORK_NIL)
471 			{
472 				vStringClear (name);
473 				continue;
474 			}
475 		}
476 		else if ((markup_line = is_markup_line (line, '|')) != NULL)
477 		{
478 			overline_clear(&overline);
479 			/* Hanle .. |substitute definition|
480 			 * https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#substitution-definitions
481 			 */
482 			if (capture_markup (markup_line, '|', K_SUBSTDEF) != CORK_NIL)
483 			{
484 				vStringClear (name);
485 				continue;
486 			}
487 		}
488 
489 		int line_len = strlen((const char*) line);
490 		int name_len_bytes = vStringLength(name);
491 		/* FIXME: this isn't right, actually we need the real display width,
492 		 * taking into account double-width characters and stuff like that.
493 		 * But duh. */
494 		int name_len = utf8_strlen(vStringValue(name), name_len_bytes);
495 
496 		/* if the name doesn't look like UTF-8, assume one-byte charset */
497 		if (name_len < 0)
498 			name_len = name_len_bytes;
499 
500 		/* overline may come after an empty line (or begging of file). */
501 		if (name_len_bytes == 0 && line_len > 0 &&
502 			ispunct(line[0]) && issame((const char*) line))
503 		{
504 			overline_set(&overline, *line, line_len);
505 			continue;
506 		}
507 
508 		/* underlines must be the same length or more */
509 		if (line_len >= name_len && name_len > 0 &&
510 			ispunct(line[0]) && issame((const char*) line))
511 		{
512 			char c = line[0];
513 			bool o = (overline.c == c && overline.len == line_len);
514 			int kind = get_kind(c, o, section_tracker);
515 
516 			overline_clear(&overline);
517 
518 			if (kind >= 0)
519 			{
520 				makeSectionRstTag(name, kind, filepos, c, o);
521 				vStringClear(name);
522 				continue;
523 			}
524 		}
525 
526 		if (has_overline(&overline))
527 		{
528 			if (name_len > 0)
529 			{
530 				/*
531 				 * Though we saw an overline and a section title text,
532 				 * we cannot find the associated underline.
533 				 * In that case, we must reset the state of tracking
534 				 * overline.
535 				 */
536 				overline_clear(&overline);
537 			}
538 
539 			/*
540 			 * We san an overline. The line is the candidate
541 			 * of a section title text. Skip the prefixed whitespaces.
542 			 */
543 			while (isspace(*line))
544 				line++;
545 		}
546 
547 		vStringClear (name);
548 		if (!isspace(*line))
549 		{
550 			vStringCatS(name, (const char*)line);
551 			vStringStripTrailing (name);
552 			filepos = getInputFilePosition();
553 		}
554 	}
555 	/* Force popping all nesting levels */
556 	getNestingLevel (K_EOF);
557 	vStringDelete (name);
558 	nestingLevelsFree(nestingLevels);
559 
560 	adjustSectionKinds(section_tracker);
561 	inlineScopes();
562 }
563 
RstParser(void)564 extern parserDefinition* RstParser (void)
565 {
566 	static const char *const extensions [] = { "rest", "reST", "rst", NULL };
567 	parserDefinition* const def = parserNew ("ReStructuredText");
568 	static const char *const aliases[] = {
569 		"rst",					/* The name of emacs's mode */
570 		NULL
571 	};
572 
573 	def->kindTable = RstKinds;
574 	def->kindCount = ARRAY_SIZE (RstKinds);
575 	def->extensions = extensions;
576 	def->aliases = aliases;
577 	def->parser = findRstTags;
578 
579 	def->fieldTable = RstFields;
580 	def->fieldCount = ARRAY_SIZE (RstFields);
581 
582 	def->useCork = CORK_QUEUE;
583 
584 	return def;
585 }
586