xref: /Universal-ctags/parsers/asciidoc.c (revision 6b1a862e526d5017f9f212a321f59d67c859d521)
1 /*
2  *
3  *  Copyright (c) 2007-2011, Nick Treleaven
4  * 	Copyright (c) 2012, Lex Trotman
5  *
6  *   This source code is released for free distribution under the terms of the
7  *   GNU General Public License version 2 or (at your option) any later version.
8  *
9  * This module contains functions for generating tags for asciidoc files.
10  *
11  * Based on Rest code by Nick Treleaven, see rest.c
12  *
13  * This code was ported from geany git commit 40396a3 at:
14  *   https://github.com/geany/geany/blob/master/ctags/parsers/asciidoc.c
15  * with the changes in geany's PR #1263, with some changes to work in uctags.
16  */
17 
18 /*
19  *   INCLUDE FILES
20  */
21 #include "general.h"	/* must always come first */
22 
23 #include <ctype.h>
24 #include <string.h>
25 
26 #include "debug.h"
27 #include "entry.h"
28 #include "parse.h"
29 #include "read.h"
30 #include "vstring.h"
31 #include "nestlevel.h"
32 #include "routines.h"
33 
34 /*
35  *   DATA DEFINITIONS
36  */
/*
 * Kind indices for the tags this parser emits.  The section kinds are
 * ordered from the outermost level (0) to the innermost one.
 */
typedef enum {
	K_CHAPTER       = 0,
	K_SECTION       = 1,
	K_SUBSECTION    = 2,
	K_SUBSUBSECTION = 3,
	K_LEVEL4SECTION = 4,
	/*
	 * There is no named level-5 enumerator because level-5 sections only
	 * exist as one-line titles; SECTION_COUNT doubles as the level-5 kind
	 * number and as the number of kinds that have an underline character.
	 */
	SECTION_COUNT   = 5,
	K_ANCHOR        = 6
} asciidocKind;
47 
48 /*
49  * The following kind letters are based on the markdown parser kinds,
50  * and thus different than geany's.
51  */
52 static kindDefinition AsciidocKinds[] = {
53 	{ true, 'c', "chapter",       "chapters"},
54 	{ true, 's', "section",       "sections" },
55 	{ true, 'S', "subsection",    "level 2 sections" },
56 	{ true, 't', "subsubsection", "level 3 sections" },
57 	{ true, 'T', "l4subsection",  "level 4 sections" },
58 	{ true, 'u', "l5subsection",  "level 5 sections" },
59 	{ true, 'a', "anchor",        "anchors" }
60 };
61 
62 static char kindchars[SECTION_COUNT]={ '=', '-', '~', '^', '+' };
63 
64 static NestingLevels *nestingLevels = NULL;
65 
66 /*
67 *   FUNCTION DEFINITIONS
68 */
69 
getNestingLevel(const int kind)70 static NestingLevel *getNestingLevel(const int kind)
71 {
72 	NestingLevel *nl;
73 	tagEntryInfo *e;
74 
75 	while (1)
76 	{
77 		nl = nestingLevelsGetCurrent(nestingLevels);
78 		e = getEntryOfNestingLevel (nl);
79 		if ((nl && (e == NULL)) || (e && (e->kindIndex >= kind)))
80 			nestingLevelsPop(nestingLevels);
81 		else
82 			break;
83 	}
84 	return nl;
85 }
86 
makeAsciidocTag(const vString * const name,const int kind,const bool two_line)87 static int makeAsciidocTag (const vString* const name, const int kind, const bool two_line)
88 {
89 	const NestingLevel *const nl = getNestingLevel(kind);
90 	int r = CORK_NIL;
91 
92 	if (vStringLength (name) > 0)
93 	{
94 		tagEntryInfo *parent = getEntryOfNestingLevel (nl);
95 		tagEntryInfo e;
96 
97 		initTagEntry (&e, vStringValue (name), kind);
98 
99 		if (two_line)
100 		{
101 			/* we want the line before the '---' underline chars */
102 			const unsigned long line = getInputLineNumber();
103 			Assert (line > 0);
104 			if (line > 0)
105 			{
106 				e.lineNumber--;
107 				e.filePosition = getInputFilePositionForLine(line - 1);
108 			}
109 		}
110 
111 		if (parent && (parent->kindIndex < kind))
112 		{
113 			/*
114 			 * This doesn't use Cork, but in this case I think this is better,
115 			 * because Cork would record the scopes of all parents in the chain
116 			 * which is weird for text section identifiers, and also this is
117 			 * what the rst.c reStructuredText parser does.
118 			 */
119 			e.extensionFields.scopeKindIndex = parent->kindIndex;
120 			e.extensionFields.scopeName = parent->name;
121 		}
122 
123 		r = makeTagEntry (&e);
124 	}
125 	return r;
126 }
127 
makeSectionAsciidocTag(const vString * const name,const int kind,const bool two_line)128 static int makeSectionAsciidocTag (const vString* const name, const int kind, const bool two_line)
129 {
130 	int r = makeAsciidocTag(name, kind, two_line);
131 	nestingLevelsPush(nestingLevels, r);
132 	return r;
133 }
134 
135 
get_kind(char c)136 static int get_kind(char c)
137 {
138 	int i;
139 
140 	for (i = 0; i < SECTION_COUNT; i++)
141 	{
142 		if (kindchars[i] == c)
143 			return i;
144 	}
145 	return -1;
146 }
147 
148 
/*
 * Tells whether `line` starts like an anchor, i.e. with "[#" or "[[".
 * Only the first two characters are inspected; whether the anchor is
 * complete (at least "[#a]") is left to the caller.
 */
static bool is_anchor(const unsigned char *line)
{
	if (line[0] != '[')
		return false;

	return line[1] == '#' || line[1] == '[';
}
154 
capture_anchor(const unsigned char * const orig,int * captured_len)155 static int capture_anchor(const unsigned char *const orig, int* captured_len)
156 {
157 	vString *name = vStringNew ();
158 	int r = CORK_NIL;
159 	const bool shorthand = orig[1] == '#' ? true : false;
160 	bool is_valid = false;
161 	bool seen_comma = false;
162 	const unsigned char *line = orig;
163 
164 	Assert (line[0] == '[');
165 	Assert (line[1] == '#' || line[1] == '[');
166 
167 	if (captured_len) *captured_len = 0;
168 
169 	line += 2;
170 
171 	while (*line != '\0')
172 	{
173 		if (*line == ']')
174 		{
175 			if (shorthand || line[1] == ']')
176 			{
177 				is_valid = true;
178 				if (shorthand) line++;
179 				else line += 2;
180 				break;
181 			}
182 			/* otherwise it's not the end, keep going */
183 		}
184 
185 		if (*line == ',')
186 			seen_comma = true;
187 
188 		if (!seen_comma)
189 			vStringPut (name, *line);
190 
191 		line++;
192 	}
193 
194 	if (is_valid && vStringLength (name) != 0)
195 	{
196 		r = makeAsciidocTag (name, K_ANCHOR, false);
197 
198 		if (captured_len)
199 		{
200 			*captured_len = line - orig;
201 		}
202 	}
203 
204 	vStringDelete (name);
205 	return r;
206 }
207 
208 
209 /* skips any leading anchor(s) in a one-line title, generating tags for them */
process_leading_anchors(const unsigned char * const begin)210 static int process_leading_anchors(const unsigned char *const begin)
211 {
212 	int captured_len = 0;
213 	const unsigned char *current = begin;
214 
215 	while (is_anchor(current) && capture_anchor(current, &captured_len) != CORK_NIL)
216 	{
217 		/* minimum is "[#a]" */
218 		Assert (captured_len >= 4);
219 		current += captured_len;
220 		while (isspace(*current)) ++current;
221 	}
222 
223 	return current - begin;
224 }
225 
/*
 * Looks for an anchor at the very end of a one-line title and generates a
 * tag for it.
 *
 * `begin` points at the first character of the title text and `end` at its
 * last character (inclusive).  The candidate is the last '[' found from
 * `begin` on, widened by one character to the left when the text ends in
 * "]]" and the candidate is preceded by another '['.
 *
 * Returns the length reported by capture_anchor() for the anchor, or 0 when
 * no anchor was captured.
 */
static int process_trailing_anchor(const unsigned char *const begin,
								   const unsigned char *const end)
{
	int captured_len = 0;
	const unsigned char *found = NULL;

	/* the text must be at least 5 characters long and end in ']' */
	if (*end == ']' && (end - begin) >= 4)
	{
		found = (const unsigned char*) strrchr((const char*) begin , '[');

		/*
		 * `end` is inclusive, so the shortest anchors, "[#a]" and the
		 * inner "[a]]" of "[[a]]", both give end - found == 3.  Requiring
		 * 4 here used to drop every trailing anchor with a one-character
		 * id.
		 */
		if (found && ((end - found) >= 3))
		{
			/* see if it's not shorthand [#a] but instead [[a]] */
			if (end[-1] == ']' && found > begin && found[-1] == '[')
				--found;

			if (is_anchor (found))
				capture_anchor(found, &captured_len);
		}
	}

	return captured_len;
}
249 
/*
 * Extracts the title text of a one-line title into `name`.
 *
 * `line` is the whole title line, `line_len` its length and `kind` the
 * section kind; the text is searched from index kind + 1 onwards.  Trailing
 * characters equal to line[0] and surrounding whitespace are dropped.
 * Anchors at either end of the text are turned into tags and excluded from
 * the name.  `name` is left empty when nothing remains.
 */
static void process_name(vString *const name, const int kind,
						 const unsigned char *line, const int line_len)
{
	int first = kind + 1;
	int last = line_len - 1;

	Assert (kind >= 0 && kind < K_ANCHOR);
	Assert (line_len > first);

	vStringClear(name);

	/* drop the optional closing marker, then trim both ends */
	for (; line[last] == line[0]; last--)
		;
	for (; isspace(line[first]); first++)
		;
	for (; isspace(line[last]); last--)
		;

	if (first < last)
	{
		/* pop nesting levels, so that anchors get the parent's scope */
		getNestingLevel(kind);
		last -= process_trailing_anchor(line + first, line + last);
		first += process_leading_anchors(line + first);
	}

	/* removing a trailing anchor may have exposed more whitespace */
	for (; isspace(line[last]); last--)
		;

	if (first > last)
		return;

	vStringNCatS(name, (const char*)(line + first), last - first + 1);
}
278 
279 
/* computes the length, in code points, of an UTF-8 string
 *
 * buf     - start of the string (need not be NUL-terminated)
 * buf_len - length of the string in bytes
 *
 * Returns the number of code points, or -1 if the string doesn't look like
 * UTF-8: an invalid leading byte, a sequence cut short by the end of the
 * buffer, or a continuation byte that is not of the form 10xxxxxx.  Checking
 * the continuation bytes keeps one-byte-charset text (e.g. Latin-1) from
 * being miscounted as UTF-8.  Overlong forms and code point ranges are not
 * validated.
 * FIXME consider East_Asian_Width Unicode property */
static int utf8_strlen(const char *buf, int buf_len)
{
	const unsigned char *p = (const unsigned char *) buf;
	const unsigned char *const end = p + buf_len;
	int len = 0;

	while (p < end)
	{
		int trail;

		/* number of continuation bytes announced by the leading byte */
		if (! (*p & 0x80))
			trail = 0;
		else if ((*p & 0xe0) == 0xc0)
			trail = 1;
		else if ((*p & 0xf0) == 0xe0)
			trail = 2;
		else if ((*p & 0xf8) == 0xf0)
			trail = 3;
		else /* not a valid leading UTF-8 byte, abort */
			return -1;

		if (trail > (end - p) - 1) /* incomplete last sequence */
			return -1;

		p++;
		for (; trail > 0; trail--, p++)
		{
			if ((*p & 0xc0) != 0x80) /* not a continuation byte */
				return -1;
		}

		len++;
	}

	return len;
}
308 
309 
findAsciidocTags(void)310 static void findAsciidocTags(void)
311 {
312 	vString *name = vStringNew();
313 	const unsigned char *line;
314 	unsigned char in_block = '\0';  /* holds the block marking char or \0 if not in block */
315 
316 	nestingLevels = nestingLevelsNew(0);
317 
318 	while ((line = readLineFromInputFile()) != NULL)
319 	{
320 		if (is_anchor (line))
321 		{
322 			if (capture_anchor (line, NULL) != CORK_NIL)
323 			{
324 				vStringClear (name);
325 				continue;
326 			}
327 		}
328 
329 		int line_len = strlen((const char*) line);
330 		int name_len_bytes = vStringLength(name);
331 		int name_len = utf8_strlen(vStringValue(name), name_len_bytes);
332 
333 		/* if the name doesn't look like UTF-8, assume one-byte charset */
334 		if (name_len < 0) name_len = name_len_bytes;
335 
336 		/* if its a title underline, or a delimited block marking character */
337 		if (line[0] == '=' || line[0] == '-' || line[0] == '~' ||
338 			line[0] == '^' || line[0] == '+' || line[0] == '.' ||
339 			line[0] == '*' || line[0] == '_' || line[0] == '/')
340 		{
341 			int n_same;
342 			for (n_same = 1; line[n_same] == line[0]; ++n_same);
343 
344 			/* is it a two line title or a delimited block */
345 			if (n_same == line_len)
346 			{
347 				/* if in a block, can't be block start or title, look for block end */
348 				if (in_block)
349 				{
350 					if (line[0] == in_block) in_block = '\0';
351 				}
352 
353 				/* if its a =_~^+ and the same length +-2 as the line before then its a title */
354 				/* (except in the special case its a -- open block start line) */
355 				else if ((line[0] == '=' || line[0] == '-' || line[0] == '~' ||
356 							line[0] == '^' || line[0] == '+') &&
357 						line_len <= name_len + 2 && line_len >= name_len - 2 &&
358 						!(line_len == 2 && line[0] == '-'))
359 				{
360 					int kind = get_kind((char)(line[0]));
361 					if (kind >= 0)
362 					{
363 						makeSectionAsciidocTag(name, kind, true);
364 						continue;
365 					}
366 				}
367 
368 				/* else if its 4 or more /+-.*_= (plus the -- special case) its a block start */
369 				else if (((line[0] == '/' || line[0] == '+' || line[0] == '-' ||
370 						   line[0] == '.' || line[0] == '*' || line[0] == '_' ||
371 						   line[0] == '=') && line_len >= 4 )
372 						 || (line[0] == '-' && line_len == 2))
373 				{
374 					in_block = line[0];
375 				}
376 			}
377 
378 			/* otherwise is it a one line title */
379 			else if (line[0] == '=' && n_same <= 6 && isspace(line[n_same]) &&
380 					!in_block)
381 			{
382 				int kind = n_same - 1;
383 				process_name(name, kind, line, line_len);
384 				makeSectionAsciidocTag(name, kind, false);
385 				continue;
386 			}
387 		}
388 		vStringClear(name);
389 		if (! isspace(*line))
390 			vStringCatS(name, (const char*) line);
391 	}
392 	vStringDelete(name);
393 	nestingLevelsFree(nestingLevels);
394 }
395 
AsciidocParser(void)396 extern parserDefinition* AsciidocParser (void)
397 {
398 	static const char *const patterns [] = { "*.asc", "*.adoc", "*.asciidoc", NULL };
399 	static const char *const extensions [] = { "asc", "adoc", "asciidoc", NULL };
400 
401 	parserDefinition* const def = parserNew ("Asciidoc");
402 
403 	def->kindTable = AsciidocKinds;
404 	def->kindCount = ARRAY_SIZE (AsciidocKinds);
405 	def->patterns = patterns;
406 	def->extensions = extensions;
407 	def->parser = findAsciidocTags;
408 	/* do we even need to use Cork? */
409 	def->useCork = CORK_QUEUE;
410 
411 	return def;
412 }
413