1 /*
2 *
3 * Copyright (c) 2007-2011, Nick Treleaven
4 * Copyright (c) 2012, Lex Trotman
5 *
6 * This source code is released for free distribution under the terms of the
7 * GNU General Public License version 2 or (at your option) any later version.
8 *
9 * This module contains functions for generating tags for asciidoc files.
10 *
11 * Based on Rest code by Nick Treleaven, see rest.c
12 *
13 * This code was ported from geany git commit 40396a3 at:
14 * https://github.com/geany/geany/blob/master/ctags/parsers/asciidoc.c
15 * with the changes in geany's PR #1263, with some changes to work in uctags.
16 */
17
18 /*
19 * INCLUDE FILES
20 */
21 #include "general.h" /* must always come first */
22
23 #include <ctype.h>
24 #include <string.h>
25
26 #include "debug.h"
27 #include "entry.h"
28 #include "parse.h"
29 #include "read.h"
30 #include "vstring.h"
31 #include "nestlevel.h"
32 #include "routines.h"
33
34 /*
35 * DATA DEFINITIONS
36 */
/*
 * Tag kinds emitted by this parser.  The values are used as kind indexes
 * when tags are created, so their order has to match AsciidocKinds.
 */
typedef enum {
	K_CHAPTER       = 0,
	K_SECTION       = 1,
	K_SUBSECTION    = 2,
	K_SUBSUBSECTION = 3,
	K_LEVEL4SECTION = 4,
	/*
	 * There is deliberately no enumerator for level-5 sections: that level
	 * only works for one-line titles.  SECTION_COUNT doubles as its kind
	 * number.
	 */
	SECTION_COUNT   = 5,
	K_ANCHOR        = 6
} asciidocKind;
47
48 /*
49 * The following kind letters are based on the markdown parser kinds,
50 * and thus different than geany's.
51 */
52 static kindDefinition AsciidocKinds[] = {
53 { true, 'c', "chapter", "chapters"},
54 { true, 's', "section", "sections" },
55 { true, 'S', "subsection", "level 2 sections" },
56 { true, 't', "subsubsection", "level 3 sections" },
57 { true, 'T', "l4subsection", "level 4 sections" },
58 { true, 'u', "l5subsection", "level 5 sections" },
59 { true, 'a', "anchor", "anchors" }
60 };
61
62 static char kindchars[SECTION_COUNT]={ '=', '-', '~', '^', '+' };
63
64 static NestingLevels *nestingLevels = NULL;
65
66 /*
67 * FUNCTION DEFINITIONS
68 */
69
getNestingLevel(const int kind)70 static NestingLevel *getNestingLevel(const int kind)
71 {
72 NestingLevel *nl;
73 tagEntryInfo *e;
74
75 while (1)
76 {
77 nl = nestingLevelsGetCurrent(nestingLevels);
78 e = getEntryOfNestingLevel (nl);
79 if ((nl && (e == NULL)) || (e && (e->kindIndex >= kind)))
80 nestingLevelsPop(nestingLevels);
81 else
82 break;
83 }
84 return nl;
85 }
86
makeAsciidocTag(const vString * const name,const int kind,const bool two_line)87 static int makeAsciidocTag (const vString* const name, const int kind, const bool two_line)
88 {
89 const NestingLevel *const nl = getNestingLevel(kind);
90 int r = CORK_NIL;
91
92 if (vStringLength (name) > 0)
93 {
94 tagEntryInfo *parent = getEntryOfNestingLevel (nl);
95 tagEntryInfo e;
96
97 initTagEntry (&e, vStringValue (name), kind);
98
99 if (two_line)
100 {
101 /* we want the line before the '---' underline chars */
102 const unsigned long line = getInputLineNumber();
103 Assert (line > 0);
104 if (line > 0)
105 {
106 e.lineNumber--;
107 e.filePosition = getInputFilePositionForLine(line - 1);
108 }
109 }
110
111 if (parent && (parent->kindIndex < kind))
112 {
113 /*
114 * This doesn't use Cork, but in this case I think this is better,
115 * because Cork would record the scopes of all parents in the chain
116 * which is weird for text section identifiers, and also this is
117 * what the rst.c reStructuredText parser does.
118 */
119 e.extensionFields.scopeKindIndex = parent->kindIndex;
120 e.extensionFields.scopeName = parent->name;
121 }
122
123 r = makeTagEntry (&e);
124 }
125 return r;
126 }
127
makeSectionAsciidocTag(const vString * const name,const int kind,const bool two_line)128 static int makeSectionAsciidocTag (const vString* const name, const int kind, const bool two_line)
129 {
130 int r = makeAsciidocTag(name, kind, two_line);
131 nestingLevelsPush(nestingLevels, r);
132 return r;
133 }
134
135
get_kind(char c)136 static int get_kind(char c)
137 {
138 int i;
139
140 for (i = 0; i < SECTION_COUNT; i++)
141 {
142 if (kindchars[i] == c)
143 return i;
144 }
145 return -1;
146 }
147
148
/*
 * Cheap prefix test: does `line` start like an anchor, i.e. with "[#"
 * (shorthand form) or "[[" (double-bracket form)?  Only the first two
 * characters are inspected; whether the anchor is properly closed is left
 * to capture_anchor().
 */
static bool is_anchor(const unsigned char *line)
{
	if (line[0] != '[')
		return false;

	return line[1] == '#' || line[1] == '[';
}
154
capture_anchor(const unsigned char * const orig,int * captured_len)155 static int capture_anchor(const unsigned char *const orig, int* captured_len)
156 {
157 vString *name = vStringNew ();
158 int r = CORK_NIL;
159 const bool shorthand = orig[1] == '#' ? true : false;
160 bool is_valid = false;
161 bool seen_comma = false;
162 const unsigned char *line = orig;
163
164 Assert (line[0] == '[');
165 Assert (line[1] == '#' || line[1] == '[');
166
167 if (captured_len) *captured_len = 0;
168
169 line += 2;
170
171 while (*line != '\0')
172 {
173 if (*line == ']')
174 {
175 if (shorthand || line[1] == ']')
176 {
177 is_valid = true;
178 if (shorthand) line++;
179 else line += 2;
180 break;
181 }
182 /* otherwise it's not the end, keep going */
183 }
184
185 if (*line == ',')
186 seen_comma = true;
187
188 if (!seen_comma)
189 vStringPut (name, *line);
190
191 line++;
192 }
193
194 if (is_valid && vStringLength (name) != 0)
195 {
196 r = makeAsciidocTag (name, K_ANCHOR, false);
197
198 if (captured_len)
199 {
200 *captured_len = line - orig;
201 }
202 }
203
204 vStringDelete (name);
205 return r;
206 }
207
208
209 /* skips any leading anchor(s) in a one-line title, generating tags for them */
process_leading_anchors(const unsigned char * const begin)210 static int process_leading_anchors(const unsigned char *const begin)
211 {
212 int captured_len = 0;
213 const unsigned char *current = begin;
214
215 while (is_anchor(current) && capture_anchor(current, &captured_len) != CORK_NIL)
216 {
217 /* minimum is "[#a]" */
218 Assert (captured_len >= 4);
219 current += captured_len;
220 while (isspace(*current)) ++current;
221 }
222
223 return current - begin;
224 }
225
/*
 * Looks for an anchor ("[#id]" or "[[id]]") at the very end of a one-line
 * title and, if there is one, generates a tag for it.
 *
 * begin: first character of the title text.
 * end:   LAST character of the title text (inclusive, not one past the end).
 *
 * An anchor that starts right at `begin` is ignored here: it is a leading
 * anchor and is tagged by process_leading_anchors(); capturing it here as
 * well would produce the same tag twice.
 *
 * Returns the length reported by capture_anchor(), which the caller
 * subtracts from its end index, or 0 when no anchor was captured.
 */
static int process_trailing_anchor(const unsigned char *const begin,
	const unsigned char *const end)
{
	/*
	 * `end` points AT the closing ']', so the shortest anchors "[#a]" and
	 * "[[a]]" (measured from the last '[') both give end - found == 3.
	 * Requiring 4 here would silently skip every one-character anchor id.
	 */
	enum { MIN_ANCHOR_SPAN = 3 };
	const unsigned char *found;
	int captured_len = 0;

	/* need a closing bracket, and at least one character in front of the
	 * shortest possible anchor */
	if (*end != ']' || (end - begin) < 4)
		return 0;

	found = (const unsigned char*) strrchr((const char*) begin, '[');
	if (found == NULL || (end - found) < MIN_ANCHOR_SPAN)
		return 0;

	/* see if it's not shorthand [#a] but instead [[a]] */
	if (end[-1] == ']' && found > begin && found[-1] == '[')
		--found;

	/* an anchor at the very start of the title is a leading anchor */
	if (found == begin)
		return 0;

	if (is_anchor (found))
		capture_anchor(found, &captured_len);

	return captured_len;
}
249
/*
 * Extracts the title text of a one-line title ("== Title ==") into `name`.
 *
 * name:     receives the title text; it is cleared first and stays empty
 *           when nothing but markup, anchors and whitespace is found.
 * kind:     section kind of the title; the text is searched from index
 *           kind + 1, i.e. after the leading run of title characters.
 * line:     the complete input line.
 * line_len: length of `line` in bytes.
 *
 * Trailing characters equal to line[0], and surrounding whitespace, are
 * stripped.  Anchors at the start and at the end of the title are tagged
 * (with the enclosing section as scope) and removed from the name.
 */
static void process_name(vString *const name, const int kind,
	const unsigned char *line, const int line_len)
{
	int start = kind + 1;
	int end = line_len - 1;

	Assert (kind >= 0 && kind < K_ANCHOR);
	Assert (line_len > start);

	vStringClear(name);

	while (line[end] == line[0]) --end;
	while (isspace(line[start])) ++start;
	while (isspace(line[end])) --end;

	if (start < end)
	{
		/* pop nesting levels, so that anchors get the parent's scope */
		getNestingLevel(kind);

		/*
		 * Leading anchors go first.  The leading scan has no end bound, so
		 * for a title that consists of anchors only it also reaches the last
		 * anchor; looking for a trailing anchor before that (as was done
		 * previously) tagged that last anchor twice.  The trailing anchor is
		 * therefore only searched in whatever text remains afterwards.
		 */
		start += process_leading_anchors(line + start);
		if (start < end)
			end -= process_trailing_anchor(line + start, line + end);
	}

	while (isspace(line[end])) --end;

	if (start <= end)
		vStringNCatS(name, (const char*)(&(line[start])), end - start + 1);
}
278
279
/*
 * Computes the length, in code points, of the UTF-8 string held in the first
 * `buf_len` bytes of `buf`.
 *
 * Returns -1 if the data does not look like UTF-8: an invalid lead byte, a
 * sequence that is cut off by the end of the buffer, or a multi-byte
 * sequence whose following bytes are not continuation bytes (10xxxxxx).
 * The last check keeps single-byte charsets such as Latin-1 from being
 * miscounted as UTF-8.  Overlong encodings and surrogates are not rejected.
 * A `buf_len` of zero or less yields 0.
 *
 * FIXME consider East_Asian_Width Unicode property
 */
static int utf8_strlen(const char *buf, int buf_len)
{
	const unsigned char *p = (const unsigned char *) buf;
	int remaining = buf_len;
	int len = 0;

	while (remaining > 0)
	{
		int seq_len;
		int i;

		if (*p < 0x80)
			seq_len = 1;
		else if ((*p & 0xe0) == 0xc0)
			seq_len = 2;
		else if ((*p & 0xf0) == 0xe0)
			seq_len = 3;
		else if ((*p & 0xf8) == 0xf0)
			seq_len = 4;
		else /* not a valid leading UTF-8 byte, abort */
			return -1;

		/* check BEFORE advancing, so that no pointer past the buffer is
		 * ever formed and no byte past it is ever read */
		if (seq_len > remaining) /* incomplete last sequence */
			return -1;

		for (i = 1; i < seq_len; i++)
		{
			if ((p[i] & 0xc0) != 0x80) /* not a continuation byte */
				return -1;
		}

		p += seq_len;
		remaining -= seq_len;
		len++;
	}

	return len;
}
308
309
findAsciidocTags(void)310 static void findAsciidocTags(void)
311 {
312 vString *name = vStringNew();
313 const unsigned char *line;
314 unsigned char in_block = '\0'; /* holds the block marking char or \0 if not in block */
315
316 nestingLevels = nestingLevelsNew(0);
317
318 while ((line = readLineFromInputFile()) != NULL)
319 {
320 if (is_anchor (line))
321 {
322 if (capture_anchor (line, NULL) != CORK_NIL)
323 {
324 vStringClear (name);
325 continue;
326 }
327 }
328
329 int line_len = strlen((const char*) line);
330 int name_len_bytes = vStringLength(name);
331 int name_len = utf8_strlen(vStringValue(name), name_len_bytes);
332
333 /* if the name doesn't look like UTF-8, assume one-byte charset */
334 if (name_len < 0) name_len = name_len_bytes;
335
336 /* if its a title underline, or a delimited block marking character */
337 if (line[0] == '=' || line[0] == '-' || line[0] == '~' ||
338 line[0] == '^' || line[0] == '+' || line[0] == '.' ||
339 line[0] == '*' || line[0] == '_' || line[0] == '/')
340 {
341 int n_same;
342 for (n_same = 1; line[n_same] == line[0]; ++n_same);
343
344 /* is it a two line title or a delimited block */
345 if (n_same == line_len)
346 {
347 /* if in a block, can't be block start or title, look for block end */
348 if (in_block)
349 {
350 if (line[0] == in_block) in_block = '\0';
351 }
352
353 /* if its a =_~^+ and the same length +-2 as the line before then its a title */
354 /* (except in the special case its a -- open block start line) */
355 else if ((line[0] == '=' || line[0] == '-' || line[0] == '~' ||
356 line[0] == '^' || line[0] == '+') &&
357 line_len <= name_len + 2 && line_len >= name_len - 2 &&
358 !(line_len == 2 && line[0] == '-'))
359 {
360 int kind = get_kind((char)(line[0]));
361 if (kind >= 0)
362 {
363 makeSectionAsciidocTag(name, kind, true);
364 continue;
365 }
366 }
367
368 /* else if its 4 or more /+-.*_= (plus the -- special case) its a block start */
369 else if (((line[0] == '/' || line[0] == '+' || line[0] == '-' ||
370 line[0] == '.' || line[0] == '*' || line[0] == '_' ||
371 line[0] == '=') && line_len >= 4 )
372 || (line[0] == '-' && line_len == 2))
373 {
374 in_block = line[0];
375 }
376 }
377
378 /* otherwise is it a one line title */
379 else if (line[0] == '=' && n_same <= 6 && isspace(line[n_same]) &&
380 !in_block)
381 {
382 int kind = n_same - 1;
383 process_name(name, kind, line, line_len);
384 makeSectionAsciidocTag(name, kind, false);
385 continue;
386 }
387 }
388 vStringClear(name);
389 if (! isspace(*line))
390 vStringCatS(name, (const char*) line);
391 }
392 vStringDelete(name);
393 nestingLevelsFree(nestingLevels);
394 }
395
AsciidocParser(void)396 extern parserDefinition* AsciidocParser (void)
397 {
398 static const char *const patterns [] = { "*.asc", "*.adoc", "*.asciidoc", NULL };
399 static const char *const extensions [] = { "asc", "adoc", "asciidoc", NULL };
400
401 parserDefinition* const def = parserNew ("Asciidoc");
402
403 def->kindTable = AsciidocKinds;
404 def->kindCount = ARRAY_SIZE (AsciidocKinds);
405 def->patterns = patterns;
406 def->extensions = extensions;
407 def->parser = findAsciidocTags;
408 /* do we even need to use Cork? */
409 def->useCork = CORK_QUEUE;
410
411 return def;
412 }
413