xref: /Universal-ctags/parsers/bibtex.c (revision a5ce885dcffd63ba471c4f69c803b372b0af734c)
1 /*
2  *	 Copyright (c) 2008, David Fishburn
3  *	 Copyright (c) 2012, Jan Larres
4  *	 Copyright (c) 2019, Mirco Schönfeld
5  *
6  *	 This source code is released for free distribution under the terms of the
7  *	 GNU General Public License version 2 or (at your option) any later version.
8  *
9  *	 This module contains functions for generating identifiers of entries of Bibtex language files.
10  *
11  *	 BibTex language "reference":
12  *		 https://en.wikipedia.org/wiki/BibTeX
13  */
14 
15 /*
16  *	 INCLUDE FILES
17  */
18 #include "general.h"	/* must always come first */
19 #include <ctype.h>	/* to define isalpha () */
20 #include <string.h>
21 
22 #include "debug.h"
23 #include "entry.h"
24 #include "keyword.h"
25 #include "parse.h"
26 #include "read.h"
27 #include "routines.h"
28 #include "vstring.h"
29 
30 /*
31  *	 MACROS
32  */
/* True when the `type' member of TOKEN equals T. */
#define isType(token,t)		(bool) ((token)->type == (t))
/* True when the `keyword' member of TOKEN equals K. */
#define isKeyword(token,k)	(bool) ((token)->keyword == (k))
/*
 * True for the characters accepted inside an identifier: letters, digits
 * and any of ':', '+', '-', '_'.  The argument is expanded several times,
 * so it must not have side effects.
 */
#define isIdentChar(c) \
	(isalnum (c) || (c) == ':' || (c) == '+' || (c) == '-' || (c) == '_')
37 
38 /*
39  *	 DATA DECLARATIONS
40  */
41 
42 /*
43  * Used to specify type of keyword.
44  */
/* One id per BibTeX entry type recognized after '@', in alphabetical order. */
enum eKeywordId {
	KEYWORD_article = 0,
	KEYWORD_book,
	KEYWORD_booklet,
	KEYWORD_conference,
	KEYWORD_inbook,
	KEYWORD_incollection,
	KEYWORD_inproceedings,
	KEYWORD_manual,
	KEYWORD_mastersthesis,
	KEYWORD_misc,
	KEYWORD_phdthesis,
	KEYWORD_proceedings,
	KEYWORD_string,
	KEYWORD_techreport,
	KEYWORD_unpublished
};
/* Plain int rather than the enum so that KEYWORD_NONE can be stored too. */
typedef int keywordId;
63 
enum eTokenType {
	/* Values 0..255 are the value of the byte itself; some get a name. */
	TOKEN_OPEN_CURLY = '{',
	/* Values from 256 upwards are token classes, not single bytes. */
	TOKEN_UNDEFINED  = 256,
	TOKEN_KEYWORD    = 257,
	TOKEN_IDENTIFIER = 258
};
/* Plain int so that any byte value can be stored as a token type. */
typedef int tokenType;
73 
74 typedef struct sTokenInfo {
75 	tokenType		type;
76 	keywordId		keyword;
77 	vString *		string;
78 	unsigned long 	lineNumber;
79 	MIOPos 			filePosition;
80 } tokenInfo;
81 
82 /*
83  *	DATA DEFINITIONS
84  */
85 
86 static langType Lang_bib;
87 
/*
 * Tag kinds produced by this parser.  Each value is the index of the
 * matching row in BibKinds; BIBTAG_COUNT is the number of kinds.
 */
typedef enum {
	BIBTAG_ARTICLE = 0,
	BIBTAG_BOOK,
	BIBTAG_BOOKLET,
	BIBTAG_CONFERENCE,
	BIBTAG_INBOOK,
	BIBTAG_INCOLLECTION,
	BIBTAG_INPROCEEDINGS,
	BIBTAG_MANUAL,
	BIBTAG_MASTERSTHESIS,
	BIBTAG_MISC,
	BIBTAG_PHDTHESIS,
	BIBTAG_PROCEEDINGS,
	BIBTAG_STRING,
	BIBTAG_TECHREPORT,
	BIBTAG_UNPUBLISHED,
	BIBTAG_COUNT
} bibKind;
106 
107 static kindDefinition BibKinds [] = {
108 	{ true,  'a', "article",				"article"				},
109 	{ true,  'b', "book",						"book"					},
110 	{ true,  'B', "booklet",				"booklet"				},
111 	{ true,  'c', "conference",			"conference"		},
112 	{ true,  'i', "inbook",					"inbook"				},
113 	{ true,  'I', "incollection",		"incollection"	},
114 	{ true,  'j', "inproceedings",	"inproceedings"	},
115 	{ true,  'm', "manual",					"manual"				},
116 	{ true,  'M', "mastersthesis",	"mastersthesis"	},
117 	{ true,  'n', "misc",						"misc"					},
118 	{ true,  'p', "phdthesis",			"phdthesis"			},
119 	{ true,  'P', "proceedings",		"proceedings"		},
120 	{ true,  's', "string",					"string"				},
121 	{ true,  't', "techreport",			"techreport"		},
122 	{ true,  'u', "unpublished",		"unpublished"		}
123 };
124 
125 static const keywordTable BibKeywordTable [] = {
126 	/* keyword			  keyword ID */
127 	{ "article",	    KEYWORD_article				},
128 	{ "book",	        KEYWORD_book				  },
129 	{ "booklet",	    KEYWORD_booklet				},
130 	{ "conference",	  KEYWORD_conference		},
131 	{ "inbook",	      KEYWORD_inbook				},
132 	{ "incollection",	KEYWORD_incollection	},
133 	{ "inproceedings",KEYWORD_inproceedings	},
134 	{ "manual",	      KEYWORD_manual				},
135 	{ "mastersthesis",KEYWORD_mastersthesis	},
136 	{ "misc",	        KEYWORD_misc				  },
137 	{ "phdthesis",	  KEYWORD_phdthesis			},
138 	{ "proceedings",	KEYWORD_proceedings		},
139 	{ "string",				KEYWORD_string				},
140 	{ "techreport",	  KEYWORD_techreport		},
141 	{ "unpublished",	KEYWORD_unpublished		}
142 };
143 
144 /*
145  *	 FUNCTION DEFINITIONS
146  */
147 
newToken(void)148 static tokenInfo *newToken (void)
149 {
150 	tokenInfo *const token = xMalloc (1, tokenInfo);
151 
152 	token->type			= TOKEN_UNDEFINED;
153 	token->keyword		= KEYWORD_NONE;
154 	token->string		= vStringNew ();
155 	token->lineNumber   = getInputLineNumber ();
156 	token->filePosition = getInputFilePosition ();
157 
158 	return token;
159 }
160 
/* Releases a token obtained from newToken (), including its string. */
static void deleteToken (tokenInfo *const token)
{
	vString *const text = token->string;

	/* The string is owned by the token, so it goes first. */
	vStringDelete (text);
	eFree (token);
}
166 
167 /*
168  *	 Tag generation functions
169  */
/*
 * Emits a tag of the given kind named after the token's string and located
 * at the token's recorded line number and file position.  Nothing is
 * emitted when the kind is disabled in BibKinds.
 */
static void makeBibTag (tokenInfo *const token, bibKind kind)
{
	tagEntryInfo entry;

	if (! BibKinds [kind].enabled)
		return;

	initTagEntry (&entry, vStringValue (token->string), kind);
	entry.lineNumber   = token->lineNumber;
	entry.filePosition = token->filePosition;
	makeTagEntry (&entry);
}
184 
185 /*
186  *	 Parsing functions
187  */
188 
189 /*
190  *	Read a C identifier beginning with "firstChar" and places it into
191  *	"name".
192  */
parseIdentifier(vString * const string,const int firstChar)193 static void parseIdentifier (vString *const string, const int firstChar)
194 {
195 	int c = firstChar;
196 	Assert (isIdentChar (c));
197 	do
198 	{
199 		vStringPut (string, c);
200 		c = getcFromInputFile ();
201 	} while (c != EOF && isIdentChar (c));
202 	if (c != EOF)
203 		ungetcToInputFile (c);		/* unget non-identifier character */
204 }
205 
/*
 * Reads the next token from the input into "token".
 *
 * Blanks, tabs, newlines and '%' comments (which run to the end of the
 * line) are skipped.  Then:
 *   - '@' followed by a letter starts an entry type.  The token string is
 *     '@' plus the identifier; the part after '@' is looked up in the
 *     keyword table, giving TOKEN_KEYWORD on a match and TOKEN_IDENTIFIER
 *     otherwise.  An '@' not followed by a letter is returned as a plain
 *     byte token.
 *   - any other identifier character starts a TOKEN_IDENTIFIER.
 *   - every other byte is returned with its own value as the token type.
 *
 * Returns false at end of input and true otherwise.
 */
static bool readToken (tokenInfo *const token)
{
	int ch;

	token->type    = TOKEN_UNDEFINED;
	token->keyword = KEYWORD_NONE;
	vStringClear (token->string);

	for (;;)
	{
		do
			ch = getcFromInputFile ();
		while (ch == ' ' || ch == '\t' || ch == '\n');

		token->lineNumber   = getInputLineNumber ();
		token->filePosition = getInputFilePosition ();
		token->type         = (unsigned char) ch;

		if (ch != '%')
			break;

		/* '%' introduces a single line comment: drop it and start over */
		skipToCharacterInInputFile ('\n');
	}

	if (ch == EOF)
		return false;

	if (ch == '@')
	{
		/* only "@<letter>..." can be the start of an entry */
		const int next = getcFromInputFile ();

		if (isalpha (next))
		{
			vStringPut (token->string, '@');
			parseIdentifier (token->string, next);
			/* the "+ 1" skips the '@' for the keyword lookup */
			token->keyword = lookupCaseKeyword (vStringValue (token->string) + 1, Lang_bib);
			token->type = isKeyword (token, KEYWORD_NONE)
				? TOKEN_IDENTIFIER
				: TOKEN_KEYWORD;
		}
		else
			ungetcToInputFile (next);
	}
	else if (isIdentChar (ch))
	{
		parseIdentifier (token->string, ch);
		token->type = TOKEN_IDENTIFIER;
	}

	return true;
}
264 
/* Copies every field of "src" into "dest"; the string contents are duplicated. */
static void copyToken (tokenInfo *const dest, tokenInfo *const src)
{
	vStringCopy (dest->string, src->string);
	dest->type         = src->type;
	dest->keyword      = src->keyword;
	dest->lineNumber   = src->lineNumber;
	dest->filePosition = src->filePosition;
}
273 
274 /*
275  *	 Scanning functions
276  */
277 
/*
 * Handles one entry whose entry-type keyword is the current token and
 * emits a tag of the given kind for its citation key.
 *
 * Entries look like this:
 *   @article{identifier,
 *   author="John Doe"}
 *
 * The tag is named after the identifier that follows the opening brace but
 * carries the line number and file position of the entry-type keyword.
 *
 * Returns true when scanning must stop: either the input ended, or the
 * opening brace was not followed by an identifier.  Returns false otherwise.
 */
static bool parseTag (tokenInfo *const token, bibKind kind)
{
	tokenInfo *const entry = newToken ();
	vString *const key = vStringNew ();
	bool stop = false;

	if (isType (token, TOKEN_KEYWORD))
	{
		/* remember where the entry starts, then move past the keyword */
		copyToken (entry, token);
		stop = ! readToken (token);
	}

	if (! stop && isType (token, TOKEN_OPEN_CURLY))
	{
		/* the citation key has to come right after the brace */
		if (! readToken (token) || ! isType (token, TOKEN_IDENTIFIER))
			stop = true;
		else
		{
			vStringCat (key, token->string);
			vStringStripTrailing (key);
			if (vStringLength (key) > 0)
			{
				vStringCopy (entry->string, key);
				makeBibTag (entry, kind);
			}
		}
	}

	deleteToken (entry);
	vStringDelete (key);
	return stop;
}
332 
/*
 * Scans the whole input: every entry-type keyword is handed to parseTag ()
 * together with the tag kind that belongs to it.  The scan ends at end of
 * input or as soon as parseTag () asks to stop.
 */
static void parseBibFile (tokenInfo *const token)
{
	/* tag kind for each entry-type keyword, indexed by keyword id */
	static const bibKind kindOfKeyword [] = {
		[KEYWORD_article]       = BIBTAG_ARTICLE,
		[KEYWORD_book]          = BIBTAG_BOOK,
		[KEYWORD_booklet]       = BIBTAG_BOOKLET,
		[KEYWORD_conference]    = BIBTAG_CONFERENCE,
		[KEYWORD_inbook]        = BIBTAG_INBOOK,
		[KEYWORD_incollection]  = BIBTAG_INCOLLECTION,
		[KEYWORD_inproceedings] = BIBTAG_INPROCEEDINGS,
		[KEYWORD_manual]        = BIBTAG_MANUAL,
		[KEYWORD_mastersthesis] = BIBTAG_MASTERSTHESIS,
		[KEYWORD_misc]          = BIBTAG_MISC,
		[KEYWORD_phdthesis]     = BIBTAG_PHDTHESIS,
		[KEYWORD_proceedings]   = BIBTAG_PROCEEDINGS,
		[KEYWORD_string]        = BIBTAG_STRING,
		[KEYWORD_techreport]    = BIBTAG_TECHREPORT,
		[KEYWORD_unpublished]   = BIBTAG_UNPUBLISHED
	};

	while (readToken (token))
	{
		keywordId kw;

		if (! isType (token, TOKEN_KEYWORD))
			continue;

		/* ids outside the table are ignored */
		kw = token->keyword;
		if (kw < 0 || kw >= (keywordId) ARRAY_SIZE (kindOfKeyword))
			continue;

		if (parseTag (token, kindOfKeyword [kw]))
			break;
	}
}
399 
initialize(const langType language)400 static void initialize (const langType language)
401 {
402 	Lang_bib = language;
403 }
404 
findBibTags(void)405 static void findBibTags (void)
406 {
407 	tokenInfo *const token = newToken ();
408 
409 	parseBibFile (token);
410 
411 	deleteToken (token);
412 }
413 
414 /* Create parser definition structure */
BibtexParser(void)415 extern parserDefinition* BibtexParser (void)
416 {
417 	Assert (ARRAY_SIZE (BibKinds) == BIBTAG_COUNT);
418 	static const char *const extensions [] = { "bib", NULL };
419 	parserDefinition *const def = parserNew ("BibTeX");
420 	def->extensions = extensions;
421 	/*
422 	 * New definitions for parsing instead of regex
423 	 */
424 	def->kindTable		= BibKinds;
425 	def->kindCount		= ARRAY_SIZE (BibKinds);
426 	def->parser				= findBibTags;
427 	def->initialize		= initialize;
428 	def->keywordTable	= BibKeywordTable;
429 	def->keywordCount	= ARRAY_SIZE (BibKeywordTable);
430 	return def;
431 }
432