1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20 static void re_string_construct_common (const char *str, Idx len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa);
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash);
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 re_hashval_t hash);
31 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 Idx new_buf_len);
33 #ifdef RE_ENABLE_I18N
34 static void build_wcs_buffer (re_string_t *pstr);
35 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36 #endif /* RE_ENABLE_I18N */
37 static void build_upper_buffer (re_string_t *pstr);
38 static void re_string_translate_buffer (re_string_t *pstr);
39 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40 int eflags) __attribute__ ((pure));
41
42 /* Functions for string operation. */
43
44 /* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
46
47 static reg_errcode_t
48 __attribute_warn_unused_result__
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)49 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
50 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
51 {
52 reg_errcode_t ret;
53 Idx init_buf_len;
54
55 /* Ensure at least one character fits into the buffers. */
56 if (init_len < dfa->mb_cur_max)
57 init_len = dfa->mb_cur_max;
58 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
59 re_string_construct_common (str, len, pstr, trans, icase, dfa);
60
61 ret = re_string_realloc_buffers (pstr, init_buf_len);
62 if (__glibc_unlikely (ret != REG_NOERROR))
63 return ret;
64
65 pstr->word_char = dfa->word_char;
66 pstr->word_ops_used = dfa->word_ops_used;
67 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
69 pstr->valid_raw_len = pstr->valid_len;
70 return REG_NOERROR;
71 }
72
73 /* This function allocate the buffers, and initialize them. */
74
75 static reg_errcode_t
76 __attribute_warn_unused_result__
re_string_construct(re_string_t * pstr,const char * str,Idx len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)77 re_string_construct (re_string_t *pstr, const char *str, Idx len,
78 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
79 {
80 reg_errcode_t ret;
81 memset (pstr, '\0', sizeof (re_string_t));
82 re_string_construct_common (str, len, pstr, trans, icase, dfa);
83
84 if (len > 0)
85 {
86 ret = re_string_realloc_buffers (pstr, len + 1);
87 if (__glibc_unlikely (ret != REG_NOERROR))
88 return ret;
89 }
90 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
91
92 if (icase)
93 {
94 #ifdef RE_ENABLE_I18N
95 if (dfa->mb_cur_max > 1)
96 {
97 while (1)
98 {
99 ret = build_wcs_upper_buffer (pstr);
100 if (__glibc_unlikely (ret != REG_NOERROR))
101 return ret;
102 if (pstr->valid_raw_len >= len)
103 break;
104 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105 break;
106 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
107 if (__glibc_unlikely (ret != REG_NOERROR))
108 return ret;
109 }
110 }
111 else
112 #endif /* RE_ENABLE_I18N */
113 build_upper_buffer (pstr);
114 }
115 else
116 {
117 #ifdef RE_ENABLE_I18N
118 if (dfa->mb_cur_max > 1)
119 build_wcs_buffer (pstr);
120 else
121 #endif /* RE_ENABLE_I18N */
122 {
123 if (trans != NULL)
124 re_string_translate_buffer (pstr);
125 else
126 {
127 pstr->valid_len = pstr->bufs_len;
128 pstr->valid_raw_len = pstr->bufs_len;
129 }
130 }
131 }
132
133 return REG_NOERROR;
134 }
135
136 /* Helper functions for re_string_allocate, and re_string_construct. */
137
138 static reg_errcode_t
139 __attribute_warn_unused_result__
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)140 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
141 {
142 #ifdef RE_ENABLE_I18N
143 if (pstr->mb_cur_max > 1)
144 {
145 wint_t *new_wcs;
146
147 /* Avoid overflow in realloc. */
148 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
150 < new_buf_len))
151 return REG_ESPACE;
152
153 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
154 if (__glibc_unlikely (new_wcs == NULL))
155 return REG_ESPACE;
156 pstr->wcs = new_wcs;
157 if (pstr->offsets != NULL)
158 {
159 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
160 if (__glibc_unlikely (new_offsets == NULL))
161 return REG_ESPACE;
162 pstr->offsets = new_offsets;
163 }
164 }
165 #endif /* RE_ENABLE_I18N */
166 if (pstr->mbs_allocated)
167 {
168 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
169 new_buf_len);
170 if (__glibc_unlikely (new_mbs == NULL))
171 return REG_ESPACE;
172 pstr->mbs = new_mbs;
173 }
174 pstr->bufs_len = new_buf_len;
175 return REG_NOERROR;
176 }
177
178
179 static void
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)180 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
181 RE_TRANSLATE_TYPE trans, bool icase,
182 const re_dfa_t *dfa)
183 {
184 pstr->raw_mbs = (const unsigned char *) str;
185 pstr->len = len;
186 pstr->raw_len = len;
187 pstr->trans = trans;
188 pstr->icase = icase;
189 pstr->mbs_allocated = (trans != NULL || icase);
190 pstr->mb_cur_max = dfa->mb_cur_max;
191 pstr->is_utf8 = dfa->is_utf8;
192 pstr->map_notascii = dfa->map_notascii;
193 pstr->stop = pstr->len;
194 pstr->raw_stop = pstr->stop;
195 }
196
197 #ifdef RE_ENABLE_I18N
198
199 /* Build wide character buffer PSTR->WCS.
200 If the byte sequence of the string are:
201 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
202 Then wide character buffer will be:
203 <wc1> , WEOF , <wc2> , WEOF , <wc3>
204 We use WEOF for padding, they indicate that the position isn't
205 a first byte of a multibyte character.
206
207 Note that this function assumes PSTR->VALID_LEN elements are already
208 built and starts from PSTR->VALID_LEN. */
209
210 static void
build_wcs_buffer(re_string_t * pstr)211 build_wcs_buffer (re_string_t *pstr)
212 {
213 #ifdef _LIBC
214 unsigned char buf[MB_LEN_MAX];
215 DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
216 #else
217 unsigned char buf[64];
218 #endif
219 mbstate_t prev_st;
220 Idx byte_idx, end_idx, remain_len;
221 size_t mbclen;
222
223 /* Build the buffers from pstr->valid_len to either pstr->len or
224 pstr->bufs_len. */
225 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
226 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
227 {
228 wchar_t wc;
229 const char *p;
230
231 remain_len = end_idx - byte_idx;
232 prev_st = pstr->cur_state;
233 /* Apply the translation if we need. */
234 if (__glibc_unlikely (pstr->trans != NULL))
235 {
236 int i, ch;
237
238 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
239 {
240 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
241 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
242 }
243 p = (const char *) buf;
244 }
245 else
246 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
247 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
248 if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
249 || (mbclen == (size_t) -2
250 && pstr->bufs_len >= pstr->len)))
251 {
252 /* We treat these cases as a singlebyte character. */
253 mbclen = 1;
254 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
255 if (__glibc_unlikely (pstr->trans != NULL))
256 wc = pstr->trans[wc];
257 pstr->cur_state = prev_st;
258 }
259 else if (__glibc_unlikely (mbclen == (size_t) -2))
260 {
261 /* The buffer doesn't have enough space, finish to build. */
262 pstr->cur_state = prev_st;
263 break;
264 }
265
266 /* Write wide character and padding. */
267 pstr->wcs[byte_idx++] = wc;
268 /* Write paddings. */
269 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
270 pstr->wcs[byte_idx++] = WEOF;
271 }
272 pstr->valid_len = byte_idx;
273 pstr->valid_raw_len = byte_idx;
274 }
275
276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277 but for REG_ICASE. */
278
279 static reg_errcode_t
280 __attribute_warn_unused_result__
build_wcs_upper_buffer(re_string_t * pstr)281 build_wcs_upper_buffer (re_string_t *pstr)
282 {
283 mbstate_t prev_st;
284 Idx src_idx, byte_idx, end_idx, remain_len;
285 size_t mbclen;
286 #ifdef _LIBC
287 char buf[MB_LEN_MAX];
288 DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
289 #else
290 char buf[64];
291 #endif
292
293 byte_idx = pstr->valid_len;
294 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
295
296 /* The following optimization assumes that ASCII characters can be
297 mapped to wide characters with a simple cast. */
298 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
299 {
300 while (byte_idx < end_idx)
301 {
302 wchar_t wc;
303 unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
304
305 if (isascii (ch) && mbsinit (&pstr->cur_state))
306 {
307 /* The next step uses the assumption that wchar_t is encoded
308 ASCII-safe: all ASCII values can be converted like this. */
309 wchar_t wcu = __towupper (ch);
310 if (isascii (wcu))
311 {
312 pstr->mbs[byte_idx] = wcu;
313 pstr->wcs[byte_idx] = wcu;
314 byte_idx++;
315 continue;
316 }
317 }
318
319 remain_len = end_idx - byte_idx;
320 prev_st = pstr->cur_state;
321 mbclen = __mbrtowc (&wc,
322 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
323 + byte_idx), remain_len, &pstr->cur_state);
324 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
325 {
326 wchar_t wcu = __towupper (wc);
327 if (wcu != wc)
328 {
329 size_t mbcdlen;
330
331 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
332 if (__glibc_likely (mbclen == mbcdlen))
333 memcpy (pstr->mbs + byte_idx, buf, mbclen);
334 else
335 {
336 src_idx = byte_idx;
337 goto offsets_needed;
338 }
339 }
340 else
341 memcpy (pstr->mbs + byte_idx,
342 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
343 pstr->wcs[byte_idx++] = wcu;
344 /* Write paddings. */
345 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
346 pstr->wcs[byte_idx++] = WEOF;
347 }
348 else if (mbclen == (size_t) -1 || mbclen == 0
349 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
350 {
351 /* It is an invalid character, an incomplete character
352 at the end of the string, or '\0'. Just use the byte. */
353 pstr->mbs[byte_idx] = ch;
354 /* And also cast it to wide char. */
355 pstr->wcs[byte_idx++] = (wchar_t) ch;
356 if (__glibc_unlikely (mbclen == (size_t) -1))
357 pstr->cur_state = prev_st;
358 }
359 else
360 {
361 /* The buffer doesn't have enough space, finish to build. */
362 pstr->cur_state = prev_st;
363 break;
364 }
365 }
366 pstr->valid_len = byte_idx;
367 pstr->valid_raw_len = byte_idx;
368 return REG_NOERROR;
369 }
370 else
371 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
372 {
373 wchar_t wc;
374 const char *p;
375 offsets_needed:
376 remain_len = end_idx - byte_idx;
377 prev_st = pstr->cur_state;
378 if (__glibc_unlikely (pstr->trans != NULL))
379 {
380 int i, ch;
381
382 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
383 {
384 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
385 buf[i] = pstr->trans[ch];
386 }
387 p = (const char *) buf;
388 }
389 else
390 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
391 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
392 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
393 {
394 wchar_t wcu = __towupper (wc);
395 if (wcu != wc)
396 {
397 size_t mbcdlen;
398
399 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
400 if (__glibc_likely (mbclen == mbcdlen))
401 memcpy (pstr->mbs + byte_idx, buf, mbclen);
402 else if (mbcdlen != (size_t) -1)
403 {
404 size_t i;
405
406 if (byte_idx + mbcdlen > pstr->bufs_len)
407 {
408 pstr->cur_state = prev_st;
409 break;
410 }
411
412 if (pstr->offsets == NULL)
413 {
414 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
415
416 if (pstr->offsets == NULL)
417 return REG_ESPACE;
418 }
419 if (!pstr->offsets_needed)
420 {
421 for (i = 0; i < (size_t) byte_idx; ++i)
422 pstr->offsets[i] = i;
423 pstr->offsets_needed = 1;
424 }
425
426 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
427 pstr->wcs[byte_idx] = wcu;
428 pstr->offsets[byte_idx] = src_idx;
429 for (i = 1; i < mbcdlen; ++i)
430 {
431 pstr->offsets[byte_idx + i]
432 = src_idx + (i < mbclen ? i : mbclen - 1);
433 pstr->wcs[byte_idx + i] = WEOF;
434 }
435 pstr->len += mbcdlen - mbclen;
436 if (pstr->raw_stop > src_idx)
437 pstr->stop += mbcdlen - mbclen;
438 end_idx = (pstr->bufs_len > pstr->len)
439 ? pstr->len : pstr->bufs_len;
440 byte_idx += mbcdlen;
441 src_idx += mbclen;
442 continue;
443 }
444 else
445 memcpy (pstr->mbs + byte_idx, p, mbclen);
446 }
447 else
448 memcpy (pstr->mbs + byte_idx, p, mbclen);
449
450 if (__glibc_unlikely (pstr->offsets_needed != 0))
451 {
452 size_t i;
453 for (i = 0; i < mbclen; ++i)
454 pstr->offsets[byte_idx + i] = src_idx + i;
455 }
456 src_idx += mbclen;
457
458 pstr->wcs[byte_idx++] = wcu;
459 /* Write paddings. */
460 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
461 pstr->wcs[byte_idx++] = WEOF;
462 }
463 else if (mbclen == (size_t) -1 || mbclen == 0
464 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
465 {
466 /* It is an invalid character or '\0'. Just use the byte. */
467 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
468
469 if (__glibc_unlikely (pstr->trans != NULL))
470 ch = pstr->trans [ch];
471 pstr->mbs[byte_idx] = ch;
472
473 if (__glibc_unlikely (pstr->offsets_needed != 0))
474 pstr->offsets[byte_idx] = src_idx;
475 ++src_idx;
476
477 /* And also cast it to wide char. */
478 pstr->wcs[byte_idx++] = (wchar_t) ch;
479 if (__glibc_unlikely (mbclen == (size_t) -1))
480 pstr->cur_state = prev_st;
481 }
482 else
483 {
484 /* The buffer doesn't have enough space, finish to build. */
485 pstr->cur_state = prev_st;
486 break;
487 }
488 }
489 pstr->valid_len = byte_idx;
490 pstr->valid_raw_len = src_idx;
491 return REG_NOERROR;
492 }
493
494 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
495 Return the index. */
496
497 static Idx
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)498 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
499 {
500 mbstate_t prev_st;
501 Idx rawbuf_idx;
502 size_t mbclen;
503 wint_t wc = WEOF;
504
505 /* Skip the characters which are not necessary to check. */
506 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
507 rawbuf_idx < new_raw_idx;)
508 {
509 wchar_t wc2;
510 Idx remain_len = pstr->raw_len - rawbuf_idx;
511 prev_st = pstr->cur_state;
512 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
513 remain_len, &pstr->cur_state);
514 if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
515 || mbclen == 0))
516 {
517 /* We treat these cases as a single byte character. */
518 if (mbclen == 0 || remain_len == 0)
519 wc = L'\0';
520 else
521 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
522 mbclen = 1;
523 pstr->cur_state = prev_st;
524 }
525 else
526 wc = wc2;
527 /* Then proceed the next character. */
528 rawbuf_idx += mbclen;
529 }
530 *last_wc = wc;
531 return rawbuf_idx;
532 }
533 #endif /* RE_ENABLE_I18N */
534
535 /* Build the buffer PSTR->MBS, and apply the translation if we need.
536 This function is used in case of REG_ICASE. */
537
538 static void
build_upper_buffer(re_string_t * pstr)539 build_upper_buffer (re_string_t *pstr)
540 {
541 Idx char_idx, end_idx;
542 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
543
544 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
545 {
546 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
547 if (__glibc_unlikely (pstr->trans != NULL))
548 ch = pstr->trans[ch];
549 pstr->mbs[char_idx] = toupper (ch);
550 }
551 pstr->valid_len = char_idx;
552 pstr->valid_raw_len = char_idx;
553 }
554
555 /* Apply TRANS to the buffer in PSTR. */
556
557 static void
re_string_translate_buffer(re_string_t * pstr)558 re_string_translate_buffer (re_string_t *pstr)
559 {
560 Idx buf_idx, end_idx;
561 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
562
563 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
564 {
565 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
566 pstr->mbs[buf_idx] = pstr->trans[ch];
567 }
568
569 pstr->valid_len = buf_idx;
570 pstr->valid_raw_len = buf_idx;
571 }
572
573 /* This function re-construct the buffers.
574 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
575 convert to upper case in case of REG_ICASE, apply translation. */
576
577 static reg_errcode_t
578 __attribute_warn_unused_result__
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)579 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
580 {
581 Idx offset;
582
583 if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
584 offset = idx - pstr->raw_mbs_idx;
585 else
586 {
587 /* Reset buffer. */
588 #ifdef RE_ENABLE_I18N
589 if (pstr->mb_cur_max > 1)
590 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
591 #endif /* RE_ENABLE_I18N */
592 pstr->len = pstr->raw_len;
593 pstr->stop = pstr->raw_stop;
594 pstr->valid_len = 0;
595 pstr->raw_mbs_idx = 0;
596 pstr->valid_raw_len = 0;
597 pstr->offsets_needed = 0;
598 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
599 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
600 if (!pstr->mbs_allocated)
601 pstr->mbs = (unsigned char *) pstr->raw_mbs;
602 offset = idx;
603 }
604
605 if (__glibc_likely (offset != 0))
606 {
607 /* Should the already checked characters be kept? */
608 if (__glibc_likely (offset < pstr->valid_raw_len))
609 {
610 /* Yes, move them to the front of the buffer. */
611 #ifdef RE_ENABLE_I18N
612 if (__glibc_unlikely (pstr->offsets_needed))
613 {
614 Idx low = 0, high = pstr->valid_len, mid;
615 do
616 {
617 mid = (high + low) / 2;
618 if (pstr->offsets[mid] > offset)
619 high = mid;
620 else if (pstr->offsets[mid] < offset)
621 low = mid + 1;
622 else
623 break;
624 }
625 while (low < high);
626 if (pstr->offsets[mid] < offset)
627 ++mid;
628 pstr->tip_context = re_string_context_at (pstr, mid - 1,
629 eflags);
630 /* This can be quite complicated, so handle specially
631 only the common and easy case where the character with
632 different length representation of lower and upper
633 case is present at or after offset. */
634 if (pstr->valid_len > offset
635 && mid == offset && pstr->offsets[mid] == offset)
636 {
637 memmove (pstr->wcs, pstr->wcs + offset,
638 (pstr->valid_len - offset) * sizeof (wint_t));
639 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
640 pstr->valid_len -= offset;
641 pstr->valid_raw_len -= offset;
642 for (low = 0; low < pstr->valid_len; low++)
643 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
644 }
645 else
646 {
647 /* Otherwise, just find out how long the partial multibyte
648 character at offset is and fill it with WEOF/255. */
649 pstr->len = pstr->raw_len - idx + offset;
650 pstr->stop = pstr->raw_stop - idx + offset;
651 pstr->offsets_needed = 0;
652 while (mid > 0 && pstr->offsets[mid - 1] == offset)
653 --mid;
654 while (mid < pstr->valid_len)
655 if (pstr->wcs[mid] != WEOF)
656 break;
657 else
658 ++mid;
659 if (mid == pstr->valid_len)
660 pstr->valid_len = 0;
661 else
662 {
663 pstr->valid_len = pstr->offsets[mid] - offset;
664 if (pstr->valid_len)
665 {
666 for (low = 0; low < pstr->valid_len; ++low)
667 pstr->wcs[low] = WEOF;
668 memset (pstr->mbs, 255, pstr->valid_len);
669 }
670 }
671 pstr->valid_raw_len = pstr->valid_len;
672 }
673 }
674 else
675 #endif
676 {
677 pstr->tip_context = re_string_context_at (pstr, offset - 1,
678 eflags);
679 #ifdef RE_ENABLE_I18N
680 if (pstr->mb_cur_max > 1)
681 memmove (pstr->wcs, pstr->wcs + offset,
682 (pstr->valid_len - offset) * sizeof (wint_t));
683 #endif /* RE_ENABLE_I18N */
684 if (__glibc_unlikely (pstr->mbs_allocated))
685 memmove (pstr->mbs, pstr->mbs + offset,
686 pstr->valid_len - offset);
687 pstr->valid_len -= offset;
688 pstr->valid_raw_len -= offset;
689 DEBUG_ASSERT (pstr->valid_len > 0);
690 }
691 }
692 else
693 {
694 #ifdef RE_ENABLE_I18N
695 /* No, skip all characters until IDX. */
696 Idx prev_valid_len = pstr->valid_len;
697
698 if (__glibc_unlikely (pstr->offsets_needed))
699 {
700 pstr->len = pstr->raw_len - idx + offset;
701 pstr->stop = pstr->raw_stop - idx + offset;
702 pstr->offsets_needed = 0;
703 }
704 #endif
705 pstr->valid_len = 0;
706 #ifdef RE_ENABLE_I18N
707 if (pstr->mb_cur_max > 1)
708 {
709 Idx wcs_idx;
710 wint_t wc = WEOF;
711
712 if (pstr->is_utf8)
713 {
714 const unsigned char *raw, *p, *end;
715
716 /* Special case UTF-8. Multi-byte chars start with any
717 byte other than 0x80 - 0xbf. */
718 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
719 end = raw + (offset - pstr->mb_cur_max);
720 if (end < pstr->raw_mbs)
721 end = pstr->raw_mbs;
722 p = raw + offset - 1;
723 #ifdef _LIBC
724 /* We know the wchar_t encoding is UCS4, so for the simple
725 case, ASCII characters, skip the conversion step. */
726 if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
727 {
728 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
729 /* pstr->valid_len = 0; */
730 wc = (wchar_t) *p;
731 }
732 else
733 #endif
734 for (; p >= end; --p)
735 if ((*p & 0xc0) != 0x80)
736 {
737 mbstate_t cur_state;
738 wchar_t wc2;
739 Idx mlen = raw + pstr->len - p;
740 unsigned char buf[6];
741 size_t mbclen;
742
743 const unsigned char *pp = p;
744 if (__glibc_unlikely (pstr->trans != NULL))
745 {
746 int i = mlen < 6 ? mlen : 6;
747 while (--i >= 0)
748 buf[i] = pstr->trans[p[i]];
749 pp = buf;
750 }
751 /* XXX Don't use mbrtowc, we know which conversion
752 to use (UTF-8 -> UCS4). */
753 memset (&cur_state, 0, sizeof (cur_state));
754 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
755 &cur_state);
756 if (raw + offset - p <= mbclen
757 && mbclen < (size_t) -2)
758 {
759 memset (&pstr->cur_state, '\0',
760 sizeof (mbstate_t));
761 pstr->valid_len = mbclen - (raw + offset - p);
762 wc = wc2;
763 }
764 break;
765 }
766 }
767
768 if (wc == WEOF)
769 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
770 if (wc == WEOF)
771 pstr->tip_context
772 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
773 else
774 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
775 && IS_WIDE_WORD_CHAR (wc))
776 ? CONTEXT_WORD
777 : ((IS_WIDE_NEWLINE (wc)
778 && pstr->newline_anchor)
779 ? CONTEXT_NEWLINE : 0));
780 if (__glibc_unlikely (pstr->valid_len))
781 {
782 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
783 pstr->wcs[wcs_idx] = WEOF;
784 if (pstr->mbs_allocated)
785 memset (pstr->mbs, 255, pstr->valid_len);
786 }
787 pstr->valid_raw_len = pstr->valid_len;
788 }
789 else
790 #endif /* RE_ENABLE_I18N */
791 {
792 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
793 pstr->valid_raw_len = 0;
794 if (pstr->trans)
795 c = pstr->trans[c];
796 pstr->tip_context = (bitset_contain (pstr->word_char, c)
797 ? CONTEXT_WORD
798 : ((IS_NEWLINE (c) && pstr->newline_anchor)
799 ? CONTEXT_NEWLINE : 0));
800 }
801 }
802 if (!__glibc_unlikely (pstr->mbs_allocated))
803 pstr->mbs += offset;
804 }
805 pstr->raw_mbs_idx = idx;
806 pstr->len -= offset;
807 pstr->stop -= offset;
808
809 /* Then build the buffers. */
810 #ifdef RE_ENABLE_I18N
811 if (pstr->mb_cur_max > 1)
812 {
813 if (pstr->icase)
814 {
815 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
816 if (__glibc_unlikely (ret != REG_NOERROR))
817 return ret;
818 }
819 else
820 build_wcs_buffer (pstr);
821 }
822 else
823 #endif /* RE_ENABLE_I18N */
824 if (__glibc_unlikely (pstr->mbs_allocated))
825 {
826 if (pstr->icase)
827 build_upper_buffer (pstr);
828 else if (pstr->trans != NULL)
829 re_string_translate_buffer (pstr);
830 }
831 else
832 pstr->valid_len = pstr->len;
833
834 pstr->cur_idx = 0;
835 return REG_NOERROR;
836 }
837
838 static unsigned char
839 __attribute__ ((pure))
re_string_peek_byte_case(const re_string_t * pstr,Idx idx)840 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
841 {
842 int ch;
843 Idx off;
844
845 /* Handle the common (easiest) cases first. */
846 if (__glibc_likely (!pstr->mbs_allocated))
847 return re_string_peek_byte (pstr, idx);
848
849 #ifdef RE_ENABLE_I18N
850 if (pstr->mb_cur_max > 1
851 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
852 return re_string_peek_byte (pstr, idx);
853 #endif
854
855 off = pstr->cur_idx + idx;
856 #ifdef RE_ENABLE_I18N
857 if (pstr->offsets_needed)
858 off = pstr->offsets[off];
859 #endif
860
861 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
862
863 #ifdef RE_ENABLE_I18N
864 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
865 this function returns CAPITAL LETTER I instead of first byte of
866 DOTLESS SMALL LETTER I. The latter would confuse the parser,
867 since peek_byte_case doesn't advance cur_idx in any way. */
868 if (pstr->offsets_needed && !isascii (ch))
869 return re_string_peek_byte (pstr, idx);
870 #endif
871
872 return ch;
873 }
874
875 static unsigned char
re_string_fetch_byte_case(re_string_t * pstr)876 re_string_fetch_byte_case (re_string_t *pstr)
877 {
878 if (__glibc_likely (!pstr->mbs_allocated))
879 return re_string_fetch_byte (pstr);
880
881 #ifdef RE_ENABLE_I18N
882 if (pstr->offsets_needed)
883 {
884 Idx off;
885 int ch;
886
887 /* For tr_TR.UTF-8 [[:islower:]] there is
888 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
889 in that case the whole multi-byte character and return
890 the original letter. On the other side, with
891 [[: DOTLESS SMALL LETTER I return [[:I, as doing
892 anything else would complicate things too much. */
893
894 if (!re_string_first_byte (pstr, pstr->cur_idx))
895 return re_string_fetch_byte (pstr);
896
897 off = pstr->offsets[pstr->cur_idx];
898 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
899
900 if (! isascii (ch))
901 return re_string_fetch_byte (pstr);
902
903 re_string_skip_bytes (pstr,
904 re_string_char_size_at (pstr, pstr->cur_idx));
905 return ch;
906 }
907 #endif
908
909 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
910 }
911
912 static void
re_string_destruct(re_string_t * pstr)913 re_string_destruct (re_string_t *pstr)
914 {
915 #ifdef RE_ENABLE_I18N
916 re_free (pstr->wcs);
917 re_free (pstr->offsets);
918 #endif /* RE_ENABLE_I18N */
919 if (pstr->mbs_allocated)
920 re_free (pstr->mbs);
921 }
922
923 /* Return the context at IDX in INPUT. */
924
925 static unsigned int
re_string_context_at(const re_string_t * input,Idx idx,int eflags)926 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
927 {
928 int c;
929 if (__glibc_unlikely (idx < 0))
930 /* In this case, we use the value stored in input->tip_context,
931 since we can't know the character in input->mbs[-1] here. */
932 return input->tip_context;
933 if (__glibc_unlikely (idx == input->len))
934 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
935 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
936 #ifdef RE_ENABLE_I18N
937 if (input->mb_cur_max > 1)
938 {
939 wint_t wc;
940 Idx wc_idx = idx;
941 while(input->wcs[wc_idx] == WEOF)
942 {
943 DEBUG_ASSERT (wc_idx >= 0);
944 --wc_idx;
945 if (wc_idx < 0)
946 return input->tip_context;
947 }
948 wc = input->wcs[wc_idx];
949 if (__glibc_unlikely (input->word_ops_used != 0)
950 && IS_WIDE_WORD_CHAR (wc))
951 return CONTEXT_WORD;
952 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
953 ? CONTEXT_NEWLINE : 0);
954 }
955 else
956 #endif
957 {
958 c = re_string_byte_at (input, idx);
959 if (bitset_contain (input->word_char, c))
960 return CONTEXT_WORD;
961 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
962 }
963 }
964
965 /* Functions for set operation. */
966
967 static reg_errcode_t
968 __attribute_warn_unused_result__
re_node_set_alloc(re_node_set * set,Idx size)969 re_node_set_alloc (re_node_set *set, Idx size)
970 {
971 set->alloc = size;
972 set->nelem = 0;
973 set->elems = re_malloc (Idx, size);
974 if (__glibc_unlikely (set->elems == NULL)
975 && (MALLOC_0_IS_NONNULL || size != 0))
976 return REG_ESPACE;
977 return REG_NOERROR;
978 }
979
980 static reg_errcode_t
981 __attribute_warn_unused_result__
re_node_set_init_1(re_node_set * set,Idx elem)982 re_node_set_init_1 (re_node_set *set, Idx elem)
983 {
984 set->alloc = 1;
985 set->nelem = 1;
986 set->elems = re_malloc (Idx, 1);
987 if (__glibc_unlikely (set->elems == NULL))
988 {
989 set->alloc = set->nelem = 0;
990 return REG_ESPACE;
991 }
992 set->elems[0] = elem;
993 return REG_NOERROR;
994 }
995
996 static reg_errcode_t
997 __attribute_warn_unused_result__
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)998 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
999 {
1000 set->alloc = 2;
1001 set->elems = re_malloc (Idx, 2);
1002 if (__glibc_unlikely (set->elems == NULL))
1003 return REG_ESPACE;
1004 if (elem1 == elem2)
1005 {
1006 set->nelem = 1;
1007 set->elems[0] = elem1;
1008 }
1009 else
1010 {
1011 set->nelem = 2;
1012 if (elem1 < elem2)
1013 {
1014 set->elems[0] = elem1;
1015 set->elems[1] = elem2;
1016 }
1017 else
1018 {
1019 set->elems[0] = elem2;
1020 set->elems[1] = elem1;
1021 }
1022 }
1023 return REG_NOERROR;
1024 }
1025
1026 static reg_errcode_t
1027 __attribute_warn_unused_result__
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1028 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1029 {
1030 dest->nelem = src->nelem;
1031 if (src->nelem > 0)
1032 {
1033 dest->alloc = dest->nelem;
1034 dest->elems = re_malloc (Idx, dest->alloc);
1035 if (__glibc_unlikely (dest->elems == NULL))
1036 {
1037 dest->alloc = dest->nelem = 0;
1038 return REG_ESPACE;
1039 }
1040 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1041 }
1042 else
1043 re_node_set_init_empty (dest);
1044 return REG_NOERROR;
1045 }
1046
1047 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1048 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1049 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1050
1051 static reg_errcode_t
1052 __attribute_warn_unused_result__
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1053 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1054 const re_node_set *src2)
1055 {
1056 Idx i1, i2, is, id, delta, sbase;
1057 if (src1->nelem == 0 || src2->nelem == 0)
1058 return REG_NOERROR;
1059
1060 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1061 conservative estimate. */
1062 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1063 {
1064 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1065 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1066 if (__glibc_unlikely (new_elems == NULL))
1067 return REG_ESPACE;
1068 dest->elems = new_elems;
1069 dest->alloc = new_alloc;
1070 }
1071
1072 /* Find the items in the intersection of SRC1 and SRC2, and copy
1073 into the top of DEST those that are not already in DEST itself. */
1074 sbase = dest->nelem + src1->nelem + src2->nelem;
1075 i1 = src1->nelem - 1;
1076 i2 = src2->nelem - 1;
1077 id = dest->nelem - 1;
1078 for (;;)
1079 {
1080 if (src1->elems[i1] == src2->elems[i2])
1081 {
1082 /* Try to find the item in DEST. Maybe we could binary search? */
1083 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1084 --id;
1085
1086 if (id < 0 || dest->elems[id] != src1->elems[i1])
1087 dest->elems[--sbase] = src1->elems[i1];
1088
1089 if (--i1 < 0 || --i2 < 0)
1090 break;
1091 }
1092
1093 /* Lower the highest of the two items. */
1094 else if (src1->elems[i1] < src2->elems[i2])
1095 {
1096 if (--i2 < 0)
1097 break;
1098 }
1099 else
1100 {
1101 if (--i1 < 0)
1102 break;
1103 }
1104 }
1105
1106 id = dest->nelem - 1;
1107 is = dest->nelem + src1->nelem + src2->nelem - 1;
1108 delta = is - sbase + 1;
1109
1110 /* Now copy. When DELTA becomes zero, the remaining
1111 DEST elements are already in place; this is more or
1112 less the same loop that is in re_node_set_merge. */
1113 dest->nelem += delta;
1114 if (delta > 0 && id >= 0)
1115 for (;;)
1116 {
1117 if (dest->elems[is] > dest->elems[id])
1118 {
1119 /* Copy from the top. */
1120 dest->elems[id + delta--] = dest->elems[is--];
1121 if (delta == 0)
1122 break;
1123 }
1124 else
1125 {
1126 /* Slide from the bottom. */
1127 dest->elems[id + delta] = dest->elems[id];
1128 if (--id < 0)
1129 break;
1130 }
1131 }
1132
1133 /* Copy remaining SRC elements. */
1134 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1135
1136 return REG_NOERROR;
1137 }
1138
1139 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1140 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1141
1142 static reg_errcode_t
1143 __attribute_warn_unused_result__
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1144 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1145 const re_node_set *src2)
1146 {
1147 Idx i1, i2, id;
1148 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1149 {
1150 dest->alloc = src1->nelem + src2->nelem;
1151 dest->elems = re_malloc (Idx, dest->alloc);
1152 if (__glibc_unlikely (dest->elems == NULL))
1153 return REG_ESPACE;
1154 }
1155 else
1156 {
1157 if (src1 != NULL && src1->nelem > 0)
1158 return re_node_set_init_copy (dest, src1);
1159 else if (src2 != NULL && src2->nelem > 0)
1160 return re_node_set_init_copy (dest, src2);
1161 else
1162 re_node_set_init_empty (dest);
1163 return REG_NOERROR;
1164 }
1165 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1166 {
1167 if (src1->elems[i1] > src2->elems[i2])
1168 {
1169 dest->elems[id++] = src2->elems[i2++];
1170 continue;
1171 }
1172 if (src1->elems[i1] == src2->elems[i2])
1173 ++i2;
1174 dest->elems[id++] = src1->elems[i1++];
1175 }
1176 if (i1 < src1->nelem)
1177 {
1178 memcpy (dest->elems + id, src1->elems + i1,
1179 (src1->nelem - i1) * sizeof (Idx));
1180 id += src1->nelem - i1;
1181 }
1182 else if (i2 < src2->nelem)
1183 {
1184 memcpy (dest->elems + id, src2->elems + i2,
1185 (src2->nelem - i2) * sizeof (Idx));
1186 id += src2->nelem - i2;
1187 }
1188 dest->nelem = id;
1189 return REG_NOERROR;
1190 }
1191
1192 /* Calculate the union set of the sets DEST and SRC. And store it to
1193 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1194
1195 static reg_errcode_t
1196 __attribute_warn_unused_result__
re_node_set_merge(re_node_set * dest,const re_node_set * src)1197 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1198 {
1199 Idx is, id, sbase, delta;
1200 if (src == NULL || src->nelem == 0)
1201 return REG_NOERROR;
1202 if (dest->alloc < 2 * src->nelem + dest->nelem)
1203 {
1204 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1205 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1206 if (__glibc_unlikely (new_buffer == NULL))
1207 return REG_ESPACE;
1208 dest->elems = new_buffer;
1209 dest->alloc = new_alloc;
1210 }
1211
1212 if (__glibc_unlikely (dest->nelem == 0))
1213 {
1214 dest->nelem = src->nelem;
1215 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1216 return REG_NOERROR;
1217 }
1218
1219 /* Copy into the top of DEST the items of SRC that are not
1220 found in DEST. Maybe we could binary search in DEST? */
1221 for (sbase = dest->nelem + 2 * src->nelem,
1222 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1223 {
1224 if (dest->elems[id] == src->elems[is])
1225 is--, id--;
1226 else if (dest->elems[id] < src->elems[is])
1227 dest->elems[--sbase] = src->elems[is--];
1228 else /* if (dest->elems[id] > src->elems[is]) */
1229 --id;
1230 }
1231
1232 if (is >= 0)
1233 {
1234 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1235 sbase -= is + 1;
1236 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1237 }
1238
1239 id = dest->nelem - 1;
1240 is = dest->nelem + 2 * src->nelem - 1;
1241 delta = is - sbase + 1;
1242 if (delta == 0)
1243 return REG_NOERROR;
1244
1245 /* Now copy. When DELTA becomes zero, the remaining
1246 DEST elements are already in place. */
1247 dest->nelem += delta;
1248 for (;;)
1249 {
1250 if (dest->elems[is] > dest->elems[id])
1251 {
1252 /* Copy from the top. */
1253 dest->elems[id + delta--] = dest->elems[is--];
1254 if (delta == 0)
1255 break;
1256 }
1257 else
1258 {
1259 /* Slide from the bottom. */
1260 dest->elems[id + delta] = dest->elems[id];
1261 if (--id < 0)
1262 {
1263 /* Copy remaining SRC elements. */
1264 memcpy (dest->elems, dest->elems + sbase,
1265 delta * sizeof (Idx));
1266 break;
1267 }
1268 }
1269 }
1270
1271 return REG_NOERROR;
1272 }
1273
1274 /* Insert the new element ELEM to the re_node_set* SET.
1275 SET should not already have ELEM.
1276 Return true if successful. */
1277
1278 static bool
1279 __attribute_warn_unused_result__
re_node_set_insert(re_node_set * set,Idx elem)1280 re_node_set_insert (re_node_set *set, Idx elem)
1281 {
1282 Idx idx;
1283 /* In case the set is empty. */
1284 if (set->alloc == 0)
1285 return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1286
1287 if (__glibc_unlikely (set->nelem) == 0)
1288 {
1289 /* We already guaranteed above that set->alloc != 0. */
1290 set->elems[0] = elem;
1291 ++set->nelem;
1292 return true;
1293 }
1294
1295 /* Realloc if we need. */
1296 if (set->alloc == set->nelem)
1297 {
1298 Idx *new_elems;
1299 set->alloc = set->alloc * 2;
1300 new_elems = re_realloc (set->elems, Idx, set->alloc);
1301 if (__glibc_unlikely (new_elems == NULL))
1302 return false;
1303 set->elems = new_elems;
1304 }
1305
1306 /* Move the elements which follows the new element. Test the
1307 first element separately to skip a check in the inner loop. */
1308 if (elem < set->elems[0])
1309 {
1310 for (idx = set->nelem; idx > 0; idx--)
1311 set->elems[idx] = set->elems[idx - 1];
1312 }
1313 else
1314 {
1315 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1316 set->elems[idx] = set->elems[idx - 1];
1317 DEBUG_ASSERT (set->elems[idx - 1] < elem);
1318 }
1319
1320 /* Insert the new element. */
1321 set->elems[idx] = elem;
1322 ++set->nelem;
1323 return true;
1324 }
1325
1326 /* Insert the new element ELEM to the re_node_set* SET.
1327 SET should not already have any element greater than or equal to ELEM.
1328 Return true if successful. */
1329
1330 static bool
1331 __attribute_warn_unused_result__
re_node_set_insert_last(re_node_set * set,Idx elem)1332 re_node_set_insert_last (re_node_set *set, Idx elem)
1333 {
1334 /* Realloc if we need. */
1335 if (set->alloc == set->nelem)
1336 {
1337 Idx *new_elems;
1338 set->alloc = (set->alloc + 1) * 2;
1339 new_elems = re_realloc (set->elems, Idx, set->alloc);
1340 if (__glibc_unlikely (new_elems == NULL))
1341 return false;
1342 set->elems = new_elems;
1343 }
1344
1345 /* Insert the new element. */
1346 set->elems[set->nelem++] = elem;
1347 return true;
1348 }
1349
1350 /* Compare two node sets SET1 and SET2.
1351 Return true if SET1 and SET2 are equivalent. */
1352
1353 static bool
1354 __attribute__ ((pure))
re_node_set_compare(const re_node_set * set1,const re_node_set * set2)1355 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1356 {
1357 Idx i;
1358 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1359 return false;
1360 for (i = set1->nelem ; --i >= 0 ; )
1361 if (set1->elems[i] != set2->elems[i])
1362 return false;
1363 return true;
1364 }
1365
1366 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1367
1368 static Idx
1369 __attribute__ ((pure))
re_node_set_contains(const re_node_set * set,Idx elem)1370 re_node_set_contains (const re_node_set *set, Idx elem)
1371 {
1372 __re_size_t idx, right, mid;
1373 if (set->nelem <= 0)
1374 return 0;
1375
1376 /* Binary search the element. */
1377 idx = 0;
1378 right = set->nelem - 1;
1379 while (idx < right)
1380 {
1381 mid = (idx + right) / 2;
1382 if (set->elems[mid] < elem)
1383 idx = mid + 1;
1384 else
1385 right = mid;
1386 }
1387 return set->elems[idx] == elem ? idx + 1 : 0;
1388 }
1389
1390 static void
re_node_set_remove_at(re_node_set * set,Idx idx)1391 re_node_set_remove_at (re_node_set *set, Idx idx)
1392 {
1393 if (idx < 0 || idx >= set->nelem)
1394 return;
1395 --set->nelem;
1396 for (; idx < set->nelem; idx++)
1397 set->elems[idx] = set->elems[idx + 1];
1398 }
1399
1400
1401 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1402 Or return -1 if an error occurred. */
1403
1404 static Idx
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1405 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1406 {
1407 if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1408 {
1409 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1410 Idx *new_nexts, *new_indices;
1411 re_node_set *new_edests, *new_eclosures;
1412 re_token_t *new_nodes;
1413
1414 /* Avoid overflows in realloc. */
1415 const size_t max_object_size = MAX (sizeof (re_token_t),
1416 MAX (sizeof (re_node_set),
1417 sizeof (Idx)));
1418 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1419 < new_nodes_alloc))
1420 return -1;
1421
1422 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1423 if (__glibc_unlikely (new_nodes == NULL))
1424 return -1;
1425 dfa->nodes = new_nodes;
1426 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1427 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1428 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1429 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1430 if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1431 || new_edests == NULL || new_eclosures == NULL))
1432 {
1433 re_free (new_nexts);
1434 re_free (new_indices);
1435 re_free (new_edests);
1436 re_free (new_eclosures);
1437 return -1;
1438 }
1439 dfa->nexts = new_nexts;
1440 dfa->org_indices = new_indices;
1441 dfa->edests = new_edests;
1442 dfa->eclosures = new_eclosures;
1443 dfa->nodes_alloc = new_nodes_alloc;
1444 }
1445 dfa->nodes[dfa->nodes_len] = token;
1446 dfa->nodes[dfa->nodes_len].constraint = 0;
1447 #ifdef RE_ENABLE_I18N
1448 dfa->nodes[dfa->nodes_len].accept_mb =
1449 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1450 || token.type == COMPLEX_BRACKET);
1451 #endif
1452 dfa->nexts[dfa->nodes_len] = -1;
1453 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1454 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1455 return dfa->nodes_len++;
1456 }
1457
1458 static re_hashval_t
calc_state_hash(const re_node_set * nodes,unsigned int context)1459 calc_state_hash (const re_node_set *nodes, unsigned int context)
1460 {
1461 re_hashval_t hash = nodes->nelem + context;
1462 Idx i;
1463 for (i = 0 ; i < nodes->nelem ; i++)
1464 hash += nodes->elems[i];
1465 return hash;
1466 }
1467
1468 /* Search for the state whose node_set is equivalent to NODES.
1469 Return the pointer to the state, if we found it in the DFA.
1470 Otherwise create the new one and return it. In case of an error
1471 return NULL and set the error code in ERR.
1472 Note: - We assume NULL as the invalid state, then it is possible that
1473 return value is NULL and ERR is REG_NOERROR.
1474 - We never return non-NULL value in case of any errors, it is for
1475 optimization. */
1476
1477 static re_dfastate_t *
1478 __attribute_warn_unused_result__
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)1479 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1480 const re_node_set *nodes)
1481 {
1482 re_hashval_t hash;
1483 re_dfastate_t *new_state;
1484 struct re_state_table_entry *spot;
1485 Idx i;
1486 #if defined GCC_LINT || defined lint
1487 /* Suppress bogus uninitialized-variable warnings. */
1488 *err = REG_NOERROR;
1489 #endif
1490 if (__glibc_unlikely (nodes->nelem == 0))
1491 {
1492 *err = REG_NOERROR;
1493 return NULL;
1494 }
1495 hash = calc_state_hash (nodes, 0);
1496 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1497
1498 for (i = 0 ; i < spot->num ; i++)
1499 {
1500 re_dfastate_t *state = spot->array[i];
1501 if (hash != state->hash)
1502 continue;
1503 if (re_node_set_compare (&state->nodes, nodes))
1504 return state;
1505 }
1506
1507 /* There are no appropriate state in the dfa, create the new one. */
1508 new_state = create_ci_newstate (dfa, nodes, hash);
1509 if (__glibc_unlikely (new_state == NULL))
1510 *err = REG_ESPACE;
1511
1512 return new_state;
1513 }
1514
1515 /* Search for the state whose node_set is equivalent to NODES and
1516 whose context is equivalent to CONTEXT.
1517 Return the pointer to the state, if we found it in the DFA.
1518 Otherwise create the new one and return it. In case of an error
1519 return NULL and set the error code in ERR.
1520 Note: - We assume NULL as the invalid state, then it is possible that
1521 return value is NULL and ERR is REG_NOERROR.
1522 - We never return non-NULL value in case of any errors, it is for
1523 optimization. */
1524
1525 static re_dfastate_t *
1526 __attribute_warn_unused_result__
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1527 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1528 const re_node_set *nodes, unsigned int context)
1529 {
1530 re_hashval_t hash;
1531 re_dfastate_t *new_state;
1532 struct re_state_table_entry *spot;
1533 Idx i;
1534 #if defined GCC_LINT || defined lint
1535 /* Suppress bogus uninitialized-variable warnings. */
1536 *err = REG_NOERROR;
1537 #endif
1538 if (nodes->nelem == 0)
1539 {
1540 *err = REG_NOERROR;
1541 return NULL;
1542 }
1543 hash = calc_state_hash (nodes, context);
1544 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1545
1546 for (i = 0 ; i < spot->num ; i++)
1547 {
1548 re_dfastate_t *state = spot->array[i];
1549 if (state->hash == hash
1550 && state->context == context
1551 && re_node_set_compare (state->entrance_nodes, nodes))
1552 return state;
1553 }
1554 /* There are no appropriate state in 'dfa', create the new one. */
1555 new_state = create_cd_newstate (dfa, nodes, context, hash);
1556 if (__glibc_unlikely (new_state == NULL))
1557 *err = REG_ESPACE;
1558
1559 return new_state;
1560 }
1561
1562 /* Finish initialization of the new state NEWSTATE, and using its hash value
1563 HASH put in the appropriate bucket of DFA's state table. Return value
1564 indicates the error code if failed. */
1565
1566 static reg_errcode_t
1567 __attribute_warn_unused_result__
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1568 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1569 re_hashval_t hash)
1570 {
1571 struct re_state_table_entry *spot;
1572 reg_errcode_t err;
1573 Idx i;
1574
1575 newstate->hash = hash;
1576 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1577 if (__glibc_unlikely (err != REG_NOERROR))
1578 return REG_ESPACE;
1579 for (i = 0; i < newstate->nodes.nelem; i++)
1580 {
1581 Idx elem = newstate->nodes.elems[i];
1582 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1583 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1584 return REG_ESPACE;
1585 }
1586
1587 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1588 if (__glibc_unlikely (spot->alloc <= spot->num))
1589 {
1590 Idx new_alloc = 2 * spot->num + 2;
1591 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1592 new_alloc);
1593 if (__glibc_unlikely (new_array == NULL))
1594 return REG_ESPACE;
1595 spot->array = new_array;
1596 spot->alloc = new_alloc;
1597 }
1598 spot->array[spot->num++] = newstate;
1599 return REG_NOERROR;
1600 }
1601
1602 static void
free_state(re_dfastate_t * state)1603 free_state (re_dfastate_t *state)
1604 {
1605 re_node_set_free (&state->non_eps_nodes);
1606 re_node_set_free (&state->inveclosure);
1607 if (state->entrance_nodes != &state->nodes)
1608 {
1609 re_node_set_free (state->entrance_nodes);
1610 re_free (state->entrance_nodes);
1611 }
1612 re_node_set_free (&state->nodes);
1613 re_free (state->word_trtable);
1614 re_free (state->trtable);
1615 re_free (state);
1616 }
1617
1618 /* Create the new state which is independent of contexts.
1619 Return the new state if succeeded, otherwise return NULL. */
1620
1621 static re_dfastate_t *
1622 __attribute_warn_unused_result__
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1623 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1624 re_hashval_t hash)
1625 {
1626 Idx i;
1627 reg_errcode_t err;
1628 re_dfastate_t *newstate;
1629
1630 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1631 if (__glibc_unlikely (newstate == NULL))
1632 return NULL;
1633 err = re_node_set_init_copy (&newstate->nodes, nodes);
1634 if (__glibc_unlikely (err != REG_NOERROR))
1635 {
1636 re_free (newstate);
1637 return NULL;
1638 }
1639
1640 newstate->entrance_nodes = &newstate->nodes;
1641 for (i = 0 ; i < nodes->nelem ; i++)
1642 {
1643 re_token_t *node = dfa->nodes + nodes->elems[i];
1644 re_token_type_t type = node->type;
1645 if (type == CHARACTER && !node->constraint)
1646 continue;
1647 #ifdef RE_ENABLE_I18N
1648 newstate->accept_mb |= node->accept_mb;
1649 #endif /* RE_ENABLE_I18N */
1650
1651 /* If the state has the halt node, the state is a halt state. */
1652 if (type == END_OF_RE)
1653 newstate->halt = 1;
1654 else if (type == OP_BACK_REF)
1655 newstate->has_backref = 1;
1656 else if (type == ANCHOR || node->constraint)
1657 newstate->has_constraint = 1;
1658 }
1659 err = register_state (dfa, newstate, hash);
1660 if (__glibc_unlikely (err != REG_NOERROR))
1661 {
1662 free_state (newstate);
1663 newstate = NULL;
1664 }
1665 return newstate;
1666 }
1667
1668 /* Create the new state which is depend on the context CONTEXT.
1669 Return the new state if succeeded, otherwise return NULL. */
1670
1671 static re_dfastate_t *
1672 __attribute_warn_unused_result__
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1673 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1674 unsigned int context, re_hashval_t hash)
1675 {
1676 Idx i, nctx_nodes = 0;
1677 reg_errcode_t err;
1678 re_dfastate_t *newstate;
1679
1680 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1681 if (__glibc_unlikely (newstate == NULL))
1682 return NULL;
1683 err = re_node_set_init_copy (&newstate->nodes, nodes);
1684 if (__glibc_unlikely (err != REG_NOERROR))
1685 {
1686 re_free (newstate);
1687 return NULL;
1688 }
1689
1690 newstate->context = context;
1691 newstate->entrance_nodes = &newstate->nodes;
1692
1693 for (i = 0 ; i < nodes->nelem ; i++)
1694 {
1695 re_token_t *node = dfa->nodes + nodes->elems[i];
1696 re_token_type_t type = node->type;
1697 unsigned int constraint = node->constraint;
1698
1699 if (type == CHARACTER && !constraint)
1700 continue;
1701 #ifdef RE_ENABLE_I18N
1702 newstate->accept_mb |= node->accept_mb;
1703 #endif /* RE_ENABLE_I18N */
1704
1705 /* If the state has the halt node, the state is a halt state. */
1706 if (type == END_OF_RE)
1707 newstate->halt = 1;
1708 else if (type == OP_BACK_REF)
1709 newstate->has_backref = 1;
1710
1711 if (constraint)
1712 {
1713 if (newstate->entrance_nodes == &newstate->nodes)
1714 {
1715 re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1716 if (__glibc_unlikely (entrance_nodes == NULL))
1717 {
1718 free_state (newstate);
1719 return NULL;
1720 }
1721 newstate->entrance_nodes = entrance_nodes;
1722 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1723 != REG_NOERROR)
1724 {
1725 free_state (newstate);
1726 return NULL;
1727 }
1728 nctx_nodes = 0;
1729 newstate->has_constraint = 1;
1730 }
1731
1732 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1733 {
1734 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1735 ++nctx_nodes;
1736 }
1737 }
1738 }
1739 err = register_state (dfa, newstate, hash);
1740 if (__glibc_unlikely (err != REG_NOERROR))
1741 {
1742 free_state (newstate);
1743 newstate = NULL;
1744 }
1745 return newstate;
1746 }
1747