xref: /Universal-ctags/gnulib/regexec.c (revision 820c1a8d46849a90376d8eb15b319ac05439f656)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2021 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <https://www.gnu.org/licenses/>.  */
19 
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 				     Idx n);
22 static void match_ctx_clean (re_match_context_t *mctx);
23 static void match_ctx_free (re_match_context_t *cache);
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 					  Idx str_idx, Idx from, Idx to);
26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 					   Idx str_idx);
29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 						    Idx node, Idx str_idx);
31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 			   re_dfastate_t **limited_sts, Idx last_node,
33 			   Idx last_str_idx);
34 static reg_errcode_t re_search_internal (const regex_t *preg,
35 					 const char *string, Idx length,
36 					 Idx start, Idx last_start, Idx stop,
37 					 size_t nmatch, regmatch_t pmatch[],
38 					 int eflags);
39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 				  const char *string1, Idx length1,
41 				  const char *string2, Idx length2,
42 				  Idx start, regoff_t range,
43 				  struct re_registers *regs,
44 				  Idx stop, bool ret_len);
45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 				const char *string, Idx length, Idx start,
47 				regoff_t range, Idx stop,
48 				struct re_registers *regs,
49 				bool ret_len);
50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51                               Idx nregs, int regs_allocated);
52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 			   Idx *p_match_first);
55 static Idx check_halt_state_context (const re_match_context_t *mctx,
56 				     const re_dfastate_t *state, Idx idx);
57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 			 regmatch_t *prev_idx_match, Idx cur_node,
59 			 Idx cur_idx, Idx nmatch);
60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 				      Idx str_idx, Idx dest_node, Idx nregs,
62 				      regmatch_t *regs, regmatch_t *prevregs,
63 				      re_node_set *eps_via_nodes);
64 static reg_errcode_t set_regs (const regex_t *preg,
65 			       const re_match_context_t *mctx,
66 			       size_t nmatch, regmatch_t *pmatch,
67 			       bool fl_backtrack);
68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69 
70 #ifdef RE_ENABLE_I18N
71 static int sift_states_iter_mb (const re_match_context_t *mctx,
72 				re_sift_context_t *sctx,
73 				Idx node_idx, Idx str_idx, Idx max_str_idx);
74 #endif /* RE_ENABLE_I18N */
75 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
76 					   re_sift_context_t *sctx);
77 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
78 					  re_sift_context_t *sctx, Idx str_idx,
79 					  re_node_set *cur_dest);
80 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
81 					      re_sift_context_t *sctx,
82 					      Idx str_idx,
83 					      re_node_set *dest_nodes);
84 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
85 					    re_node_set *dest_nodes,
86 					    const re_node_set *candidates);
87 static bool check_dst_limits (const re_match_context_t *mctx,
88 			      const re_node_set *limits,
89 			      Idx dst_node, Idx dst_idx, Idx src_node,
90 			      Idx src_idx);
91 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
92 					int boundaries, Idx subexp_idx,
93 					Idx from_node, Idx bkref_idx);
94 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
95 				      Idx limit, Idx subexp_idx,
96 				      Idx node, Idx str_idx,
97 				      Idx bkref_idx);
98 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
99 					  re_node_set *dest_nodes,
100 					  const re_node_set *candidates,
101 					  re_node_set *limits,
102 					  struct re_backref_cache_entry *bkref_ents,
103 					  Idx str_idx);
104 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
105 					re_sift_context_t *sctx,
106 					Idx str_idx, const re_node_set *candidates);
107 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
108 					re_dfastate_t **dst,
109 					re_dfastate_t **src, Idx num);
110 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111 					 re_match_context_t *mctx);
112 static re_dfastate_t *transit_state (reg_errcode_t *err,
113 				     re_match_context_t *mctx,
114 				     re_dfastate_t *state);
115 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
116 					    re_match_context_t *mctx,
117 					    re_dfastate_t *next_state);
118 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
119 						re_node_set *cur_nodes,
120 						Idx str_idx);
121 #if 0
122 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
123 					re_match_context_t *mctx,
124 					re_dfastate_t *pstate);
125 #endif
126 #ifdef RE_ENABLE_I18N
127 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128 				       re_dfastate_t *pstate);
129 #endif /* RE_ENABLE_I18N */
130 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131 					  const re_node_set *nodes);
132 static reg_errcode_t get_subexp (re_match_context_t *mctx,
133 				 Idx bkref_node, Idx bkref_str_idx);
134 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
135 				     const re_sub_match_top_t *sub_top,
136 				     re_sub_match_last_t *sub_last,
137 				     Idx bkref_node, Idx bkref_str);
138 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139 			     Idx subexp_idx, int type);
140 static reg_errcode_t check_arrival (re_match_context_t *mctx,
141 				    state_array_t *path, Idx top_node,
142 				    Idx top_str, Idx last_node, Idx last_str,
143 				    int type);
144 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
145 						   Idx str_idx,
146 						   re_node_set *cur_nodes,
147 						   re_node_set *next_nodes);
148 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
149 					       re_node_set *cur_nodes,
150 					       Idx ex_subexp, int type);
151 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
152 						   re_node_set *dst_nodes,
153 						   Idx target, Idx ex_subexp,
154 						   int type);
155 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
156 					 re_node_set *cur_nodes, Idx cur_str,
157 					 Idx subexp_num, int type);
158 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
159 #ifdef RE_ENABLE_I18N
160 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161 				    const re_string_t *input, Idx idx);
162 # ifdef _LIBC
163 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
164 						   size_t name_len);
165 # endif /* _LIBC */
166 #endif /* RE_ENABLE_I18N */
167 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
168 				       const re_dfastate_t *state,
169 				       re_node_set *states_node,
170 				       bitset_t *states_ch);
171 static bool check_node_accept (const re_match_context_t *mctx,
172 			       const re_token_t *node, Idx idx);
173 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
174 
175 /* Entry point for POSIX code.  */
176 
177 /* regexec searches for a given pattern, specified by PREG, in the
178    string STRING.
179 
180    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
181    'regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
182    least NMATCH elements, and we set them to the offsets of the
183    corresponding matched substrings.
184 
185    EFLAGS specifies "execution flags" which affect matching: if
186    REG_NOTBOL is set, then ^ does not match at the beginning of the
187    string; if REG_NOTEOL is set, then $ does not match at the end.
188 
189    Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
190    EFLAGS is invalid.  */
191 
192 int
regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)193 regexec (const regex_t *__restrict preg, const char *__restrict string,
194 	 size_t nmatch, regmatch_t pmatch[], int eflags)
195 {
196   reg_errcode_t err;
197   Idx start, length;
198   re_dfa_t *dfa = preg->buffer;
199 
200   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
201     return REG_BADPAT;
202 
203   if (eflags & REG_STARTEND)
204     {
205       start = pmatch[0].rm_so;
206       length = pmatch[0].rm_eo;
207     }
208   else
209     {
210       start = 0;
211       length = strlen (string);
212     }
213 
214   lock_lock (dfa->lock);
215   if (preg->no_sub)
216     err = re_search_internal (preg, string, length, start, length,
217 			      length, 0, NULL, eflags);
218   else
219     err = re_search_internal (preg, string, length, start, length,
220 			      length, nmatch, pmatch, eflags);
221   lock_unlock (dfa->lock);
222   return err != REG_NOERROR;
223 }
224 
225 #ifdef _LIBC
226 libc_hidden_def (__regexec)
227 
228 # include <shlib-compat.h>
229 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
230 
231 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
232 __typeof__ (__regexec) __compat_regexec;
233 
234 int
235 attribute_compat_text_section
__compat_regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)236 __compat_regexec (const regex_t *__restrict preg,
237 		  const char *__restrict string, size_t nmatch,
238 		  regmatch_t pmatch[], int eflags)
239 {
240   return regexec (preg, string, nmatch, pmatch,
241 		  eflags & (REG_NOTBOL | REG_NOTEOL));
242 }
243 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
244 # endif
245 #endif
246 
247 /* Entry points for GNU code.  */
248 
249 /* re_match, re_search, re_match_2, re_search_2
250 
251    The former two functions operate on STRING with length LENGTH,
252    while the later two operate on concatenation of STRING1 and STRING2
253    with lengths LENGTH1 and LENGTH2, respectively.
254 
255    re_match() matches the compiled pattern in BUFP against the string,
256    starting at index START.
257 
258    re_search() first tries matching at index START, then it tries to match
259    starting from index START + 1, and so on.  The last start position tried
260    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
261    way as re_match().)
262 
263    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
264    the first STOP characters of the concatenation of the strings should be
265    concerned.
266 
267    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
268    and all groups is stored in REGS.  (For the "_2" variants, the offsets are
269    computed relative to the concatenation, not relative to the individual
270    strings.)
271 
272    On success, re_match* functions return the length of the match, re_search*
273    return the position of the start of the match.  They return -1 on
274    match failure, -2 on error.  */
275 
276 regoff_t
re_match(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,struct re_registers * regs)277 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
278 	  Idx start, struct re_registers *regs)
279 {
280   return re_search_stub (bufp, string, length, start, 0, length, regs, true);
281 }
282 #ifdef _LIBC
weak_alias(__re_match,re_match)283 weak_alias (__re_match, re_match)
284 #endif
285 
286 regoff_t
287 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
288 	   Idx start, regoff_t range, struct re_registers *regs)
289 {
290   return re_search_stub (bufp, string, length, start, range, length, regs,
291 			 false);
292 }
293 #ifdef _LIBC
weak_alias(__re_search,re_search)294 weak_alias (__re_search, re_search)
295 #endif
296 
297 regoff_t
298 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
299 	    const char *string2, Idx length2, Idx start,
300 	    struct re_registers *regs, Idx stop)
301 {
302   return re_search_2_stub (bufp, string1, length1, string2, length2,
303 			   start, 0, regs, stop, true);
304 }
305 #ifdef _LIBC
weak_alias(__re_match_2,re_match_2)306 weak_alias (__re_match_2, re_match_2)
307 #endif
308 
309 regoff_t
310 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
311 	     const char *string2, Idx length2, Idx start, regoff_t range,
312 	     struct re_registers *regs, Idx stop)
313 {
314   return re_search_2_stub (bufp, string1, length1, string2, length2,
315 			   start, range, regs, stop, false);
316 }
317 #ifdef _LIBC
weak_alias(__re_search_2,re_search_2)318 weak_alias (__re_search_2, re_search_2)
319 #endif
320 
321 static regoff_t
322 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
323 		  Idx length1, const char *string2, Idx length2, Idx start,
324 		  regoff_t range, struct re_registers *regs,
325 		  Idx stop, bool ret_len)
326 {
327   const char *str;
328   regoff_t rval;
329   Idx len;
330   char *s = NULL;
331 
332   if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
333 			 || INT_ADD_WRAPV (length1, length2, &len))))
334     return -2;
335 
336   /* Concatenate the strings.  */
337   if (length2 > 0)
338     if (length1 > 0)
339       {
340 	s = re_malloc (char, len);
341 
342 	if (__glibc_unlikely (s == NULL))
343 	  return -2;
344 #ifdef _LIBC
345 	memcpy (__mempcpy (s, string1, length1), string2, length2);
346 #else
347 	memcpy (s, string1, length1);
348 	memcpy (s + length1, string2, length2);
349 #endif
350 	str = s;
351       }
352     else
353       str = string2;
354   else
355     str = string1;
356 
357   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
358 			 ret_len);
359   re_free (s);
360   return rval;
361 }
362 
363 /* The parameters have the same meaning as those of re_search.
364    Additional parameters:
365    If RET_LEN is true the length of the match is returned (re_match style);
366    otherwise the position of the match is returned.  */
367 
368 static regoff_t
re_search_stub(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,regoff_t range,Idx stop,struct re_registers * regs,bool ret_len)369 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
370 		Idx start, regoff_t range, Idx stop, struct re_registers *regs,
371 		bool ret_len)
372 {
373   reg_errcode_t result;
374   regmatch_t *pmatch;
375   Idx nregs;
376   regoff_t rval;
377   int eflags = 0;
378   re_dfa_t *dfa = bufp->buffer;
379   Idx last_start = start + range;
380 
381   /* Check for out-of-range.  */
382   if (__glibc_unlikely (start < 0 || start > length))
383     return -1;
384   if (__glibc_unlikely (length < last_start
385 			|| (0 <= range && last_start < start)))
386     last_start = length;
387   else if (__glibc_unlikely (last_start < 0
388 			     || (range < 0 && start <= last_start)))
389     last_start = 0;
390 
391   lock_lock (dfa->lock);
392 
393   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
394   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
395 
396   /* Compile fastmap if we haven't yet.  */
397   if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
398     re_compile_fastmap (bufp);
399 
400   if (__glibc_unlikely (bufp->no_sub))
401     regs = NULL;
402 
403   /* We need at least 1 register.  */
404   if (regs == NULL)
405     nregs = 1;
406   else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
407 			     && regs->num_regs <= bufp->re_nsub))
408     {
409       nregs = regs->num_regs;
410       if (__glibc_unlikely (nregs < 1))
411 	{
412 	  /* Nothing can be copied to regs.  */
413 	  regs = NULL;
414 	  nregs = 1;
415 	}
416     }
417   else
418     nregs = bufp->re_nsub + 1;
419   pmatch = re_malloc (regmatch_t, nregs);
420   if (__glibc_unlikely (pmatch == NULL))
421     {
422       rval = -2;
423       goto out;
424     }
425 
426   result = re_search_internal (bufp, string, length, start, last_start, stop,
427 			       nregs, pmatch, eflags);
428 
429   rval = 0;
430 
431   /* I hope we needn't fill their regs with -1's when no match was found.  */
432   if (result != REG_NOERROR)
433     rval = result == REG_NOMATCH ? -1 : -2;
434   else if (regs != NULL)
435     {
436       /* If caller wants register contents data back, copy them.  */
437       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
438 					   bufp->regs_allocated);
439       if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
440 	rval = -2;
441     }
442 
443   if (__glibc_likely (rval == 0))
444     {
445       if (ret_len)
446 	{
447 	  DEBUG_ASSERT (pmatch[0].rm_so == start);
448 	  rval = pmatch[0].rm_eo - start;
449 	}
450       else
451 	rval = pmatch[0].rm_so;
452     }
453   re_free (pmatch);
454  out:
455   lock_unlock (dfa->lock);
456   return rval;
457 }
458 
459 static unsigned
re_copy_regs(struct re_registers * regs,regmatch_t * pmatch,Idx nregs,int regs_allocated)460 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
461 	      int regs_allocated)
462 {
463   int rval = REGS_REALLOCATE;
464   Idx i;
465   Idx need_regs = nregs + 1;
466   /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
467      uses.  */
468 
469   /* Have the register data arrays been allocated?  */
470   if (regs_allocated == REGS_UNALLOCATED)
471     { /* No.  So allocate them with malloc.  */
472       regs->start = re_malloc (regoff_t, need_regs);
473       if (__glibc_unlikely (regs->start == NULL))
474 	return REGS_UNALLOCATED;
475       regs->end = re_malloc (regoff_t, need_regs);
476       if (__glibc_unlikely (regs->end == NULL))
477 	{
478 	  re_free (regs->start);
479 	  return REGS_UNALLOCATED;
480 	}
481       regs->num_regs = need_regs;
482     }
483   else if (regs_allocated == REGS_REALLOCATE)
484     { /* Yes.  If we need more elements than were already
485 	 allocated, reallocate them.  If we need fewer, just
486 	 leave it alone.  */
487       if (__glibc_unlikely (need_regs > regs->num_regs))
488 	{
489 	  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
490 	  regoff_t *new_end;
491 	  if (__glibc_unlikely (new_start == NULL))
492 	    return REGS_UNALLOCATED;
493 	  new_end = re_realloc (regs->end, regoff_t, need_regs);
494 	  if (__glibc_unlikely (new_end == NULL))
495 	    {
496 	      re_free (new_start);
497 	      return REGS_UNALLOCATED;
498 	    }
499 	  regs->start = new_start;
500 	  regs->end = new_end;
501 	  regs->num_regs = need_regs;
502 	}
503     }
504   else
505     {
506       DEBUG_ASSERT (regs_allocated == REGS_FIXED);
507       /* This function may not be called with REGS_FIXED and nregs too big.  */
508       DEBUG_ASSERT (nregs <= regs->num_regs);
509       rval = REGS_FIXED;
510     }
511 
512   /* Copy the regs.  */
513   for (i = 0; i < nregs; ++i)
514     {
515       regs->start[i] = pmatch[i].rm_so;
516       regs->end[i] = pmatch[i].rm_eo;
517     }
518   for ( ; i < regs->num_regs; ++i)
519     regs->start[i] = regs->end[i] = -1;
520 
521   return rval;
522 }
523 
524 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
525    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
526    this memory for recording register information.  STARTS and ENDS
527    must be allocated using the malloc library routine, and must each
528    be at least NUM_REGS * sizeof (regoff_t) bytes long.
529 
530    If NUM_REGS == 0, then subsequent matches should allocate their own
531    register data.
532 
533    Unless this function is called, the first search or match using
534    PATTERN_BUFFER will allocate its own register data, without
535    freeing the old data.  */
536 
537 void
re_set_registers(struct re_pattern_buffer * bufp,struct re_registers * regs,__re_size_t num_regs,regoff_t * starts,regoff_t * ends)538 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
539 		  __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
540 {
541   if (num_regs)
542     {
543       bufp->regs_allocated = REGS_REALLOCATE;
544       regs->num_regs = num_regs;
545       regs->start = starts;
546       regs->end = ends;
547     }
548   else
549     {
550       bufp->regs_allocated = REGS_UNALLOCATED;
551       regs->num_regs = 0;
552       regs->start = regs->end = NULL;
553     }
554 }
555 #ifdef _LIBC
weak_alias(__re_set_registers,re_set_registers)556 weak_alias (__re_set_registers, re_set_registers)
557 #endif
558 
559 /* Entry points compatible with 4.2 BSD regex library.  We don't define
560    them unless specifically requested.  */
561 
562 #if defined _REGEX_RE_COMP || defined _LIBC
563 int
564 # ifdef _LIBC
565 weak_function
566 # endif
567 re_exec (const char *s)
568 {
569   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
570 }
571 #endif /* _REGEX_RE_COMP */
572 
573 /* Internal entry point.  */
574 
575 /* Searches for a compiled pattern PREG in the string STRING, whose
576    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
577    meaning as with regexec.  LAST_START is START + RANGE, where
578    START and RANGE have the same meaning as with re_search.
579    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
580    otherwise return the error code.
581    Note: We assume front end functions already check ranges.
582    (0 <= LAST_START && LAST_START <= LENGTH)  */
583 
584 static reg_errcode_t
585 __attribute_warn_unused_result__
re_search_internal(const regex_t * preg,const char * string,Idx length,Idx start,Idx last_start,Idx stop,size_t nmatch,regmatch_t pmatch[],int eflags)586 re_search_internal (const regex_t *preg, const char *string, Idx length,
587 		    Idx start, Idx last_start, Idx stop, size_t nmatch,
588 		    regmatch_t pmatch[], int eflags)
589 {
590   reg_errcode_t err;
591   const re_dfa_t *dfa = preg->buffer;
592   Idx left_lim, right_lim;
593   int incr;
594   bool fl_longest_match;
595   int match_kind;
596   Idx match_first;
597   Idx match_last = -1;
598   Idx extra_nmatch;
599   bool sb;
600   int ch;
601   re_match_context_t mctx = { .dfa = dfa };
602   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
603 		    && start != last_start && !preg->can_be_null)
604 		   ? preg->fastmap : NULL);
605   RE_TRANSLATE_TYPE t = preg->translate;
606 
607   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
608   nmatch -= extra_nmatch;
609 
610   /* Check if the DFA haven't been compiled.  */
611   if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
612 			|| dfa->init_state_word == NULL
613 			|| dfa->init_state_nl == NULL
614 			|| dfa->init_state_begbuf == NULL))
615     return REG_NOMATCH;
616 
617   /* We assume front-end functions already check them.  */
618   DEBUG_ASSERT (0 <= last_start && last_start <= length);
619 
620   /* If initial states with non-begbuf contexts have no elements,
621      the regex must be anchored.  If preg->newline_anchor is set,
622      we'll never use init_state_nl, so do not check it.  */
623   if (dfa->init_state->nodes.nelem == 0
624       && dfa->init_state_word->nodes.nelem == 0
625       && (dfa->init_state_nl->nodes.nelem == 0
626 	  || !preg->newline_anchor))
627     {
628       if (start != 0 && last_start != 0)
629         return REG_NOMATCH;
630       start = last_start = 0;
631     }
632 
633   /* We must check the longest matching, if nmatch > 0.  */
634   fl_longest_match = (nmatch != 0 || dfa->nbackref);
635 
636   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
637 			    preg->translate, (preg->syntax & RE_ICASE) != 0,
638 			    dfa);
639   if (__glibc_unlikely (err != REG_NOERROR))
640     goto free_return;
641   mctx.input.stop = stop;
642   mctx.input.raw_stop = stop;
643   mctx.input.newline_anchor = preg->newline_anchor;
644 
645   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
646   if (__glibc_unlikely (err != REG_NOERROR))
647     goto free_return;
648 
649   /* We will log all the DFA states through which the dfa pass,
650      if nmatch > 1, or this dfa has "multibyte node", which is a
651      back-reference or a node which can accept multibyte character or
652      multi character collating element.  */
653   if (nmatch > 1 || dfa->has_mb_node)
654     {
655       /* Avoid overflow.  */
656       if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
657 			     <= mctx.input.bufs_len)))
658 	{
659 	  err = REG_ESPACE;
660 	  goto free_return;
661 	}
662 
663       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
664       if (__glibc_unlikely (mctx.state_log == NULL))
665 	{
666 	  err = REG_ESPACE;
667 	  goto free_return;
668 	}
669     }
670 
671   match_first = start;
672   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
673 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
674 
675   /* Check incrementally whether the input string matches.  */
676   incr = (last_start < start) ? -1 : 1;
677   left_lim = (last_start < start) ? last_start : start;
678   right_lim = (last_start < start) ? start : last_start;
679   sb = dfa->mb_cur_max == 1;
680   match_kind =
681     (fastmap
682      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
683 	| (start <= last_start ? 2 : 0)
684 	| (t != NULL ? 1 : 0))
685      : 8);
686 
687   for (;; match_first += incr)
688     {
689       err = REG_NOMATCH;
690       if (match_first < left_lim || right_lim < match_first)
691 	goto free_return;
692 
693       /* Advance as rapidly as possible through the string, until we
694 	 find a plausible place to start matching.  This may be done
695 	 with varying efficiency, so there are various possibilities:
696 	 only the most common of them are specialized, in order to
697 	 save on code size.  We use a switch statement for speed.  */
698       switch (match_kind)
699 	{
700 	case 8:
701 	  /* No fastmap.  */
702 	  break;
703 
704 	case 7:
705 	  /* Fastmap with single-byte translation, match forward.  */
706 	  while (__glibc_likely (match_first < right_lim)
707 		 && !fastmap[t[(unsigned char) string[match_first]]])
708 	    ++match_first;
709 	  goto forward_match_found_start_or_reached_end;
710 
711 	case 6:
712 	  /* Fastmap without translation, match forward.  */
713 	  while (__glibc_likely (match_first < right_lim)
714 		 && !fastmap[(unsigned char) string[match_first]])
715 	    ++match_first;
716 
717 	forward_match_found_start_or_reached_end:
718 	  if (__glibc_unlikely (match_first == right_lim))
719 	    {
720 	      ch = match_first >= length
721 		       ? 0 : (unsigned char) string[match_first];
722 	      if (!fastmap[t ? t[ch] : ch])
723 		goto free_return;
724 	    }
725 	  break;
726 
727 	case 4:
728 	case 5:
729 	  /* Fastmap without multi-byte translation, match backwards.  */
730 	  while (match_first >= left_lim)
731 	    {
732 	      ch = match_first >= length
733 		       ? 0 : (unsigned char) string[match_first];
734 	      if (fastmap[t ? t[ch] : ch])
735 		break;
736 	      --match_first;
737 	    }
738 	  if (match_first < left_lim)
739 	    goto free_return;
740 	  break;
741 
742 	default:
743 	  /* In this case, we can't determine easily the current byte,
744 	     since it might be a component byte of a multibyte
745 	     character.  Then we use the constructed buffer instead.  */
746 	  for (;;)
747 	    {
748 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
749 		 buffers.  */
750 	      __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
751 	      if (__glibc_unlikely (offset
752 				    >= (__re_size_t) mctx.input.valid_raw_len))
753 		{
754 		  err = re_string_reconstruct (&mctx.input, match_first,
755 					       eflags);
756 		  if (__glibc_unlikely (err != REG_NOERROR))
757 		    goto free_return;
758 
759 		  offset = match_first - mctx.input.raw_mbs_idx;
760 		}
761 	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
762 		 Note that MATCH_FIRST must not be smaller than 0.  */
763 	      ch = (match_first >= length
764 		    ? 0 : re_string_byte_at (&mctx.input, offset));
765 	      if (fastmap[ch])
766 		break;
767 	      match_first += incr;
768 	      if (match_first < left_lim || match_first > right_lim)
769 		{
770 		  err = REG_NOMATCH;
771 		  goto free_return;
772 		}
773 	    }
774 	  break;
775 	}
776 
777       /* Reconstruct the buffers so that the matcher can assume that
778 	 the matching starts from the beginning of the buffer.  */
779       err = re_string_reconstruct (&mctx.input, match_first, eflags);
780       if (__glibc_unlikely (err != REG_NOERROR))
781 	goto free_return;
782 
783 #ifdef RE_ENABLE_I18N
784      /* Don't consider this char as a possible match start if it part,
785 	yet isn't the head, of a multibyte character.  */
786       if (!sb && !re_string_first_byte (&mctx.input, 0))
787 	continue;
788 #endif
789 
790       /* It seems to be appropriate one, then use the matcher.  */
791       /* We assume that the matching starts from 0.  */
792       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
793       match_last = check_matching (&mctx, fl_longest_match,
794 				   start <= last_start ? &match_first : NULL);
795       if (match_last != -1)
796 	{
797 	  if (__glibc_unlikely (match_last == -2))
798 	    {
799 	      err = REG_ESPACE;
800 	      goto free_return;
801 	    }
802 	  else
803 	    {
804 	      mctx.match_last = match_last;
805 	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
806 		{
807 		  re_dfastate_t *pstate = mctx.state_log[match_last];
808 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
809 							     match_last);
810 		}
811 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
812 		  || dfa->nbackref)
813 		{
814 		  err = prune_impossible_nodes (&mctx);
815 		  if (err == REG_NOERROR)
816 		    break;
817 		  if (__glibc_unlikely (err != REG_NOMATCH))
818 		    goto free_return;
819 		  match_last = -1;
820 		}
821 	      else
822 		break; /* We found a match.  */
823 	    }
824 	}
825 
826       match_ctx_clean (&mctx);
827     }
828 
829   DEBUG_ASSERT (match_last != -1);
830   DEBUG_ASSERT (err == REG_NOERROR);
831 
832   /* Set pmatch[] if we need.  */
833   if (nmatch > 0)
834     {
835       Idx reg_idx;
836 
837       /* Initialize registers.  */
838       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
839 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
840 
841       /* Set the points where matching start/end.  */
842       pmatch[0].rm_so = 0;
843       pmatch[0].rm_eo = mctx.match_last;
844       /* FIXME: This function should fail if mctx.match_last exceeds
845 	 the maximum possible regoff_t value.  We need a new error
846 	 code REG_OVERFLOW.  */
847 
848       if (!preg->no_sub && nmatch > 1)
849 	{
850 	  err = set_regs (preg, &mctx, nmatch, pmatch,
851 			  dfa->has_plural_match && dfa->nbackref > 0);
852 	  if (__glibc_unlikely (err != REG_NOERROR))
853 	    goto free_return;
854 	}
855 
856       /* At last, add the offset to each register, since we slid
857 	 the buffers so that we could assume that the matching starts
858 	 from 0.  */
859       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
860 	if (pmatch[reg_idx].rm_so != -1)
861 	  {
862 #ifdef RE_ENABLE_I18N
863 	    if (__glibc_unlikely (mctx.input.offsets_needed != 0))
864 	      {
865 		pmatch[reg_idx].rm_so =
866 		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
867 		   ? mctx.input.valid_raw_len
868 		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
869 		pmatch[reg_idx].rm_eo =
870 		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
871 		   ? mctx.input.valid_raw_len
872 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
873 	      }
874 #else
875 	    DEBUG_ASSERT (mctx.input.offsets_needed == 0);
876 #endif
877 	    pmatch[reg_idx].rm_so += match_first;
878 	    pmatch[reg_idx].rm_eo += match_first;
879 	  }
880       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
881 	{
882 	  pmatch[nmatch + reg_idx].rm_so = -1;
883 	  pmatch[nmatch + reg_idx].rm_eo = -1;
884 	}
885 
886       if (dfa->subexp_map)
887 	for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
888 	  if (dfa->subexp_map[reg_idx] != reg_idx)
889 	    {
890 	      pmatch[reg_idx + 1].rm_so
891 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
892 	      pmatch[reg_idx + 1].rm_eo
893 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
894 	    }
895     }
896 
897  free_return:
898   re_free (mctx.state_log);
899   if (dfa->nbackref)
900     match_ctx_free (&mctx);
901   re_string_destruct (&mctx.input);
902   return err;
903 }
904 
905 static reg_errcode_t
906 __attribute_warn_unused_result__
prune_impossible_nodes(re_match_context_t * mctx)907 prune_impossible_nodes (re_match_context_t *mctx)
908 {
909   const re_dfa_t *const dfa = mctx->dfa;
910   Idx halt_node, match_last;
911   reg_errcode_t ret;
912   re_dfastate_t **sifted_states;
913   re_dfastate_t **lim_states = NULL;
914   re_sift_context_t sctx;
915   DEBUG_ASSERT (mctx->state_log != NULL);
916   match_last = mctx->match_last;
917   halt_node = mctx->last_node;
918 
919   /* Avoid overflow.  */
920   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
921 			<= match_last))
922     return REG_ESPACE;
923 
924   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
925   if (__glibc_unlikely (sifted_states == NULL))
926     {
927       ret = REG_ESPACE;
928       goto free_return;
929     }
930   if (dfa->nbackref)
931     {
932       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
933       if (__glibc_unlikely (lim_states == NULL))
934 	{
935 	  ret = REG_ESPACE;
936 	  goto free_return;
937 	}
938       while (1)
939 	{
940 	  memset (lim_states, '\0',
941 		  sizeof (re_dfastate_t *) * (match_last + 1));
942 	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
943 			 match_last);
944 	  ret = sift_states_backward (mctx, &sctx);
945 	  re_node_set_free (&sctx.limits);
946 	  if (__glibc_unlikely (ret != REG_NOERROR))
947 	      goto free_return;
948 	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
949 	    break;
950 	  do
951 	    {
952 	      --match_last;
953 	      if (match_last < 0)
954 		{
955 		  ret = REG_NOMATCH;
956 		  goto free_return;
957 		}
958 	    } while (mctx->state_log[match_last] == NULL
959 		     || !mctx->state_log[match_last]->halt);
960 	  halt_node = check_halt_state_context (mctx,
961 						mctx->state_log[match_last],
962 						match_last);
963 	}
964       ret = merge_state_array (dfa, sifted_states, lim_states,
965 			       match_last + 1);
966       re_free (lim_states);
967       lim_states = NULL;
968       if (__glibc_unlikely (ret != REG_NOERROR))
969 	goto free_return;
970     }
971   else
972     {
973       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
974       ret = sift_states_backward (mctx, &sctx);
975       re_node_set_free (&sctx.limits);
976       if (__glibc_unlikely (ret != REG_NOERROR))
977 	goto free_return;
978       if (sifted_states[0] == NULL)
979 	{
980 	  ret = REG_NOMATCH;
981 	  goto free_return;
982 	}
983     }
984   re_free (mctx->state_log);
985   mctx->state_log = sifted_states;
986   sifted_states = NULL;
987   mctx->last_node = halt_node;
988   mctx->match_last = match_last;
989   ret = REG_NOERROR;
990  free_return:
991   re_free (sifted_states);
992   re_free (lim_states);
993   return ret;
994 }
995 
996 /* Acquire an initial state and return it.
997    We must select appropriate initial state depending on the context,
998    since initial states may have constraints like "\<", "^", etc..  */
999 
1000 static inline re_dfastate_t *
1001 __attribute__ ((always_inline))
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,Idx idx)1002 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1003 			    Idx idx)
1004 {
1005   const re_dfa_t *const dfa = mctx->dfa;
1006   if (dfa->init_state->has_constraint)
1007     {
1008       unsigned int context;
1009       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1010       if (IS_WORD_CONTEXT (context))
1011 	return dfa->init_state_word;
1012       else if (IS_ORDINARY_CONTEXT (context))
1013 	return dfa->init_state;
1014       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1015 	return dfa->init_state_begbuf;
1016       else if (IS_NEWLINE_CONTEXT (context))
1017 	return dfa->init_state_nl;
1018       else if (IS_BEGBUF_CONTEXT (context))
1019 	{
1020 	  /* It is relatively rare case, then calculate on demand.  */
1021 	  return re_acquire_state_context (err, dfa,
1022 					   dfa->init_state->entrance_nodes,
1023 					   context);
1024 	}
1025       else
1026 	/* Must not happen?  */
1027 	return dfa->init_state;
1028     }
1029   else
1030     return dfa->init_state;
1031 }
1032 
1033 /* Check whether the regular expression match input string INPUT or not,
1034    and return the index where the matching end.  Return -1 if
1035    there is no match, and return -2 in case of an error.
1036    FL_LONGEST_MATCH means we want the POSIX longest matching.
1037    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1038    next place where we may want to try matching.
1039    Note that the matcher assumes that the matching starts from the current
1040    index of the buffer.  */
1041 
1042 static Idx
1043 __attribute_warn_unused_result__
check_matching(re_match_context_t * mctx,bool fl_longest_match,Idx * p_match_first)1044 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1045 		Idx *p_match_first)
1046 {
1047   const re_dfa_t *const dfa = mctx->dfa;
1048   reg_errcode_t err;
1049   Idx match = 0;
1050   Idx match_last = -1;
1051   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1052   re_dfastate_t *cur_state;
1053   bool at_init_state = p_match_first != NULL;
1054   Idx next_start_idx = cur_str_idx;
1055 
1056   err = REG_NOERROR;
1057   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1058   /* An initial state must not be NULL (invalid).  */
1059   if (__glibc_unlikely (cur_state == NULL))
1060     {
1061       DEBUG_ASSERT (err == REG_ESPACE);
1062       return -2;
1063     }
1064 
1065   if (mctx->state_log != NULL)
1066     {
1067       mctx->state_log[cur_str_idx] = cur_state;
1068 
1069       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1070 	 later.  E.g. Processing back references.  */
1071       if (__glibc_unlikely (dfa->nbackref))
1072 	{
1073 	  at_init_state = false;
1074 	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1075 	  if (__glibc_unlikely (err != REG_NOERROR))
1076 	    return err;
1077 
1078 	  if (cur_state->has_backref)
1079 	    {
1080 	      err = transit_state_bkref (mctx, &cur_state->nodes);
1081 	      if (__glibc_unlikely (err != REG_NOERROR))
1082 		return err;
1083 	    }
1084 	}
1085     }
1086 
1087   /* If the RE accepts NULL string.  */
1088   if (__glibc_unlikely (cur_state->halt))
1089     {
1090       if (!cur_state->has_constraint
1091 	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
1092 	{
1093 	  if (!fl_longest_match)
1094 	    return cur_str_idx;
1095 	  else
1096 	    {
1097 	      match_last = cur_str_idx;
1098 	      match = 1;
1099 	    }
1100 	}
1101     }
1102 
1103   while (!re_string_eoi (&mctx->input))
1104     {
1105       re_dfastate_t *old_state = cur_state;
1106       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1107 
1108       if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1109 	   && mctx->input.bufs_len < mctx->input.len)
1110 	  || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1111 	      && mctx->input.valid_len < mctx->input.len))
1112 	{
1113 	  err = extend_buffers (mctx, next_char_idx + 1);
1114 	  if (__glibc_unlikely (err != REG_NOERROR))
1115 	    {
1116 	      DEBUG_ASSERT (err == REG_ESPACE);
1117 	      return -2;
1118 	    }
1119 	}
1120 
1121       cur_state = transit_state (&err, mctx, cur_state);
1122       if (mctx->state_log != NULL)
1123 	cur_state = merge_state_with_log (&err, mctx, cur_state);
1124 
1125       if (cur_state == NULL)
1126 	{
1127 	  /* Reached the invalid state or an error.  Try to recover a valid
1128 	     state using the state log, if available and if we have not
1129 	     already found a valid (even if not the longest) match.  */
1130 	  if (__glibc_unlikely (err != REG_NOERROR))
1131 	    return -2;
1132 
1133 	  if (mctx->state_log == NULL
1134 	      || (match && !fl_longest_match)
1135 	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
1136 	    break;
1137 	}
1138 
1139       if (__glibc_unlikely (at_init_state))
1140 	{
1141 	  if (old_state == cur_state)
1142 	    next_start_idx = next_char_idx;
1143 	  else
1144 	    at_init_state = false;
1145 	}
1146 
1147       if (cur_state->halt)
1148 	{
1149 	  /* Reached a halt state.
1150 	     Check the halt state can satisfy the current context.  */
1151 	  if (!cur_state->has_constraint
1152 	      || check_halt_state_context (mctx, cur_state,
1153 					   re_string_cur_idx (&mctx->input)))
1154 	    {
1155 	      /* We found an appropriate halt state.  */
1156 	      match_last = re_string_cur_idx (&mctx->input);
1157 	      match = 1;
1158 
1159 	      /* We found a match, do not modify match_first below.  */
1160 	      p_match_first = NULL;
1161 	      if (!fl_longest_match)
1162 		break;
1163 	    }
1164 	}
1165     }
1166 
1167   if (p_match_first)
1168     *p_match_first += next_start_idx;
1169 
1170   return match_last;
1171 }
1172 
1173 /* Check NODE match the current context.  */
1174 
1175 static bool
check_halt_node_context(const re_dfa_t * dfa,Idx node,unsigned int context)1176 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1177 {
1178   re_token_type_t type = dfa->nodes[node].type;
1179   unsigned int constraint = dfa->nodes[node].constraint;
1180   if (type != END_OF_RE)
1181     return false;
1182   if (!constraint)
1183     return true;
1184   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1185     return false;
1186   return true;
1187 }
1188 
1189 /* Check the halt state STATE match the current context.
1190    Return 0 if not match, if the node, STATE has, is a halt node and
1191    match the context, return the node.  */
1192 
1193 static Idx
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,Idx idx)1194 check_halt_state_context (const re_match_context_t *mctx,
1195 			  const re_dfastate_t *state, Idx idx)
1196 {
1197   Idx i;
1198   unsigned int context;
1199   DEBUG_ASSERT (state->halt);
1200   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1201   for (i = 0; i < state->nodes.nelem; ++i)
1202     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1203       return state->nodes.elems[i];
1204   return 0;
1205 }
1206 
1207 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1208    corresponding to the DFA).
1209    Return the destination node, and update EPS_VIA_NODES;
1210    return -1 on match failure, -2 on error.  */
1211 
1212 static Idx
proceed_next_node(const re_match_context_t * mctx,Idx nregs,regmatch_t * regs,regmatch_t * prevregs,Idx * pidx,Idx node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)1213 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1214 		   regmatch_t *prevregs,
1215 		   Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1216 		   struct re_fail_stack_t *fs)
1217 {
1218   const re_dfa_t *const dfa = mctx->dfa;
1219   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1220     {
1221       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1222       re_node_set *edests = &dfa->edests[node];
1223 
1224       if (! re_node_set_contains (eps_via_nodes, node))
1225         {
1226           bool ok = re_node_set_insert (eps_via_nodes, node);
1227           if (__glibc_unlikely (! ok))
1228             return -2;
1229         }
1230 
1231       /* Pick a valid destination, or return -1 if none is found.  */
1232       Idx dest_node = -1;
1233       for (Idx i = 0; i < edests->nelem; i++)
1234 	{
1235 	  Idx candidate = edests->elems[i];
1236 	  if (!re_node_set_contains (cur_nodes, candidate))
1237 	    continue;
1238           if (dest_node == -1)
1239 	    dest_node = candidate;
1240 
1241 	  else
1242 	    {
1243 	      /* In order to avoid infinite loop like "(a*)*", return the second
1244 		 epsilon-transition if the first was already considered.  */
1245 	      if (re_node_set_contains (eps_via_nodes, dest_node))
1246 		return candidate;
1247 
1248 	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
1249 	      else if (fs != NULL
1250 		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1251 					   prevregs, eps_via_nodes))
1252 		return -2;
1253 
1254 	      /* We know we are going to exit.  */
1255 	      break;
1256 	    }
1257 	}
1258       return dest_node;
1259     }
1260   else
1261     {
1262       Idx naccepted = 0;
1263       re_token_type_t type = dfa->nodes[node].type;
1264 
1265 #ifdef RE_ENABLE_I18N
1266       if (dfa->nodes[node].accept_mb)
1267 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1268       else
1269 #endif /* RE_ENABLE_I18N */
1270       if (type == OP_BACK_REF)
1271 	{
1272 	  Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1273 	  if (subexp_idx < nregs)
1274 	    naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1275 	  if (fs != NULL)
1276 	    {
1277 	      if (subexp_idx >= nregs
1278 		  || regs[subexp_idx].rm_so == -1
1279 		  || regs[subexp_idx].rm_eo == -1)
1280 		return -1;
1281 	      else if (naccepted)
1282 		{
1283 		  char *buf = (char *) re_string_get_buffer (&mctx->input);
1284 		  if (mctx->input.valid_len - *pidx < naccepted
1285 		      || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1286 				  naccepted)
1287 			  != 0))
1288 		    return -1;
1289 		}
1290 	    }
1291 
1292 	  if (naccepted == 0)
1293 	    {
1294 	      Idx dest_node;
1295 	      bool ok = re_node_set_insert (eps_via_nodes, node);
1296 	      if (__glibc_unlikely (! ok))
1297 		return -2;
1298 	      dest_node = dfa->edests[node].elems[0];
1299 	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1300 					dest_node))
1301 		return dest_node;
1302 	    }
1303 	}
1304 
1305       if (naccepted != 0
1306 	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
1307 	{
1308 	  Idx dest_node = dfa->nexts[node];
1309 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1310 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1311 		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1312 					       dest_node)))
1313 	    return -1;
1314 	  re_node_set_empty (eps_via_nodes);
1315 	  return dest_node;
1316 	}
1317     }
1318   return -1;
1319 }
1320 
1321 static reg_errcode_t
1322 __attribute_warn_unused_result__
push_fail_stack(struct re_fail_stack_t * fs,Idx str_idx,Idx dest_node,Idx nregs,regmatch_t * regs,regmatch_t * prevregs,re_node_set * eps_via_nodes)1323 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1324 		 Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
1325 		 re_node_set *eps_via_nodes)
1326 {
1327   reg_errcode_t err;
1328   Idx num = fs->num++;
1329   if (fs->num == fs->alloc)
1330     {
1331       struct re_fail_stack_ent_t *new_array;
1332       new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1333                               fs->alloc * 2);
1334       if (new_array == NULL)
1335 	return REG_ESPACE;
1336       fs->alloc *= 2;
1337       fs->stack = new_array;
1338     }
1339   fs->stack[num].idx = str_idx;
1340   fs->stack[num].node = dest_node;
1341   fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1342   if (fs->stack[num].regs == NULL)
1343     return REG_ESPACE;
1344   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1345   memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1346   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1347   return err;
1348 }
1349 
1350 static Idx
pop_fail_stack(struct re_fail_stack_t * fs,Idx * pidx,Idx nregs,regmatch_t * regs,regmatch_t * prevregs,re_node_set * eps_via_nodes)1351 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1352 		regmatch_t *regs, regmatch_t *prevregs,
1353 		re_node_set *eps_via_nodes)
1354 {
1355   if (fs == NULL || fs->num == 0)
1356     return -1;
1357   Idx num = --fs->num;
1358   *pidx = fs->stack[num].idx;
1359   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1360   memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
1361   re_node_set_free (eps_via_nodes);
1362   re_free (fs->stack[num].regs);
1363   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1364   DEBUG_ASSERT (0 <= fs->stack[num].node);
1365   return fs->stack[num].node;
1366 }
1367 
1368 
1369 #define DYNARRAY_STRUCT  regmatch_list
1370 #define DYNARRAY_ELEMENT regmatch_t
1371 #define DYNARRAY_PREFIX  regmatch_list_
1372 #include <malloc/dynarray-skeleton.c>
1373 
1374 /* Set the positions where the subexpressions are starts/ends to registers
1375    PMATCH.
1376    Note: We assume that pmatch[0] is already set, and
1377    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1378 
1379 static reg_errcode_t
1380 __attribute_warn_unused_result__
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,bool fl_backtrack)1381 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1382 	  regmatch_t *pmatch, bool fl_backtrack)
1383 {
1384   const re_dfa_t *dfa = preg->buffer;
1385   Idx idx, cur_node;
1386   re_node_set eps_via_nodes;
1387   struct re_fail_stack_t *fs;
1388   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1389   struct regmatch_list prev_match;
1390   regmatch_list_init (&prev_match);
1391 
1392   DEBUG_ASSERT (nmatch > 1);
1393   DEBUG_ASSERT (mctx->state_log != NULL);
1394   if (fl_backtrack)
1395     {
1396       fs = &fs_body;
1397       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1398       if (fs->stack == NULL)
1399 	return REG_ESPACE;
1400     }
1401   else
1402     fs = NULL;
1403 
1404   cur_node = dfa->init_node;
1405   re_node_set_init_empty (&eps_via_nodes);
1406 
1407   if (!regmatch_list_resize (&prev_match, nmatch))
1408     {
1409       regmatch_list_free (&prev_match);
1410       free_fail_stack_return (fs);
1411       return REG_ESPACE;
1412     }
1413   regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1414   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1415 
1416   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1417     {
1418       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1419 
1420       if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1421 	  || (fs && re_node_set_contains (&eps_via_nodes, cur_node)))
1422 	{
1423 	  Idx reg_idx;
1424 	  cur_node = -1;
1425 	  if (fs)
1426 	    {
1427 	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1428 		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1429 		  {
1430 		    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1431 					       prev_idx_match, &eps_via_nodes);
1432 		    break;
1433 		  }
1434 	    }
1435 	  if (cur_node < 0)
1436 	    {
1437 	      re_node_set_free (&eps_via_nodes);
1438 	      regmatch_list_free (&prev_match);
1439 	      return free_fail_stack_return (fs);
1440 	    }
1441 	}
1442 
1443       /* Proceed to next node.  */
1444       cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1445 				    &idx, cur_node,
1446 				    &eps_via_nodes, fs);
1447 
1448       if (__glibc_unlikely (cur_node < 0))
1449 	{
1450 	  if (__glibc_unlikely (cur_node == -2))
1451 	    {
1452 	      re_node_set_free (&eps_via_nodes);
1453 	      regmatch_list_free (&prev_match);
1454 	      free_fail_stack_return (fs);
1455 	      return REG_ESPACE;
1456 	    }
1457 	  cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1458 				     prev_idx_match, &eps_via_nodes);
1459 	  if (cur_node < 0)
1460 	    {
1461 	      re_node_set_free (&eps_via_nodes);
1462 	      regmatch_list_free (&prev_match);
1463 	      free_fail_stack_return (fs);
1464 	      return REG_NOMATCH;
1465 	    }
1466 	}
1467     }
1468   re_node_set_free (&eps_via_nodes);
1469   regmatch_list_free (&prev_match);
1470   return free_fail_stack_return (fs);
1471 }
1472 
1473 static reg_errcode_t
free_fail_stack_return(struct re_fail_stack_t * fs)1474 free_fail_stack_return (struct re_fail_stack_t *fs)
1475 {
1476   if (fs)
1477     {
1478       Idx fs_idx;
1479       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1480 	{
1481 	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1482 	  re_free (fs->stack[fs_idx].regs);
1483 	}
1484       re_free (fs->stack);
1485     }
1486   return REG_NOERROR;
1487 }
1488 
1489 static void
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,Idx cur_node,Idx cur_idx,Idx nmatch)1490 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1491 	     regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1492 {
1493   int type = dfa->nodes[cur_node].type;
1494   if (type == OP_OPEN_SUBEXP)
1495     {
1496       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1497 
1498       /* We are at the first node of this sub expression.  */
1499       if (reg_num < nmatch)
1500 	{
1501 	  pmatch[reg_num].rm_so = cur_idx;
1502 	  pmatch[reg_num].rm_eo = -1;
1503 	}
1504     }
1505   else if (type == OP_CLOSE_SUBEXP)
1506     {
1507       /* We are at the last node of this sub expression.  */
1508       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1509       if (reg_num < nmatch)
1510 	{
1511 	  if (pmatch[reg_num].rm_so < cur_idx)
1512 	    {
1513 	      pmatch[reg_num].rm_eo = cur_idx;
1514 	      /* This is a non-empty match or we are not inside an optional
1515 		 subexpression.  Accept this right away.  */
1516 	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1517 	    }
1518 	  else
1519 	    {
1520 	      if (dfa->nodes[cur_node].opt_subexp
1521 		  && prev_idx_match[reg_num].rm_so != -1)
1522 		/* We transited through an empty match for an optional
1523 		   subexpression, like (a?)*, and this is not the subexp's
1524 		   first match.  Copy back the old content of the registers
1525 		   so that matches of an inner subexpression are undone as
1526 		   well, like in ((a?))*.  */
1527 		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1528 	      else
1529 		/* We completed a subexpression, but it may be part of
1530 		   an optional one, so do not update PREV_IDX_MATCH.  */
1531 		pmatch[reg_num].rm_eo = cur_idx;
1532 	    }
1533 	}
1534     }
1535 }
1536 
1537 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1538    and sift the nodes in each states according to the following rules.
1539    Updated state_log will be wrote to STATE_LOG.
1540 
1541    Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1542      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1543 	If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1544 	the LAST_NODE, we throw away the node 'a'.
1545      2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1546 	string 's' and transit to 'b':
1547 	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1548 	   away the node 'a'.
1549 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1550 	    thrown away, we throw away the node 'a'.
1551      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1552 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1553 	   node 'a'.
1554 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1555 	    we throw away the node 'a'.  */
1556 
1557 #define STATE_NODE_CONTAINS(state,node) \
1558   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1559 
1560 static reg_errcode_t
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)1561 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1562 {
1563   reg_errcode_t err;
1564   int null_cnt = 0;
1565   Idx str_idx = sctx->last_str_idx;
1566   re_node_set cur_dest;
1567 
1568   DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1569 
1570   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1571      transit to the last_node and the last_node itself.  */
1572   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1573   if (__glibc_unlikely (err != REG_NOERROR))
1574     return err;
1575   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1576   if (__glibc_unlikely (err != REG_NOERROR))
1577     goto free_return;
1578 
1579   /* Then check each states in the state_log.  */
1580   while (str_idx > 0)
1581     {
1582       /* Update counters.  */
1583       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1584       if (null_cnt > mctx->max_mb_elem_len)
1585 	{
1586 	  memset (sctx->sifted_states, '\0',
1587 		  sizeof (re_dfastate_t *) * str_idx);
1588 	  re_node_set_free (&cur_dest);
1589 	  return REG_NOERROR;
1590 	}
1591       re_node_set_empty (&cur_dest);
1592       --str_idx;
1593 
1594       if (mctx->state_log[str_idx])
1595 	{
1596 	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1597 	  if (__glibc_unlikely (err != REG_NOERROR))
1598 	    goto free_return;
1599 	}
1600 
1601       /* Add all the nodes which satisfy the following conditions:
1602 	 - It can epsilon transit to a node in CUR_DEST.
1603 	 - It is in CUR_SRC.
1604 	 And update state_log.  */
1605       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1606       if (__glibc_unlikely (err != REG_NOERROR))
1607 	goto free_return;
1608     }
1609   err = REG_NOERROR;
1610  free_return:
1611   re_node_set_free (&cur_dest);
1612   return err;
1613 }
1614 
1615 static reg_errcode_t
1616 __attribute_warn_unused_result__
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * cur_dest)1617 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1618 		     Idx str_idx, re_node_set *cur_dest)
1619 {
1620   const re_dfa_t *const dfa = mctx->dfa;
1621   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1622   Idx i;
1623 
1624   /* Then build the next sifted state.
1625      We build the next sifted state on 'cur_dest', and update
1626      'sifted_states[str_idx]' with 'cur_dest'.
1627      Note:
1628      'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1629      'cur_src' points the node_set of the old 'state_log[str_idx]'
1630      (with the epsilon nodes pre-filtered out).  */
1631   for (i = 0; i < cur_src->nelem; i++)
1632     {
1633       Idx prev_node = cur_src->elems[i];
1634       int naccepted = 0;
1635       bool ok;
1636       DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1637 
1638 #ifdef RE_ENABLE_I18N
1639       /* If the node may accept "multi byte".  */
1640       if (dfa->nodes[prev_node].accept_mb)
1641 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1642 					 str_idx, sctx->last_str_idx);
1643 #endif /* RE_ENABLE_I18N */
1644 
1645       /* We don't check backreferences here.
1646 	 See update_cur_sifted_state().  */
1647       if (!naccepted
1648 	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1649 	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1650 				  dfa->nexts[prev_node]))
1651 	naccepted = 1;
1652 
1653       if (naccepted == 0)
1654 	continue;
1655 
1656       if (sctx->limits.nelem)
1657 	{
1658 	  Idx to_idx = str_idx + naccepted;
1659 	  if (check_dst_limits (mctx, &sctx->limits,
1660 				dfa->nexts[prev_node], to_idx,
1661 				prev_node, str_idx))
1662 	    continue;
1663 	}
1664       ok = re_node_set_insert (cur_dest, prev_node);
1665       if (__glibc_unlikely (! ok))
1666 	return REG_ESPACE;
1667     }
1668 
1669   return REG_NOERROR;
1670 }
1671 
1672 /* Helper functions.  */
1673 
1674 static reg_errcode_t
clean_state_log_if_needed(re_match_context_t * mctx,Idx next_state_log_idx)1675 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1676 {
1677   Idx top = mctx->state_log_top;
1678 
1679   if ((next_state_log_idx >= mctx->input.bufs_len
1680        && mctx->input.bufs_len < mctx->input.len)
1681       || (next_state_log_idx >= mctx->input.valid_len
1682 	  && mctx->input.valid_len < mctx->input.len))
1683     {
1684       reg_errcode_t err;
1685       err = extend_buffers (mctx, next_state_log_idx + 1);
1686       if (__glibc_unlikely (err != REG_NOERROR))
1687 	return err;
1688     }
1689 
1690   if (top < next_state_log_idx)
1691     {
1692       memset (mctx->state_log + top + 1, '\0',
1693 	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1694       mctx->state_log_top = next_state_log_idx;
1695     }
1696   return REG_NOERROR;
1697 }
1698 
1699 static reg_errcode_t
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,Idx num)1700 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1701 		   re_dfastate_t **src, Idx num)
1702 {
1703   Idx st_idx;
1704   reg_errcode_t err;
1705   for (st_idx = 0; st_idx < num; ++st_idx)
1706     {
1707       if (dst[st_idx] == NULL)
1708 	dst[st_idx] = src[st_idx];
1709       else if (src[st_idx] != NULL)
1710 	{
1711 	  re_node_set merged_set;
1712 	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1713 					&src[st_idx]->nodes);
1714 	  if (__glibc_unlikely (err != REG_NOERROR))
1715 	    return err;
1716 	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1717 	  re_node_set_free (&merged_set);
1718 	  if (__glibc_unlikely (err != REG_NOERROR))
1719 	    return err;
1720 	}
1721     }
1722   return REG_NOERROR;
1723 }
1724 
1725 static reg_errcode_t
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * dest_nodes)1726 update_cur_sifted_state (const re_match_context_t *mctx,
1727 			 re_sift_context_t *sctx, Idx str_idx,
1728 			 re_node_set *dest_nodes)
1729 {
1730   const re_dfa_t *const dfa = mctx->dfa;
1731   reg_errcode_t err = REG_NOERROR;
1732   const re_node_set *candidates;
1733   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1734 		: &mctx->state_log[str_idx]->nodes);
1735 
1736   if (dest_nodes->nelem == 0)
1737     sctx->sifted_states[str_idx] = NULL;
1738   else
1739     {
1740       if (candidates)
1741 	{
1742 	  /* At first, add the nodes which can epsilon transit to a node in
1743 	     DEST_NODE.  */
1744 	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1745 	  if (__glibc_unlikely (err != REG_NOERROR))
1746 	    return err;
1747 
1748 	  /* Then, check the limitations in the current sift_context.  */
1749 	  if (sctx->limits.nelem)
1750 	    {
1751 	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1752 					 mctx->bkref_ents, str_idx);
1753 	      if (__glibc_unlikely (err != REG_NOERROR))
1754 		return err;
1755 	    }
1756 	}
1757 
1758       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1759       if (__glibc_unlikely (err != REG_NOERROR))
1760 	return err;
1761     }
1762 
1763   if (candidates && mctx->state_log[str_idx]->has_backref)
1764     {
1765       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1766       if (__glibc_unlikely (err != REG_NOERROR))
1767 	return err;
1768     }
1769   return REG_NOERROR;
1770 }
1771 
1772 static reg_errcode_t
1773 __attribute_warn_unused_result__
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)1774 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1775 		       const re_node_set *candidates)
1776 {
1777   reg_errcode_t err = REG_NOERROR;
1778   Idx i;
1779 
1780   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1781   if (__glibc_unlikely (err != REG_NOERROR))
1782     return err;
1783 
1784   if (!state->inveclosure.alloc)
1785     {
1786       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1787       if (__glibc_unlikely (err != REG_NOERROR))
1788 	return REG_ESPACE;
1789       for (i = 0; i < dest_nodes->nelem; i++)
1790 	{
1791 	  err = re_node_set_merge (&state->inveclosure,
1792 				   dfa->inveclosures + dest_nodes->elems[i]);
1793 	  if (__glibc_unlikely (err != REG_NOERROR))
1794 	    return REG_ESPACE;
1795 	}
1796     }
1797   return re_node_set_add_intersect (dest_nodes, candidates,
1798 				    &state->inveclosure);
1799 }
1800 
1801 static reg_errcode_t
sub_epsilon_src_nodes(const re_dfa_t * dfa,Idx node,re_node_set * dest_nodes,const re_node_set * candidates)1802 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1803 		       const re_node_set *candidates)
1804 {
1805     Idx ecl_idx;
1806     reg_errcode_t err;
1807     re_node_set *inv_eclosure = dfa->inveclosures + node;
1808     re_node_set except_nodes;
1809     re_node_set_init_empty (&except_nodes);
1810     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1811       {
1812 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1813 	if (cur_node == node)
1814 	  continue;
1815 	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1816 	  {
1817 	    Idx edst1 = dfa->edests[cur_node].elems[0];
1818 	    Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1819 			 ? dfa->edests[cur_node].elems[1] : -1);
1820 	    if ((!re_node_set_contains (inv_eclosure, edst1)
1821 		 && re_node_set_contains (dest_nodes, edst1))
1822 		|| (edst2 > 0
1823 		    && !re_node_set_contains (inv_eclosure, edst2)
1824 		    && re_node_set_contains (dest_nodes, edst2)))
1825 	      {
1826 		err = re_node_set_add_intersect (&except_nodes, candidates,
1827 						 dfa->inveclosures + cur_node);
1828 		if (__glibc_unlikely (err != REG_NOERROR))
1829 		  {
1830 		    re_node_set_free (&except_nodes);
1831 		    return err;
1832 		  }
1833 	      }
1834 	  }
1835       }
1836     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1837       {
1838 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1839 	if (!re_node_set_contains (&except_nodes, cur_node))
1840 	  {
1841 	    Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1842 	    re_node_set_remove_at (dest_nodes, idx);
1843 	  }
1844       }
1845     re_node_set_free (&except_nodes);
1846     return REG_NOERROR;
1847 }
1848 
1849 static bool
check_dst_limits(const re_match_context_t * mctx,const re_node_set * limits,Idx dst_node,Idx dst_idx,Idx src_node,Idx src_idx)1850 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1851 		  Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1852 {
1853   const re_dfa_t *const dfa = mctx->dfa;
1854   Idx lim_idx, src_pos, dst_pos;
1855 
1856   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1857   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1858   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1859     {
1860       Idx subexp_idx;
1861       struct re_backref_cache_entry *ent;
1862       ent = mctx->bkref_ents + limits->elems[lim_idx];
1863       subexp_idx = dfa->nodes[ent->node].opr.idx;
1864 
1865       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1866 					   subexp_idx, dst_node, dst_idx,
1867 					   dst_bkref_idx);
1868       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1869 					   subexp_idx, src_node, src_idx,
1870 					   src_bkref_idx);
1871 
1872       /* In case of:
1873 	 <src> <dst> ( <subexp> )
1874 	 ( <subexp> ) <src> <dst>
1875 	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1876       if (src_pos == dst_pos)
1877 	continue; /* This is unrelated limitation.  */
1878       else
1879 	return true;
1880     }
1881   return false;
1882 }
1883 
1884 static int
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,Idx subexp_idx,Idx from_node,Idx bkref_idx)1885 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1886 			     Idx subexp_idx, Idx from_node, Idx bkref_idx)
1887 {
1888   const re_dfa_t *const dfa = mctx->dfa;
1889   const re_node_set *eclosures = dfa->eclosures + from_node;
1890   Idx node_idx;
1891 
1892   /* Else, we are on the boundary: examine the nodes on the epsilon
1893      closure.  */
1894   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1895     {
1896       Idx node = eclosures->elems[node_idx];
1897       switch (dfa->nodes[node].type)
1898 	{
1899 	case OP_BACK_REF:
1900 	  if (bkref_idx != -1)
1901 	    {
1902 	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1903 	      do
1904 		{
1905 		  Idx dst;
1906 		  int cpos;
1907 
1908 		  if (ent->node != node)
1909 		    continue;
1910 
1911 		  if (subexp_idx < BITSET_WORD_BITS
1912 		      && !(ent->eps_reachable_subexps_map
1913 			   & ((bitset_word_t) 1 << subexp_idx)))
1914 		    continue;
1915 
1916 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1917 		     OP_CLOSE_SUBEXP cases below.  But, if the
1918 		     destination node is the same node as the source
1919 		     node, don't recurse because it would cause an
1920 		     infinite loop: a regex that exhibits this behavior
1921 		     is ()\1*\1*  */
1922 		  dst = dfa->edests[node].elems[0];
1923 		  if (dst == from_node)
1924 		    {
1925 		      if (boundaries & 1)
1926 			return -1;
1927 		      else /* if (boundaries & 2) */
1928 			return 0;
1929 		    }
1930 
1931 		  cpos =
1932 		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1933 						 dst, bkref_idx);
1934 		  if (cpos == -1 /* && (boundaries & 1) */)
1935 		    return -1;
1936 		  if (cpos == 0 && (boundaries & 2))
1937 		    return 0;
1938 
1939 		  if (subexp_idx < BITSET_WORD_BITS)
1940 		    ent->eps_reachable_subexps_map
1941 		      &= ~((bitset_word_t) 1 << subexp_idx);
1942 		}
1943 	      while (ent++->more);
1944 	    }
1945 	  break;
1946 
1947 	case OP_OPEN_SUBEXP:
1948 	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1949 	    return -1;
1950 	  break;
1951 
1952 	case OP_CLOSE_SUBEXP:
1953 	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1954 	    return 0;
1955 	  break;
1956 
1957 	default:
1958 	    break;
1959 	}
1960     }
1961 
1962   return (boundaries & 2) ? 1 : 0;
1963 }
1964 
1965 static int
check_dst_limits_calc_pos(const re_match_context_t * mctx,Idx limit,Idx subexp_idx,Idx from_node,Idx str_idx,Idx bkref_idx)1966 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1967 			   Idx subexp_idx, Idx from_node, Idx str_idx,
1968 			   Idx bkref_idx)
1969 {
1970   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1971   int boundaries;
1972 
1973   /* If we are outside the range of the subexpression, return -1 or 1.  */
1974   if (str_idx < lim->subexp_from)
1975     return -1;
1976 
1977   if (lim->subexp_to < str_idx)
1978     return 1;
1979 
1980   /* If we are within the subexpression, return 0.  */
1981   boundaries = (str_idx == lim->subexp_from);
1982   boundaries |= (str_idx == lim->subexp_to) << 1;
1983   if (boundaries == 0)
1984     return 0;
1985 
1986   /* Else, examine epsilon closure.  */
1987   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1988 				      from_node, bkref_idx);
1989 }
1990 
1991 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1992    which are against limitations from DEST_NODES. */
1993 
1994 static reg_errcode_t
check_subexp_limits(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,Idx str_idx)1995 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1996 		     const re_node_set *candidates, re_node_set *limits,
1997 		     struct re_backref_cache_entry *bkref_ents, Idx str_idx)
1998 {
1999   reg_errcode_t err;
2000   Idx node_idx, lim_idx;
2001 
2002   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2003     {
2004       Idx subexp_idx;
2005       struct re_backref_cache_entry *ent;
2006       ent = bkref_ents + limits->elems[lim_idx];
2007 
2008       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2009 	continue; /* This is unrelated limitation.  */
2010 
2011       subexp_idx = dfa->nodes[ent->node].opr.idx;
2012       if (ent->subexp_to == str_idx)
2013 	{
2014 	  Idx ops_node = -1;
2015 	  Idx cls_node = -1;
2016 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2017 	    {
2018 	      Idx node = dest_nodes->elems[node_idx];
2019 	      re_token_type_t type = dfa->nodes[node].type;
2020 	      if (type == OP_OPEN_SUBEXP
2021 		  && subexp_idx == dfa->nodes[node].opr.idx)
2022 		ops_node = node;
2023 	      else if (type == OP_CLOSE_SUBEXP
2024 		       && subexp_idx == dfa->nodes[node].opr.idx)
2025 		cls_node = node;
2026 	    }
2027 
2028 	  /* Check the limitation of the open subexpression.  */
2029 	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2030 	  if (ops_node >= 0)
2031 	    {
2032 	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2033 					   candidates);
2034 	      if (__glibc_unlikely (err != REG_NOERROR))
2035 		return err;
2036 	    }
2037 
2038 	  /* Check the limitation of the close subexpression.  */
2039 	  if (cls_node >= 0)
2040 	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2041 	      {
2042 		Idx node = dest_nodes->elems[node_idx];
2043 		if (!re_node_set_contains (dfa->inveclosures + node,
2044 					   cls_node)
2045 		    && !re_node_set_contains (dfa->eclosures + node,
2046 					      cls_node))
2047 		  {
2048 		    /* It is against this limitation.
2049 		       Remove it form the current sifted state.  */
2050 		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2051 						 candidates);
2052 		    if (__glibc_unlikely (err != REG_NOERROR))
2053 		      return err;
2054 		    --node_idx;
2055 		  }
2056 	      }
2057 	}
2058       else /* (ent->subexp_to != str_idx)  */
2059 	{
2060 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2061 	    {
2062 	      Idx node = dest_nodes->elems[node_idx];
2063 	      re_token_type_t type = dfa->nodes[node].type;
2064 	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2065 		{
2066 		  if (subexp_idx != dfa->nodes[node].opr.idx)
2067 		    continue;
2068 		  /* It is against this limitation.
2069 		     Remove it form the current sifted state.  */
2070 		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2071 					       candidates);
2072 		  if (__glibc_unlikely (err != REG_NOERROR))
2073 		    return err;
2074 		}
2075 	    }
2076 	}
2077     }
2078   return REG_NOERROR;
2079 }
2080 
2081 static reg_errcode_t
2082 __attribute_warn_unused_result__
sift_states_bkref(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,const re_node_set * candidates)2083 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2084 		   Idx str_idx, const re_node_set *candidates)
2085 {
2086   const re_dfa_t *const dfa = mctx->dfa;
2087   reg_errcode_t err;
2088   Idx node_idx, node;
2089   re_sift_context_t local_sctx;
2090   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2091 
2092   if (first_idx == -1)
2093     return REG_NOERROR;
2094 
2095   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2096 
2097   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2098     {
2099       Idx enabled_idx;
2100       re_token_type_t type;
2101       struct re_backref_cache_entry *entry;
2102       node = candidates->elems[node_idx];
2103       type = dfa->nodes[node].type;
2104       /* Avoid infinite loop for the REs like "()\1+".  */
2105       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2106 	continue;
2107       if (type != OP_BACK_REF)
2108 	continue;
2109 
2110       entry = mctx->bkref_ents + first_idx;
2111       enabled_idx = first_idx;
2112       do
2113 	{
2114 	  Idx subexp_len;
2115 	  Idx to_idx;
2116 	  Idx dst_node;
2117 	  bool ok;
2118 	  re_dfastate_t *cur_state;
2119 
2120 	  if (entry->node != node)
2121 	    continue;
2122 	  subexp_len = entry->subexp_to - entry->subexp_from;
2123 	  to_idx = str_idx + subexp_len;
2124 	  dst_node = (subexp_len ? dfa->nexts[node]
2125 		      : dfa->edests[node].elems[0]);
2126 
2127 	  if (to_idx > sctx->last_str_idx
2128 	      || sctx->sifted_states[to_idx] == NULL
2129 	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2130 	      || check_dst_limits (mctx, &sctx->limits, node,
2131 				   str_idx, dst_node, to_idx))
2132 	    continue;
2133 
2134 	  if (local_sctx.sifted_states == NULL)
2135 	    {
2136 	      local_sctx = *sctx;
2137 	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2138 	      if (__glibc_unlikely (err != REG_NOERROR))
2139 		goto free_return;
2140 	    }
2141 	  local_sctx.last_node = node;
2142 	  local_sctx.last_str_idx = str_idx;
2143 	  ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2144 	  if (__glibc_unlikely (! ok))
2145 	    {
2146 	      err = REG_ESPACE;
2147 	      goto free_return;
2148 	    }
2149 	  cur_state = local_sctx.sifted_states[str_idx];
2150 	  err = sift_states_backward (mctx, &local_sctx);
2151 	  if (__glibc_unlikely (err != REG_NOERROR))
2152 	    goto free_return;
2153 	  if (sctx->limited_states != NULL)
2154 	    {
2155 	      err = merge_state_array (dfa, sctx->limited_states,
2156 				       local_sctx.sifted_states,
2157 				       str_idx + 1);
2158 	      if (__glibc_unlikely (err != REG_NOERROR))
2159 		goto free_return;
2160 	    }
2161 	  local_sctx.sifted_states[str_idx] = cur_state;
2162 	  re_node_set_remove (&local_sctx.limits, enabled_idx);
2163 
2164 	  /* mctx->bkref_ents may have changed, reload the pointer.  */
2165 	  entry = mctx->bkref_ents + enabled_idx;
2166 	}
2167       while (enabled_idx++, entry++->more);
2168     }
2169   err = REG_NOERROR;
2170  free_return:
2171   if (local_sctx.sifted_states != NULL)
2172     {
2173       re_node_set_free (&local_sctx.limits);
2174     }
2175 
2176   return err;
2177 }
2178 
2179 
2180 #ifdef RE_ENABLE_I18N
2181 static int
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx node_idx,Idx str_idx,Idx max_str_idx)2182 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2183 		     Idx node_idx, Idx str_idx, Idx max_str_idx)
2184 {
2185   const re_dfa_t *const dfa = mctx->dfa;
2186   int naccepted;
2187   /* Check the node can accept "multi byte".  */
2188   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2189   if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2190       && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2191 			       dfa->nexts[node_idx]))
2192     /* The node can't accept the "multi byte", or the
2193        destination was already thrown away, then the node
2194        couldn't accept the current input "multi byte".   */
2195     naccepted = 0;
2196   /* Otherwise, it is sure that the node could accept
2197      'naccepted' bytes input.  */
2198   return naccepted;
2199 }
2200 #endif /* RE_ENABLE_I18N */
2201 
2202 
2203 /* Functions for state transition.  */
2204 
2205 /* Return the next state to which the current state STATE will transit by
2206    accepting the current input byte, and update STATE_LOG if necessary.
2207    Return NULL on failure.
2208    If STATE can accept a multibyte char/collating element/back reference
2209    update the destination of STATE_LOG.  */
2210 
2211 static re_dfastate_t *
2212 __attribute_warn_unused_result__
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)2213 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2214 	       re_dfastate_t *state)
2215 {
2216   re_dfastate_t **trtable;
2217   unsigned char ch;
2218 
2219 #ifdef RE_ENABLE_I18N
2220   /* If the current state can accept multibyte.  */
2221   if (__glibc_unlikely (state->accept_mb))
2222     {
2223       *err = transit_state_mb (mctx, state);
2224       if (__glibc_unlikely (*err != REG_NOERROR))
2225 	return NULL;
2226     }
2227 #endif /* RE_ENABLE_I18N */
2228 
2229   /* Then decide the next state with the single byte.  */
2230 #if 0
2231   if (0)
2232     /* don't use transition table  */
2233     return transit_state_sb (err, mctx, state);
2234 #endif
2235 
2236   /* Use transition table  */
2237   ch = re_string_fetch_byte (&mctx->input);
2238   for (;;)
2239     {
2240       trtable = state->trtable;
2241       if (__glibc_likely (trtable != NULL))
2242 	return trtable[ch];
2243 
2244       trtable = state->word_trtable;
2245       if (__glibc_likely (trtable != NULL))
2246 	{
2247 	  unsigned int context;
2248 	  context
2249 	    = re_string_context_at (&mctx->input,
2250 				    re_string_cur_idx (&mctx->input) - 1,
2251 				    mctx->eflags);
2252 	  if (IS_WORD_CONTEXT (context))
2253 	    return trtable[ch + SBC_MAX];
2254 	  else
2255 	    return trtable[ch];
2256 	}
2257 
2258       if (!build_trtable (mctx->dfa, state))
2259 	{
2260 	  *err = REG_ESPACE;
2261 	  return NULL;
2262 	}
2263 
2264       /* Retry, we now have a transition table.  */
2265     }
2266 }
2267 
2268 /* Update the state_log if we need */
2269 static re_dfastate_t *
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)2270 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2271 		      re_dfastate_t *next_state)
2272 {
2273   const re_dfa_t *const dfa = mctx->dfa;
2274   Idx cur_idx = re_string_cur_idx (&mctx->input);
2275 
2276   if (cur_idx > mctx->state_log_top)
2277     {
2278       mctx->state_log[cur_idx] = next_state;
2279       mctx->state_log_top = cur_idx;
2280     }
2281   else if (mctx->state_log[cur_idx] == 0)
2282     {
2283       mctx->state_log[cur_idx] = next_state;
2284     }
2285   else
2286     {
2287       re_dfastate_t *pstate;
2288       unsigned int context;
2289       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2290       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2291 	 the destination of a multibyte char/collating element/
2292 	 back reference.  Then the next state is the union set of
2293 	 these destinations and the results of the transition table.  */
2294       pstate = mctx->state_log[cur_idx];
2295       log_nodes = pstate->entrance_nodes;
2296       if (next_state != NULL)
2297 	{
2298 	  table_nodes = next_state->entrance_nodes;
2299 	  *err = re_node_set_init_union (&next_nodes, table_nodes,
2300 					     log_nodes);
2301 	  if (__glibc_unlikely (*err != REG_NOERROR))
2302 	    return NULL;
2303 	}
2304       else
2305 	next_nodes = *log_nodes;
2306       /* Note: We already add the nodes of the initial state,
2307 	 then we don't need to add them here.  */
2308 
2309       context = re_string_context_at (&mctx->input,
2310 				      re_string_cur_idx (&mctx->input) - 1,
2311 				      mctx->eflags);
2312       next_state = mctx->state_log[cur_idx]
2313 	= re_acquire_state_context (err, dfa, &next_nodes, context);
2314       /* We don't need to check errors here, since the return value of
2315 	 this function is next_state and ERR is already set.  */
2316 
2317       if (table_nodes != NULL)
2318 	re_node_set_free (&next_nodes);
2319     }
2320 
2321   if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2322     {
2323       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2324 	 later.  We must check them here, since the back references in the
2325 	 next state might use them.  */
2326       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2327 					cur_idx);
2328       if (__glibc_unlikely (*err != REG_NOERROR))
2329 	return NULL;
2330 
2331       /* If the next state has back references.  */
2332       if (next_state->has_backref)
2333 	{
2334 	  *err = transit_state_bkref (mctx, &next_state->nodes);
2335 	  if (__glibc_unlikely (*err != REG_NOERROR))
2336 	    return NULL;
2337 	  next_state = mctx->state_log[cur_idx];
2338 	}
2339     }
2340 
2341   return next_state;
2342 }
2343 
2344 /* Skip bytes in the input that correspond to part of a
2345    multi-byte match, then look in the log for a state
2346    from which to restart matching.  */
2347 static re_dfastate_t *
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2348 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2349 {
2350   re_dfastate_t *cur_state;
2351   do
2352     {
2353       Idx max = mctx->state_log_top;
2354       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2355 
2356       do
2357 	{
2358 	  if (++cur_str_idx > max)
2359 	    return NULL;
2360 	  re_string_skip_bytes (&mctx->input, 1);
2361 	}
2362       while (mctx->state_log[cur_str_idx] == NULL);
2363 
2364       cur_state = merge_state_with_log (err, mctx, NULL);
2365     }
2366   while (*err == REG_NOERROR && cur_state == NULL);
2367   return cur_state;
2368 }
2369 
2370 /* Helper functions for transit_state.  */
2371 
2372 /* From the node set CUR_NODES, pick up the nodes whose types are
2373    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2374    expression. And register them to use them later for evaluating the
2375    corresponding back references.  */
2376 
2377 static reg_errcode_t
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,Idx str_idx)2378 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2379 			   Idx str_idx)
2380 {
2381   const re_dfa_t *const dfa = mctx->dfa;
2382   Idx node_idx;
2383   reg_errcode_t err;
2384 
2385   /* TODO: This isn't efficient.
2386 	   Because there might be more than one nodes whose types are
2387 	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2388 	   nodes.
2389 	   E.g. RE: (a){2}  */
2390   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2391     {
2392       Idx node = cur_nodes->elems[node_idx];
2393       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2394 	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2395 	  && (dfa->used_bkref_map
2396 	      & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2397 	{
2398 	  err = match_ctx_add_subtop (mctx, node, str_idx);
2399 	  if (__glibc_unlikely (err != REG_NOERROR))
2400 	    return err;
2401 	}
2402     }
2403   return REG_NOERROR;
2404 }
2405 
2406 #if 0
2407 /* Return the next state to which the current state STATE will transit by
2408    accepting the current input byte.  Return NULL on failure.  */
2409 
2410 static re_dfastate_t *
2411 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2412 		  re_dfastate_t *state)
2413 {
2414   const re_dfa_t *const dfa = mctx->dfa;
2415   re_node_set next_nodes;
2416   re_dfastate_t *next_state;
2417   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2418   unsigned int context;
2419 
2420   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2421   if (__glibc_unlikely (*err != REG_NOERROR))
2422     return NULL;
2423   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2424     {
2425       Idx cur_node = state->nodes.elems[node_cnt];
2426       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2427 	{
2428 	  *err = re_node_set_merge (&next_nodes,
2429 				    dfa->eclosures + dfa->nexts[cur_node]);
2430 	  if (__glibc_unlikely (*err != REG_NOERROR))
2431 	    {
2432 	      re_node_set_free (&next_nodes);
2433 	      return NULL;
2434 	    }
2435 	}
2436     }
2437   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2438   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2439   /* We don't need to check errors here, since the return value of
2440      this function is next_state and ERR is already set.  */
2441 
2442   re_node_set_free (&next_nodes);
2443   re_string_skip_bytes (&mctx->input, 1);
2444   return next_state;
2445 }
2446 #endif
2447 
2448 #ifdef RE_ENABLE_I18N
2449 static reg_errcode_t
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)2450 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2451 {
2452   const re_dfa_t *const dfa = mctx->dfa;
2453   reg_errcode_t err;
2454   Idx i;
2455 
2456   for (i = 0; i < pstate->nodes.nelem; ++i)
2457     {
2458       re_node_set dest_nodes, *new_nodes;
2459       Idx cur_node_idx = pstate->nodes.elems[i];
2460       int naccepted;
2461       Idx dest_idx;
2462       unsigned int context;
2463       re_dfastate_t *dest_state;
2464 
2465       if (!dfa->nodes[cur_node_idx].accept_mb)
2466 	continue;
2467 
2468       if (dfa->nodes[cur_node_idx].constraint)
2469 	{
2470 	  context = re_string_context_at (&mctx->input,
2471 					  re_string_cur_idx (&mctx->input),
2472 					  mctx->eflags);
2473 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2474 					   context))
2475 	    continue;
2476 	}
2477 
2478       /* How many bytes the node can accept?  */
2479       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2480 					   re_string_cur_idx (&mctx->input));
2481       if (naccepted == 0)
2482 	continue;
2483 
2484       /* The node can accepts 'naccepted' bytes.  */
2485       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2486       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2487 			       : mctx->max_mb_elem_len);
2488       err = clean_state_log_if_needed (mctx, dest_idx);
2489       if (__glibc_unlikely (err != REG_NOERROR))
2490 	return err;
2491       DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2492       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2493 
2494       dest_state = mctx->state_log[dest_idx];
2495       if (dest_state == NULL)
2496 	dest_nodes = *new_nodes;
2497       else
2498 	{
2499 	  err = re_node_set_init_union (&dest_nodes,
2500 					dest_state->entrance_nodes, new_nodes);
2501 	  if (__glibc_unlikely (err != REG_NOERROR))
2502 	    return err;
2503 	}
2504       context = re_string_context_at (&mctx->input, dest_idx - 1,
2505 				      mctx->eflags);
2506       mctx->state_log[dest_idx]
2507 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2508       if (dest_state != NULL)
2509 	re_node_set_free (&dest_nodes);
2510       if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2511 			    && err != REG_NOERROR))
2512 	return err;
2513     }
2514   return REG_NOERROR;
2515 }
2516 #endif /* RE_ENABLE_I18N */
2517 
2518 static reg_errcode_t
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)2519 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2520 {
2521   const re_dfa_t *const dfa = mctx->dfa;
2522   reg_errcode_t err;
2523   Idx i;
2524   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2525 
2526   for (i = 0; i < nodes->nelem; ++i)
2527     {
2528       Idx dest_str_idx, prev_nelem, bkc_idx;
2529       Idx node_idx = nodes->elems[i];
2530       unsigned int context;
2531       const re_token_t *node = dfa->nodes + node_idx;
2532       re_node_set *new_dest_nodes;
2533 
2534       /* Check whether 'node' is a backreference or not.  */
2535       if (node->type != OP_BACK_REF)
2536 	continue;
2537 
2538       if (node->constraint)
2539 	{
2540 	  context = re_string_context_at (&mctx->input, cur_str_idx,
2541 					  mctx->eflags);
2542 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2543 	    continue;
2544 	}
2545 
2546       /* 'node' is a backreference.
2547 	 Check the substring which the substring matched.  */
2548       bkc_idx = mctx->nbkref_ents;
2549       err = get_subexp (mctx, node_idx, cur_str_idx);
2550       if (__glibc_unlikely (err != REG_NOERROR))
2551 	goto free_return;
2552 
2553       /* And add the epsilon closures (which is 'new_dest_nodes') of
2554 	 the backreference to appropriate state_log.  */
2555       DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2556       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2557 	{
2558 	  Idx subexp_len;
2559 	  re_dfastate_t *dest_state;
2560 	  struct re_backref_cache_entry *bkref_ent;
2561 	  bkref_ent = mctx->bkref_ents + bkc_idx;
2562 	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2563 	    continue;
2564 	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2565 	  new_dest_nodes = (subexp_len == 0
2566 			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2567 			    : dfa->eclosures + dfa->nexts[node_idx]);
2568 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2569 			  - bkref_ent->subexp_from);
2570 	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2571 					  mctx->eflags);
2572 	  dest_state = mctx->state_log[dest_str_idx];
2573 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2574 			: mctx->state_log[cur_str_idx]->nodes.nelem);
2575 	  /* Add 'new_dest_node' to state_log.  */
2576 	  if (dest_state == NULL)
2577 	    {
2578 	      mctx->state_log[dest_str_idx]
2579 		= re_acquire_state_context (&err, dfa, new_dest_nodes,
2580 					    context);
2581 	      if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2582 				    && err != REG_NOERROR))
2583 		goto free_return;
2584 	    }
2585 	  else
2586 	    {
2587 	      re_node_set dest_nodes;
2588 	      err = re_node_set_init_union (&dest_nodes,
2589 					    dest_state->entrance_nodes,
2590 					    new_dest_nodes);
2591 	      if (__glibc_unlikely (err != REG_NOERROR))
2592 		{
2593 		  re_node_set_free (&dest_nodes);
2594 		  goto free_return;
2595 		}
2596 	      mctx->state_log[dest_str_idx]
2597 		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2598 	      re_node_set_free (&dest_nodes);
2599 	      if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2600 				    && err != REG_NOERROR))
2601 		goto free_return;
2602 	    }
2603 	  /* We need to check recursively if the backreference can epsilon
2604 	     transit.  */
2605 	  if (subexp_len == 0
2606 	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2607 	    {
2608 	      err = check_subexp_matching_top (mctx, new_dest_nodes,
2609 					       cur_str_idx);
2610 	      if (__glibc_unlikely (err != REG_NOERROR))
2611 		goto free_return;
2612 	      err = transit_state_bkref (mctx, new_dest_nodes);
2613 	      if (__glibc_unlikely (err != REG_NOERROR))
2614 		goto free_return;
2615 	    }
2616 	}
2617     }
2618   err = REG_NOERROR;
2619  free_return:
2620   return err;
2621 }
2622 
2623 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2624    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2625    Note that we might collect inappropriate candidates here.
2626    However, the cost of checking them strictly here is too high, then we
2627    delay these checking for prune_impossible_nodes().  */
2628 
2629 static reg_errcode_t
2630 __attribute_warn_unused_result__
get_subexp(re_match_context_t * mctx,Idx bkref_node,Idx bkref_str_idx)2631 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2632 {
2633   const re_dfa_t *const dfa = mctx->dfa;
2634   Idx subexp_num, sub_top_idx;
2635   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2636   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2637   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2638   if (cache_idx != -1)
2639     {
2640       const struct re_backref_cache_entry *entry
2641 	= mctx->bkref_ents + cache_idx;
2642       do
2643 	if (entry->node == bkref_node)
2644 	  return REG_NOERROR; /* We already checked it.  */
2645       while (entry++->more);
2646     }
2647 
2648   subexp_num = dfa->nodes[bkref_node].opr.idx;
2649 
2650   /* For each sub expression  */
2651   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2652     {
2653       reg_errcode_t err;
2654       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2655       re_sub_match_last_t *sub_last;
2656       Idx sub_last_idx, sl_str, bkref_str_off;
2657 
2658       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2659 	continue; /* It isn't related.  */
2660 
2661       sl_str = sub_top->str_idx;
2662       bkref_str_off = bkref_str_idx;
2663       /* At first, check the last node of sub expressions we already
2664 	 evaluated.  */
2665       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2666 	{
2667 	  regoff_t sl_str_diff;
2668 	  sub_last = sub_top->lasts[sub_last_idx];
2669 	  sl_str_diff = sub_last->str_idx - sl_str;
2670 	  /* The matched string by the sub expression match with the substring
2671 	     at the back reference?  */
2672 	  if (sl_str_diff > 0)
2673 	    {
2674 	      if (__glibc_unlikely (bkref_str_off + sl_str_diff
2675 				    > mctx->input.valid_len))
2676 		{
2677 		  /* Not enough chars for a successful match.  */
2678 		  if (bkref_str_off + sl_str_diff > mctx->input.len)
2679 		    break;
2680 
2681 		  err = clean_state_log_if_needed (mctx,
2682 						   bkref_str_off
2683 						   + sl_str_diff);
2684 		  if (__glibc_unlikely (err != REG_NOERROR))
2685 		    return err;
2686 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2687 		}
2688 	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2689 		/* We don't need to search this sub expression any more.  */
2690 		break;
2691 	    }
2692 	  bkref_str_off += sl_str_diff;
2693 	  sl_str += sl_str_diff;
2694 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2695 				bkref_str_idx);
2696 
2697 	  /* Reload buf, since the preceding call might have reallocated
2698 	     the buffer.  */
2699 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2700 
2701 	  if (err == REG_NOMATCH)
2702 	    continue;
2703 	  if (__glibc_unlikely (err != REG_NOERROR))
2704 	    return err;
2705 	}
2706 
2707       if (sub_last_idx < sub_top->nlasts)
2708 	continue;
2709       if (sub_last_idx > 0)
2710 	++sl_str;
2711       /* Then, search for the other last nodes of the sub expression.  */
2712       for (; sl_str <= bkref_str_idx; ++sl_str)
2713 	{
2714 	  Idx cls_node;
2715 	  regoff_t sl_str_off;
2716 	  const re_node_set *nodes;
2717 	  sl_str_off = sl_str - sub_top->str_idx;
2718 	  /* The matched string by the sub expression match with the substring
2719 	     at the back reference?  */
2720 	  if (sl_str_off > 0)
2721 	    {
2722 	      if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2723 		{
2724 		  /* If we are at the end of the input, we cannot match.  */
2725 		  if (bkref_str_off >= mctx->input.len)
2726 		    break;
2727 
2728 		  err = extend_buffers (mctx, bkref_str_off + 1);
2729 		  if (__glibc_unlikely (err != REG_NOERROR))
2730 		    return err;
2731 
2732 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2733 		}
2734 	      if (buf [bkref_str_off++] != buf[sl_str - 1])
2735 		break; /* We don't need to search this sub expression
2736 			  any more.  */
2737 	    }
2738 	  if (mctx->state_log[sl_str] == NULL)
2739 	    continue;
2740 	  /* Does this state have a ')' of the sub expression?  */
2741 	  nodes = &mctx->state_log[sl_str]->nodes;
2742 	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
2743 				       OP_CLOSE_SUBEXP);
2744 	  if (cls_node == -1)
2745 	    continue; /* No.  */
2746 	  if (sub_top->path == NULL)
2747 	    {
2748 	      sub_top->path = calloc (sizeof (state_array_t),
2749 				      sl_str - sub_top->str_idx + 1);
2750 	      if (sub_top->path == NULL)
2751 		return REG_ESPACE;
2752 	    }
2753 	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2754 	     in the current context?  */
2755 	  err = check_arrival (mctx, sub_top->path, sub_top->node,
2756 			       sub_top->str_idx, cls_node, sl_str,
2757 			       OP_CLOSE_SUBEXP);
2758 	  if (err == REG_NOMATCH)
2759 	      continue;
2760 	  if (__glibc_unlikely (err != REG_NOERROR))
2761 	      return err;
2762 	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2763 	  if (__glibc_unlikely (sub_last == NULL))
2764 	    return REG_ESPACE;
2765 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2766 				bkref_str_idx);
2767 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2768 	  if (err == REG_NOMATCH)
2769 	    continue;
2770 	  if (__glibc_unlikely (err != REG_NOERROR))
2771 	    return err;
2772 	}
2773     }
2774   return REG_NOERROR;
2775 }
2776 
2777 /* Helper functions for get_subexp().  */
2778 
2779 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2780    If it can arrive, register the sub expression expressed with SUB_TOP
2781    and SUB_LAST.  */
2782 
2783 static reg_errcode_t
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,Idx bkref_node,Idx bkref_str)2784 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2785 		re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2786 {
2787   reg_errcode_t err;
2788   Idx to_idx;
2789   /* Can the subexpression arrive the back reference?  */
2790   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2791 		       sub_last->str_idx, bkref_node, bkref_str,
2792 		       OP_OPEN_SUBEXP);
2793   if (err != REG_NOERROR)
2794     return err;
2795   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2796 			     sub_last->str_idx);
2797   if (__glibc_unlikely (err != REG_NOERROR))
2798     return err;
2799   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2800   return clean_state_log_if_needed (mctx, to_idx);
2801 }
2802 
2803 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2804    Search '(' if FL_OPEN, or search ')' otherwise.
2805    TODO: This function isn't efficient...
2806 	 Because there might be more than one nodes whose types are
2807 	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2808 	 nodes.
2809 	 E.g. RE: (a){2}  */
2810 
2811 static Idx
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,Idx subexp_idx,int type)2812 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2813 		  Idx subexp_idx, int type)
2814 {
2815   Idx cls_idx;
2816   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2817     {
2818       Idx cls_node = nodes->elems[cls_idx];
2819       const re_token_t *node = dfa->nodes + cls_node;
2820       if (node->type == type
2821 	  && node->opr.idx == subexp_idx)
2822 	return cls_node;
2823     }
2824   return -1;
2825 }
2826 
2827 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2828    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2829    heavily reused.
2830    Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2831    REG_ESPACE if memory is exhausted.  */
2832 
2833 static reg_errcode_t
2834 __attribute_warn_unused_result__
check_arrival(re_match_context_t * mctx,state_array_t * path,Idx top_node,Idx top_str,Idx last_node,Idx last_str,int type)2835 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2836 	       Idx top_str, Idx last_node, Idx last_str, int type)
2837 {
2838   const re_dfa_t *const dfa = mctx->dfa;
2839   reg_errcode_t err = REG_NOERROR;
2840   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2841   re_dfastate_t *cur_state = NULL;
2842   re_node_set *cur_nodes, next_nodes;
2843   re_dfastate_t **backup_state_log;
2844   unsigned int context;
2845 
2846   subexp_num = dfa->nodes[top_node].opr.idx;
2847   /* Extend the buffer if we need.  */
2848   if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2849     {
2850       re_dfastate_t **new_array;
2851       Idx old_alloc = path->alloc;
2852       Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2853       Idx new_alloc;
2854       if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2855 	return REG_ESPACE;
2856       new_alloc = old_alloc + incr_alloc;
2857       if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2858 	return REG_ESPACE;
2859       new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2860       if (__glibc_unlikely (new_array == NULL))
2861 	return REG_ESPACE;
2862       path->array = new_array;
2863       path->alloc = new_alloc;
2864       memset (new_array + old_alloc, '\0',
2865 	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2866     }
2867 
2868   str_idx = path->next_idx ? path->next_idx : top_str;
2869 
2870   /* Temporary modify MCTX.  */
2871   backup_state_log = mctx->state_log;
2872   backup_cur_idx = mctx->input.cur_idx;
2873   mctx->state_log = path->array;
2874   mctx->input.cur_idx = str_idx;
2875 
2876   /* Setup initial node set.  */
2877   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2878   if (str_idx == top_str)
2879     {
2880       err = re_node_set_init_1 (&next_nodes, top_node);
2881       if (__glibc_unlikely (err != REG_NOERROR))
2882 	return err;
2883       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2884       if (__glibc_unlikely (err != REG_NOERROR))
2885 	{
2886 	  re_node_set_free (&next_nodes);
2887 	  return err;
2888 	}
2889     }
2890   else
2891     {
2892       cur_state = mctx->state_log[str_idx];
2893       if (cur_state && cur_state->has_backref)
2894 	{
2895 	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2896 	  if (__glibc_unlikely (err != REG_NOERROR))
2897 	    return err;
2898 	}
2899       else
2900 	re_node_set_init_empty (&next_nodes);
2901     }
2902   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2903     {
2904       if (next_nodes.nelem)
2905 	{
2906 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2907 				    subexp_num, type);
2908 	  if (__glibc_unlikely (err != REG_NOERROR))
2909 	    {
2910 	      re_node_set_free (&next_nodes);
2911 	      return err;
2912 	    }
2913 	}
2914       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2915       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2916 	{
2917 	  re_node_set_free (&next_nodes);
2918 	  return err;
2919 	}
2920       mctx->state_log[str_idx] = cur_state;
2921     }
2922 
2923   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2924     {
2925       re_node_set_empty (&next_nodes);
2926       if (mctx->state_log[str_idx + 1])
2927 	{
2928 	  err = re_node_set_merge (&next_nodes,
2929 				   &mctx->state_log[str_idx + 1]->nodes);
2930 	  if (__glibc_unlikely (err != REG_NOERROR))
2931 	    {
2932 	      re_node_set_free (&next_nodes);
2933 	      return err;
2934 	    }
2935 	}
2936       if (cur_state)
2937 	{
2938 	  err = check_arrival_add_next_nodes (mctx, str_idx,
2939 					      &cur_state->non_eps_nodes,
2940 					      &next_nodes);
2941 	  if (__glibc_unlikely (err != REG_NOERROR))
2942 	    {
2943 	      re_node_set_free (&next_nodes);
2944 	      return err;
2945 	    }
2946 	}
2947       ++str_idx;
2948       if (next_nodes.nelem)
2949 	{
2950 	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2951 	  if (__glibc_unlikely (err != REG_NOERROR))
2952 	    {
2953 	      re_node_set_free (&next_nodes);
2954 	      return err;
2955 	    }
2956 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2957 				    subexp_num, type);
2958 	  if (__glibc_unlikely (err != REG_NOERROR))
2959 	    {
2960 	      re_node_set_free (&next_nodes);
2961 	      return err;
2962 	    }
2963 	}
2964       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2965       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2966       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2967 	{
2968 	  re_node_set_free (&next_nodes);
2969 	  return err;
2970 	}
2971       mctx->state_log[str_idx] = cur_state;
2972       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2973     }
2974   re_node_set_free (&next_nodes);
2975   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2976 	       : &mctx->state_log[last_str]->nodes);
2977   path->next_idx = str_idx;
2978 
2979   /* Fix MCTX.  */
2980   mctx->state_log = backup_state_log;
2981   mctx->input.cur_idx = backup_cur_idx;
2982 
2983   /* Then check the current node set has the node LAST_NODE.  */
2984   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2985     return REG_NOERROR;
2986 
2987   return REG_NOMATCH;
2988 }
2989 
2990 /* Helper functions for check_arrival.  */
2991 
2992 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2993    to NEXT_NODES.
2994    TODO: This function is similar to the functions transit_state*(),
2995 	 however this function has many additional works.
2996 	 Can't we unify them?  */
2997 
2998 static reg_errcode_t
2999 __attribute_warn_unused_result__
check_arrival_add_next_nodes(re_match_context_t * mctx,Idx str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)3000 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3001 			      re_node_set *cur_nodes, re_node_set *next_nodes)
3002 {
3003   const re_dfa_t *const dfa = mctx->dfa;
3004   bool ok;
3005   Idx cur_idx;
3006 #ifdef RE_ENABLE_I18N
3007   reg_errcode_t err = REG_NOERROR;
3008 #endif
3009   re_node_set union_set;
3010   re_node_set_init_empty (&union_set);
3011   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3012     {
3013       int naccepted = 0;
3014       Idx cur_node = cur_nodes->elems[cur_idx];
3015       DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
3016 
3017 #ifdef RE_ENABLE_I18N
3018       /* If the node may accept "multi byte".  */
3019       if (dfa->nodes[cur_node].accept_mb)
3020 	{
3021 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3022 					       str_idx);
3023 	  if (naccepted > 1)
3024 	    {
3025 	      re_dfastate_t *dest_state;
3026 	      Idx next_node = dfa->nexts[cur_node];
3027 	      Idx next_idx = str_idx + naccepted;
3028 	      dest_state = mctx->state_log[next_idx];
3029 	      re_node_set_empty (&union_set);
3030 	      if (dest_state)
3031 		{
3032 		  err = re_node_set_merge (&union_set, &dest_state->nodes);
3033 		  if (__glibc_unlikely (err != REG_NOERROR))
3034 		    {
3035 		      re_node_set_free (&union_set);
3036 		      return err;
3037 		    }
3038 		}
3039 	      ok = re_node_set_insert (&union_set, next_node);
3040 	      if (__glibc_unlikely (! ok))
3041 		{
3042 		  re_node_set_free (&union_set);
3043 		  return REG_ESPACE;
3044 		}
3045 	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3046 							    &union_set);
3047 	      if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3048 				    && err != REG_NOERROR))
3049 		{
3050 		  re_node_set_free (&union_set);
3051 		  return err;
3052 		}
3053 	    }
3054 	}
3055 #endif /* RE_ENABLE_I18N */
3056       if (naccepted
3057 	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3058 	{
3059 	  ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3060 	  if (__glibc_unlikely (! ok))
3061 	    {
3062 	      re_node_set_free (&union_set);
3063 	      return REG_ESPACE;
3064 	    }
3065 	}
3066     }
3067   re_node_set_free (&union_set);
3068   return REG_NOERROR;
3069 }
3070 
3071 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3072    CUR_NODES, however exclude the nodes which are:
3073     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3074     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3075 */
3076 
3077 static reg_errcode_t
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,Idx ex_subexp,int type)3078 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3079 			  Idx ex_subexp, int type)
3080 {
3081   reg_errcode_t err;
3082   Idx idx, outside_node;
3083   re_node_set new_nodes;
3084   DEBUG_ASSERT (cur_nodes->nelem);
3085   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3086   if (__glibc_unlikely (err != REG_NOERROR))
3087     return err;
3088   /* Create a new node set NEW_NODES with the nodes which are epsilon
3089      closures of the node in CUR_NODES.  */
3090 
3091   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3092     {
3093       Idx cur_node = cur_nodes->elems[idx];
3094       const re_node_set *eclosure = dfa->eclosures + cur_node;
3095       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3096       if (outside_node == -1)
3097 	{
3098 	  /* There are no problematic nodes, just merge them.  */
3099 	  err = re_node_set_merge (&new_nodes, eclosure);
3100 	  if (__glibc_unlikely (err != REG_NOERROR))
3101 	    {
3102 	      re_node_set_free (&new_nodes);
3103 	      return err;
3104 	    }
3105 	}
3106       else
3107 	{
3108 	  /* There are problematic nodes, re-calculate incrementally.  */
3109 	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3110 					      ex_subexp, type);
3111 	  if (__glibc_unlikely (err != REG_NOERROR))
3112 	    {
3113 	      re_node_set_free (&new_nodes);
3114 	      return err;
3115 	    }
3116 	}
3117     }
3118   re_node_set_free (cur_nodes);
3119   *cur_nodes = new_nodes;
3120   return REG_NOERROR;
3121 }
3122 
3123 /* Helper function for check_arrival_expand_ecl.
3124    Check incrementally the epsilon closure of TARGET, and if it isn't
3125    problematic append it to DST_NODES.  */
3126 
3127 static reg_errcode_t
3128 __attribute_warn_unused_result__
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,Idx target,Idx ex_subexp,int type)3129 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3130 			      Idx target, Idx ex_subexp, int type)
3131 {
3132   Idx cur_node;
3133   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3134     {
3135       bool ok;
3136 
3137       if (dfa->nodes[cur_node].type == type
3138 	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
3139 	{
3140 	  if (type == OP_CLOSE_SUBEXP)
3141 	    {
3142 	      ok = re_node_set_insert (dst_nodes, cur_node);
3143 	      if (__glibc_unlikely (! ok))
3144 		return REG_ESPACE;
3145 	    }
3146 	  break;
3147 	}
3148       ok = re_node_set_insert (dst_nodes, cur_node);
3149       if (__glibc_unlikely (! ok))
3150 	return REG_ESPACE;
3151       if (dfa->edests[cur_node].nelem == 0)
3152 	break;
3153       if (dfa->edests[cur_node].nelem == 2)
3154 	{
3155 	  reg_errcode_t err;
3156 	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3157 					      dfa->edests[cur_node].elems[1],
3158 					      ex_subexp, type);
3159 	  if (__glibc_unlikely (err != REG_NOERROR))
3160 	    return err;
3161 	}
3162       cur_node = dfa->edests[cur_node].elems[0];
3163     }
3164   return REG_NOERROR;
3165 }
3166 
3167 
3168 /* For all the back references in the current state, calculate the
3169    destination of the back references by the appropriate entry
3170    in MCTX->BKREF_ENTS.  */
3171 
3172 static reg_errcode_t
3173 __attribute_warn_unused_result__
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,Idx cur_str,Idx subexp_num,int type)3174 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3175 		    Idx cur_str, Idx subexp_num, int type)
3176 {
3177   const re_dfa_t *const dfa = mctx->dfa;
3178   reg_errcode_t err;
3179   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3180   struct re_backref_cache_entry *ent;
3181 
3182   if (cache_idx_start == -1)
3183     return REG_NOERROR;
3184 
3185  restart:
3186   ent = mctx->bkref_ents + cache_idx_start;
3187   do
3188     {
3189       Idx to_idx, next_node;
3190 
3191       /* Is this entry ENT is appropriate?  */
3192       if (!re_node_set_contains (cur_nodes, ent->node))
3193 	continue; /* No.  */
3194 
3195       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3196       /* Calculate the destination of the back reference, and append it
3197 	 to MCTX->STATE_LOG.  */
3198       if (to_idx == cur_str)
3199 	{
3200 	  /* The backreference did epsilon transit, we must re-check all the
3201 	     node in the current state.  */
3202 	  re_node_set new_dests;
3203 	  reg_errcode_t err2, err3;
3204 	  next_node = dfa->edests[ent->node].elems[0];
3205 	  if (re_node_set_contains (cur_nodes, next_node))
3206 	    continue;
3207 	  err = re_node_set_init_1 (&new_dests, next_node);
3208 	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3209 	  err3 = re_node_set_merge (cur_nodes, &new_dests);
3210 	  re_node_set_free (&new_dests);
3211 	  if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3212 				|| err3 != REG_NOERROR))
3213 	    {
3214 	      err = (err != REG_NOERROR ? err
3215 		     : (err2 != REG_NOERROR ? err2 : err3));
3216 	      return err;
3217 	    }
3218 	  /* TODO: It is still inefficient...  */
3219 	  goto restart;
3220 	}
3221       else
3222 	{
3223 	  re_node_set union_set;
3224 	  next_node = dfa->nexts[ent->node];
3225 	  if (mctx->state_log[to_idx])
3226 	    {
3227 	      bool ok;
3228 	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3229 					next_node))
3230 		continue;
3231 	      err = re_node_set_init_copy (&union_set,
3232 					   &mctx->state_log[to_idx]->nodes);
3233 	      ok = re_node_set_insert (&union_set, next_node);
3234 	      if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3235 		{
3236 		  re_node_set_free (&union_set);
3237 		  err = err != REG_NOERROR ? err : REG_ESPACE;
3238 		  return err;
3239 		}
3240 	    }
3241 	  else
3242 	    {
3243 	      err = re_node_set_init_1 (&union_set, next_node);
3244 	      if (__glibc_unlikely (err != REG_NOERROR))
3245 		return err;
3246 	    }
3247 	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3248 	  re_node_set_free (&union_set);
3249 	  if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3250 				&& err != REG_NOERROR))
3251 	    return err;
3252 	}
3253     }
3254   while (ent++->more);
3255   return REG_NOERROR;
3256 }
3257 
3258 /* Build transition table for the state.
3259    Return true if successful.  */
3260 
3261 static bool __attribute_noinline__
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)3262 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3263 {
3264   reg_errcode_t err;
3265   Idx i, j;
3266   int ch;
3267   bool need_word_trtable = false;
3268   bitset_word_t elem, mask;
3269   Idx ndests; /* Number of the destination states from 'state'.  */
3270   re_dfastate_t **trtable;
3271   re_dfastate_t *dest_states[SBC_MAX];
3272   re_dfastate_t *dest_states_word[SBC_MAX];
3273   re_dfastate_t *dest_states_nl[SBC_MAX];
3274   re_node_set follows;
3275   bitset_t acceptable;
3276 
3277   /* We build DFA states which corresponds to the destination nodes
3278      from 'state'.  'dests_node[i]' represents the nodes which i-th
3279      destination state contains, and 'dests_ch[i]' represents the
3280      characters which i-th destination state accepts.  */
3281   re_node_set dests_node[SBC_MAX];
3282   bitset_t dests_ch[SBC_MAX];
3283 
3284   /* Initialize transition table.  */
3285   state->word_trtable = state->trtable = NULL;
3286 
3287   /* At first, group all nodes belonging to 'state' into several
3288      destinations.  */
3289   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3290   if (__glibc_unlikely (ndests <= 0))
3291     {
3292       /* Return false in case of an error, true otherwise.  */
3293       if (ndests == 0)
3294 	{
3295 	  state->trtable = (re_dfastate_t **)
3296 	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
3297           if (__glibc_unlikely (state->trtable == NULL))
3298             return false;
3299 	  return true;
3300 	}
3301       return false;
3302     }
3303 
3304   err = re_node_set_alloc (&follows, ndests + 1);
3305   if (__glibc_unlikely (err != REG_NOERROR))
3306     {
3307     out_free:
3308       re_node_set_free (&follows);
3309       for (i = 0; i < ndests; ++i)
3310 	re_node_set_free (dests_node + i);
3311       return false;
3312     }
3313 
3314   bitset_empty (acceptable);
3315 
3316   /* Then build the states for all destinations.  */
3317   for (i = 0; i < ndests; ++i)
3318     {
3319       Idx next_node;
3320       re_node_set_empty (&follows);
3321       /* Merge the follows of this destination states.  */
3322       for (j = 0; j < dests_node[i].nelem; ++j)
3323 	{
3324 	  next_node = dfa->nexts[dests_node[i].elems[j]];
3325 	  if (next_node != -1)
3326 	    {
3327 	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3328 	      if (__glibc_unlikely (err != REG_NOERROR))
3329 		goto out_free;
3330 	    }
3331 	}
3332       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3333       if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3334 	goto out_free;
3335       /* If the new state has context constraint,
3336 	 build appropriate states for these contexts.  */
3337       if (dest_states[i]->has_constraint)
3338 	{
3339 	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3340 							  CONTEXT_WORD);
3341 	  if (__glibc_unlikely (dest_states_word[i] == NULL
3342 				&& err != REG_NOERROR))
3343 	    goto out_free;
3344 
3345 	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3346 	    need_word_trtable = true;
3347 
3348 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3349 							CONTEXT_NEWLINE);
3350 	  if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3351 	    goto out_free;
3352 	}
3353       else
3354 	{
3355 	  dest_states_word[i] = dest_states[i];
3356 	  dest_states_nl[i] = dest_states[i];
3357 	}
3358       bitset_merge (acceptable, dests_ch[i]);
3359     }
3360 
3361   if (!__glibc_unlikely (need_word_trtable))
3362     {
3363       /* We don't care about whether the following character is a word
3364 	 character, or we are in a single-byte character set so we can
3365 	 discern by looking at the character code: allocate a
3366 	 256-entry transition table.  */
3367       trtable = state->trtable =
3368 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3369       if (__glibc_unlikely (trtable == NULL))
3370 	goto out_free;
3371 
3372       /* For all characters ch...:  */
3373       for (i = 0; i < BITSET_WORDS; ++i)
3374 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3375 	     elem;
3376 	     mask <<= 1, elem >>= 1, ++ch)
3377 	  if (__glibc_unlikely (elem & 1))
3378 	    {
3379 	      /* There must be exactly one destination which accepts
3380 		 character ch.  See group_nodes_into_DFAstates.  */
3381 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3382 		;
3383 
3384 	      /* j-th destination accepts the word character ch.  */
3385 	      if (dfa->word_char[i] & mask)
3386 		trtable[ch] = dest_states_word[j];
3387 	      else
3388 		trtable[ch] = dest_states[j];
3389 	    }
3390     }
3391   else
3392     {
3393       /* We care about whether the following character is a word
3394 	 character, and we are in a multi-byte character set: discern
3395 	 by looking at the character code: build two 256-entry
3396 	 transition tables, one starting at trtable[0] and one
3397 	 starting at trtable[SBC_MAX].  */
3398       trtable = state->word_trtable =
3399 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3400       if (__glibc_unlikely (trtable == NULL))
3401 	goto out_free;
3402 
3403       /* For all characters ch...:  */
3404       for (i = 0; i < BITSET_WORDS; ++i)
3405 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3406 	     elem;
3407 	     mask <<= 1, elem >>= 1, ++ch)
3408 	  if (__glibc_unlikely (elem & 1))
3409 	    {
3410 	      /* There must be exactly one destination which accepts
3411 		 character ch.  See group_nodes_into_DFAstates.  */
3412 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3413 		;
3414 
3415 	      /* j-th destination accepts the word character ch.  */
3416 	      trtable[ch] = dest_states[j];
3417 	      trtable[ch + SBC_MAX] = dest_states_word[j];
3418 	    }
3419     }
3420 
3421   /* new line */
3422   if (bitset_contain (acceptable, NEWLINE_CHAR))
3423     {
3424       /* The current state accepts newline character.  */
3425       for (j = 0; j < ndests; ++j)
3426 	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3427 	  {
3428 	    /* k-th destination accepts newline character.  */
3429 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
3430 	    if (need_word_trtable)
3431 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3432 	    /* There must be only one destination which accepts
3433 	       newline.  See group_nodes_into_DFAstates.  */
3434 	    break;
3435 	  }
3436     }
3437 
3438   re_node_set_free (&follows);
3439   for (i = 0; i < ndests; ++i)
3440     re_node_set_free (dests_node + i);
3441   return true;
3442 }
3443 
3444 /* Group all nodes belonging to STATE into several destinations.
3445    Then for all destinations, set the nodes belonging to the destination
3446    to DESTS_NODE[i] and set the characters accepted by the destination
3447    to DEST_CH[i].  Return the number of destinations if successful,
3448    -1 on internal error.  */
3449 
3450 static Idx
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)3451 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3452 			    re_node_set *dests_node, bitset_t *dests_ch)
3453 {
3454   reg_errcode_t err;
3455   bool ok;
3456   Idx i, j, k;
3457   Idx ndests; /* Number of the destinations from 'state'.  */
3458   bitset_t accepts; /* Characters a node can accept.  */
3459   const re_node_set *cur_nodes = &state->nodes;
3460   bitset_empty (accepts);
3461   ndests = 0;
3462 
3463   /* For all the nodes belonging to 'state',  */
3464   for (i = 0; i < cur_nodes->nelem; ++i)
3465     {
3466       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3467       re_token_type_t type = node->type;
3468       unsigned int constraint = node->constraint;
3469 
3470       /* Enumerate all single byte character this node can accept.  */
3471       if (type == CHARACTER)
3472 	bitset_set (accepts, node->opr.c);
3473       else if (type == SIMPLE_BRACKET)
3474 	{
3475 	  bitset_merge (accepts, node->opr.sbcset);
3476 	}
3477       else if (type == OP_PERIOD)
3478 	{
3479 #ifdef RE_ENABLE_I18N
3480 	  if (dfa->mb_cur_max > 1)
3481 	    bitset_merge (accepts, dfa->sb_char);
3482 	  else
3483 #endif
3484 	    bitset_set_all (accepts);
3485 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3486 	    bitset_clear (accepts, '\n');
3487 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3488 	    bitset_clear (accepts, '\0');
3489 	}
3490 #ifdef RE_ENABLE_I18N
3491       else if (type == OP_UTF8_PERIOD)
3492 	{
3493 	  if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3494 	    memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3495 	  else
3496 	    bitset_merge (accepts, utf8_sb_map);
3497 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3498 	    bitset_clear (accepts, '\n');
3499 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3500 	    bitset_clear (accepts, '\0');
3501 	}
3502 #endif
3503       else
3504 	continue;
3505 
3506       /* Check the 'accepts' and sift the characters which are not
3507 	 match it the context.  */
3508       if (constraint)
3509 	{
3510 	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
3511 	    {
3512 	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3513 	      bitset_empty (accepts);
3514 	      if (accepts_newline)
3515 		bitset_set (accepts, NEWLINE_CHAR);
3516 	      else
3517 		continue;
3518 	    }
3519 	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
3520 	    {
3521 	      bitset_empty (accepts);
3522 	      continue;
3523 	    }
3524 
3525 	  if (constraint & NEXT_WORD_CONSTRAINT)
3526 	    {
3527 	      bitset_word_t any_set = 0;
3528 	      if (type == CHARACTER && !node->word_char)
3529 		{
3530 		  bitset_empty (accepts);
3531 		  continue;
3532 		}
3533 #ifdef RE_ENABLE_I18N
3534 	      if (dfa->mb_cur_max > 1)
3535 		for (j = 0; j < BITSET_WORDS; ++j)
3536 		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3537 	      else
3538 #endif
3539 		for (j = 0; j < BITSET_WORDS; ++j)
3540 		  any_set |= (accepts[j] &= dfa->word_char[j]);
3541 	      if (!any_set)
3542 		continue;
3543 	    }
3544 	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
3545 	    {
3546 	      bitset_word_t any_set = 0;
3547 	      if (type == CHARACTER && node->word_char)
3548 		{
3549 		  bitset_empty (accepts);
3550 		  continue;
3551 		}
3552 #ifdef RE_ENABLE_I18N
3553 	      if (dfa->mb_cur_max > 1)
3554 		for (j = 0; j < BITSET_WORDS; ++j)
3555 		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3556 	      else
3557 #endif
3558 		for (j = 0; j < BITSET_WORDS; ++j)
3559 		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3560 	      if (!any_set)
3561 		continue;
3562 	    }
3563 	}
3564 
3565       /* Then divide 'accepts' into DFA states, or create a new
3566 	 state.  Above, we make sure that accepts is not empty.  */
3567       for (j = 0; j < ndests; ++j)
3568 	{
3569 	  bitset_t intersec; /* Intersection sets, see below.  */
3570 	  bitset_t remains;
3571 	  /* Flags, see below.  */
3572 	  bitset_word_t has_intersec, not_subset, not_consumed;
3573 
3574 	  /* Optimization, skip if this state doesn't accept the character.  */
3575 	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3576 	    continue;
3577 
3578 	  /* Enumerate the intersection set of this state and 'accepts'.  */
3579 	  has_intersec = 0;
3580 	  for (k = 0; k < BITSET_WORDS; ++k)
3581 	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3582 	  /* And skip if the intersection set is empty.  */
3583 	  if (!has_intersec)
3584 	    continue;
3585 
3586 	  /* Then check if this state is a subset of 'accepts'.  */
3587 	  not_subset = not_consumed = 0;
3588 	  for (k = 0; k < BITSET_WORDS; ++k)
3589 	    {
3590 	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3591 	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3592 	    }
3593 
3594 	  /* If this state isn't a subset of 'accepts', create a
3595 	     new group state, which has the 'remains'. */
3596 	  if (not_subset)
3597 	    {
3598 	      bitset_copy (dests_ch[ndests], remains);
3599 	      bitset_copy (dests_ch[j], intersec);
3600 	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3601 	      if (__glibc_unlikely (err != REG_NOERROR))
3602 		goto error_return;
3603 	      ++ndests;
3604 	    }
3605 
3606 	  /* Put the position in the current group. */
3607 	  ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3608 	  if (__glibc_unlikely (! ok))
3609 	    goto error_return;
3610 
3611 	  /* If all characters are consumed, go to next node. */
3612 	  if (!not_consumed)
3613 	    break;
3614 	}
3615       /* Some characters remain, create a new group. */
3616       if (j == ndests)
3617 	{
3618 	  bitset_copy (dests_ch[ndests], accepts);
3619 	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3620 	  if (__glibc_unlikely (err != REG_NOERROR))
3621 	    goto error_return;
3622 	  ++ndests;
3623 	  bitset_empty (accepts);
3624 	}
3625     }
3626   assume (ndests <= SBC_MAX);
3627   return ndests;
3628  error_return:
3629   for (j = 0; j < ndests; ++j)
3630     re_node_set_free (dests_node + j);
3631   return -1;
3632 }
3633 
3634 #ifdef RE_ENABLE_I18N
3635 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3636    Return the number of the bytes the node accepts.
3637    STR_IDX is the current index of the input string.
3638 
3639    This function handles the nodes which can accept one character, or
3640    one collating element like '.', '[a-z]', opposite to the other nodes
3641    can only accept one byte.  */
3642 
3643 # ifdef _LIBC
3644 #  include <locale/weight.h>
3645 # endif
3646 
3647 static int
check_node_accept_bytes(const re_dfa_t * dfa,Idx node_idx,const re_string_t * input,Idx str_idx)3648 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3649 			 const re_string_t *input, Idx str_idx)
3650 {
3651   const re_token_t *node = dfa->nodes + node_idx;
3652   int char_len, elem_len;
3653   Idx i;
3654 
3655   if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3656     {
3657       unsigned char c = re_string_byte_at (input, str_idx), d;
3658       if (__glibc_likely (c < 0xc2))
3659 	return 0;
3660 
3661       if (str_idx + 2 > input->len)
3662 	return 0;
3663 
3664       d = re_string_byte_at (input, str_idx + 1);
3665       if (c < 0xe0)
3666 	return (d < 0x80 || d > 0xbf) ? 0 : 2;
3667       else if (c < 0xf0)
3668 	{
3669 	  char_len = 3;
3670 	  if (c == 0xe0 && d < 0xa0)
3671 	    return 0;
3672 	}
3673       else if (c < 0xf8)
3674 	{
3675 	  char_len = 4;
3676 	  if (c == 0xf0 && d < 0x90)
3677 	    return 0;
3678 	}
3679       else if (c < 0xfc)
3680 	{
3681 	  char_len = 5;
3682 	  if (c == 0xf8 && d < 0x88)
3683 	    return 0;
3684 	}
3685       else if (c < 0xfe)
3686 	{
3687 	  char_len = 6;
3688 	  if (c == 0xfc && d < 0x84)
3689 	    return 0;
3690 	}
3691       else
3692 	return 0;
3693 
3694       if (str_idx + char_len > input->len)
3695 	return 0;
3696 
3697       for (i = 1; i < char_len; ++i)
3698 	{
3699 	  d = re_string_byte_at (input, str_idx + i);
3700 	  if (d < 0x80 || d > 0xbf)
3701 	    return 0;
3702 	}
3703       return char_len;
3704     }
3705 
3706   char_len = re_string_char_size_at (input, str_idx);
3707   if (node->type == OP_PERIOD)
3708     {
3709       if (char_len <= 1)
3710 	return 0;
3711       /* FIXME: I don't think this if is needed, as both '\n'
3712 	 and '\0' are char_len == 1.  */
3713       /* '.' accepts any one character except the following two cases.  */
3714       if ((!(dfa->syntax & RE_DOT_NEWLINE)
3715 	   && re_string_byte_at (input, str_idx) == '\n')
3716 	  || ((dfa->syntax & RE_DOT_NOT_NULL)
3717 	      && re_string_byte_at (input, str_idx) == '\0'))
3718 	return 0;
3719       return char_len;
3720     }
3721 
3722   elem_len = re_string_elem_size_at (input, str_idx);
3723   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3724     return 0;
3725 
3726   if (node->type == COMPLEX_BRACKET)
3727     {
3728       const re_charset_t *cset = node->opr.mbcset;
3729 # ifdef _LIBC
3730       const unsigned char *pin
3731 	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3732       Idx j;
3733       uint32_t nrules;
3734 # endif /* _LIBC */
3735       int match_len = 0;
3736       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3737 		    ? re_string_wchar_at (input, str_idx) : 0);
3738 
3739       /* match with multibyte character?  */
3740       for (i = 0; i < cset->nmbchars; ++i)
3741 	if (wc == cset->mbchars[i])
3742 	  {
3743 	    match_len = char_len;
3744 	    goto check_node_accept_bytes_match;
3745 	  }
3746       /* match with character_class?  */
3747       for (i = 0; i < cset->nchar_classes; ++i)
3748 	{
3749 	  wctype_t wt = cset->char_classes[i];
3750 	  if (__iswctype (wc, wt))
3751 	    {
3752 	      match_len = char_len;
3753 	      goto check_node_accept_bytes_match;
3754 	    }
3755 	}
3756 
3757 # ifdef _LIBC
3758       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3759       if (nrules != 0)
3760 	{
3761 	  unsigned int in_collseq = 0;
3762 	  const int32_t *table, *indirect;
3763 	  const unsigned char *weights, *extra;
3764 	  const char *collseqwc;
3765 
3766 	  /* match with collating_symbol?  */
3767 	  if (cset->ncoll_syms)
3768 	    extra = (const unsigned char *)
3769 	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3770 	  for (i = 0; i < cset->ncoll_syms; ++i)
3771 	    {
3772 	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
3773 	      /* Compare the length of input collating element and
3774 		 the length of current collating element.  */
3775 	      if (*coll_sym != elem_len)
3776 		continue;
3777 	      /* Compare each bytes.  */
3778 	      for (j = 0; j < *coll_sym; j++)
3779 		if (pin[j] != coll_sym[1 + j])
3780 		  break;
3781 	      if (j == *coll_sym)
3782 		{
3783 		  /* Match if every bytes is equal.  */
3784 		  match_len = j;
3785 		  goto check_node_accept_bytes_match;
3786 		}
3787 	    }
3788 
3789 	  if (cset->nranges)
3790 	    {
3791 	      if (elem_len <= char_len)
3792 		{
3793 		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3794 		  in_collseq = __collseq_table_lookup (collseqwc, wc);
3795 		}
3796 	      else
3797 		in_collseq = find_collation_sequence_value (pin, elem_len);
3798 	    }
3799 	  /* match with range expression?  */
3800 	  /* FIXME: Implement rational ranges here, too.  */
3801 	  for (i = 0; i < cset->nranges; ++i)
3802 	    if (cset->range_starts[i] <= in_collseq
3803 		&& in_collseq <= cset->range_ends[i])
3804 	      {
3805 		match_len = elem_len;
3806 		goto check_node_accept_bytes_match;
3807 	      }
3808 
3809 	  /* match with equivalence_class?  */
3810 	  if (cset->nequiv_classes)
3811 	    {
3812 	      const unsigned char *cp = pin;
3813 	      table = (const int32_t *)
3814 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3815 	      weights = (const unsigned char *)
3816 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3817 	      extra = (const unsigned char *)
3818 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3819 	      indirect = (const int32_t *)
3820 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3821 	      int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3822 	      int32_t rule = idx >> 24;
3823 	      idx &= 0xffffff;
3824 	      if (idx > 0)
3825 		{
3826 		  size_t weight_len = weights[idx];
3827 		  for (i = 0; i < cset->nequiv_classes; ++i)
3828 		    {
3829 		      int32_t equiv_class_idx = cset->equiv_classes[i];
3830 		      int32_t equiv_class_rule = equiv_class_idx >> 24;
3831 		      equiv_class_idx &= 0xffffff;
3832 		      if (weights[equiv_class_idx] == weight_len
3833 			  && equiv_class_rule == rule
3834 			  && memcmp (weights + idx + 1,
3835 				     weights + equiv_class_idx + 1,
3836 				     weight_len) == 0)
3837 			{
3838 			  match_len = elem_len;
3839 			  goto check_node_accept_bytes_match;
3840 			}
3841 		    }
3842 		}
3843 	    }
3844 	}
3845       else
3846 # endif /* _LIBC */
3847 	{
3848 	  /* match with range expression?  */
3849 	  for (i = 0; i < cset->nranges; ++i)
3850 	    {
3851 	      if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3852 		{
3853 		  match_len = char_len;
3854 		  goto check_node_accept_bytes_match;
3855 		}
3856 	    }
3857 	}
3858     check_node_accept_bytes_match:
3859       if (!cset->non_match)
3860 	return match_len;
3861       else
3862 	{
3863 	  if (match_len > 0)
3864 	    return 0;
3865 	  else
3866 	    return (elem_len > char_len) ? elem_len : char_len;
3867 	}
3868     }
3869   return 0;
3870 }
3871 
3872 # ifdef _LIBC
3873 static unsigned int
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)3874 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3875 {
3876   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3877   if (nrules == 0)
3878     {
3879       if (mbs_len == 1)
3880 	{
3881 	  /* No valid character.  Match it as a single byte character.  */
3882 	  const unsigned char *collseq = (const unsigned char *)
3883 	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3884 	  return collseq[mbs[0]];
3885 	}
3886       return UINT_MAX;
3887     }
3888   else
3889     {
3890       int32_t idx;
3891       const unsigned char *extra = (const unsigned char *)
3892 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3893       int32_t extrasize = (const unsigned char *)
3894 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3895 
3896       for (idx = 0; idx < extrasize;)
3897 	{
3898 	  int mbs_cnt;
3899 	  bool found = false;
3900 	  int32_t elem_mbs_len;
3901 	  /* Skip the name of collating element name.  */
3902 	  idx = idx + extra[idx] + 1;
3903 	  elem_mbs_len = extra[idx++];
3904 	  if (mbs_len == elem_mbs_len)
3905 	    {
3906 	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3907 		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3908 		  break;
3909 	      if (mbs_cnt == elem_mbs_len)
3910 		/* Found the entry.  */
3911 		found = true;
3912 	    }
3913 	  /* Skip the byte sequence of the collating element.  */
3914 	  idx += elem_mbs_len;
3915 	  /* Adjust for the alignment.  */
3916 	  idx = (idx + 3) & ~3;
3917 	  /* Skip the collation sequence value.  */
3918 	  idx += sizeof (uint32_t);
3919 	  /* Skip the wide char sequence of the collating element.  */
3920 	  idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3921 	  /* If we found the entry, return the sequence value.  */
3922 	  if (found)
3923 	    return *(uint32_t *) (extra + idx);
3924 	  /* Skip the collation sequence value.  */
3925 	  idx += sizeof (uint32_t);
3926 	}
3927       return UINT_MAX;
3928     }
3929 }
3930 # endif /* _LIBC */
3931 #endif /* RE_ENABLE_I18N */
3932 
3933 /* Check whether the node accepts the byte which is IDX-th
3934    byte of the INPUT.  */
3935 
3936 static bool
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,Idx idx)3937 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3938 		   Idx idx)
3939 {
3940   unsigned char ch;
3941   ch = re_string_byte_at (&mctx->input, idx);
3942   switch (node->type)
3943     {
3944     case CHARACTER:
3945       if (node->opr.c != ch)
3946         return false;
3947       break;
3948 
3949     case SIMPLE_BRACKET:
3950       if (!bitset_contain (node->opr.sbcset, ch))
3951         return false;
3952       break;
3953 
3954 #ifdef RE_ENABLE_I18N
3955     case OP_UTF8_PERIOD:
3956       if (ch >= ASCII_CHARS)
3957         return false;
3958       FALLTHROUGH;
3959 #endif
3960     case OP_PERIOD:
3961       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3962 	  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3963 	return false;
3964       break;
3965 
3966     default:
3967       return false;
3968     }
3969 
3970   if (node->constraint)
3971     {
3972       /* The node has constraints.  Check whether the current context
3973 	 satisfies the constraints.  */
3974       unsigned int context = re_string_context_at (&mctx->input, idx,
3975 						   mctx->eflags);
3976       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3977 	return false;
3978     }
3979 
3980   return true;
3981 }
3982 
3983 /* Extend the buffers, if the buffers have run out.  */
3984 
3985 static reg_errcode_t
3986 __attribute_warn_unused_result__
extend_buffers(re_match_context_t * mctx,int min_len)3987 extend_buffers (re_match_context_t *mctx, int min_len)
3988 {
3989   reg_errcode_t ret;
3990   re_string_t *pstr = &mctx->input;
3991 
3992   /* Avoid overflow.  */
3993   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3994 			<= pstr->bufs_len))
3995     return REG_ESPACE;
3996 
3997   /* Double the lengths of the buffers, but allocate at least MIN_LEN.  */
3998   ret = re_string_realloc_buffers (pstr,
3999 				   MAX (min_len,
4000 					MIN (pstr->len, pstr->bufs_len * 2)));
4001   if (__glibc_unlikely (ret != REG_NOERROR))
4002     return ret;
4003 
4004   if (mctx->state_log != NULL)
4005     {
4006       /* And double the length of state_log.  */
4007       /* XXX We have no indication of the size of this buffer.  If this
4008 	 allocation fail we have no indication that the state_log array
4009 	 does not have the right size.  */
4010       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4011 					      pstr->bufs_len + 1);
4012       if (__glibc_unlikely (new_array == NULL))
4013 	return REG_ESPACE;
4014       mctx->state_log = new_array;
4015     }
4016 
4017   /* Then reconstruct the buffers.  */
4018   if (pstr->icase)
4019     {
4020 #ifdef RE_ENABLE_I18N
4021       if (pstr->mb_cur_max > 1)
4022 	{
4023 	  ret = build_wcs_upper_buffer (pstr);
4024 	  if (__glibc_unlikely (ret != REG_NOERROR))
4025 	    return ret;
4026 	}
4027       else
4028 #endif /* RE_ENABLE_I18N  */
4029 	build_upper_buffer (pstr);
4030     }
4031   else
4032     {
4033 #ifdef RE_ENABLE_I18N
4034       if (pstr->mb_cur_max > 1)
4035 	build_wcs_buffer (pstr);
4036       else
4037 #endif /* RE_ENABLE_I18N  */
4038 	{
4039 	  if (pstr->trans != NULL)
4040 	    re_string_translate_buffer (pstr);
4041 	}
4042     }
4043   return REG_NOERROR;
4044 }
4045 
4046 
4047 /* Functions for matching context.  */
4048 
4049 /* Initialize MCTX.  */
4050 
4051 static reg_errcode_t
4052 __attribute_warn_unused_result__
match_ctx_init(re_match_context_t * mctx,int eflags,Idx n)4053 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4054 {
4055   mctx->eflags = eflags;
4056   mctx->match_last = -1;
4057   if (n > 0)
4058     {
4059       /* Avoid overflow.  */
4060       size_t max_object_size =
4061 	MAX (sizeof (struct re_backref_cache_entry),
4062 	     sizeof (re_sub_match_top_t *));
4063       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4064 	return REG_ESPACE;
4065 
4066       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4067       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4068       if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4069 	return REG_ESPACE;
4070     }
4071   /* Already zero-ed by the caller.
4072      else
4073        mctx->bkref_ents = NULL;
4074      mctx->nbkref_ents = 0;
4075      mctx->nsub_tops = 0;  */
4076   mctx->abkref_ents = n;
4077   mctx->max_mb_elem_len = 1;
4078   mctx->asub_tops = n;
4079   return REG_NOERROR;
4080 }
4081 
4082 /* Clean the entries which depend on the current input in MCTX.
4083    This function must be invoked when the matcher changes the start index
4084    of the input, or changes the input string.  */
4085 
4086 static void
match_ctx_clean(re_match_context_t * mctx)4087 match_ctx_clean (re_match_context_t *mctx)
4088 {
4089   Idx st_idx;
4090   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4091     {
4092       Idx sl_idx;
4093       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4094       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4095 	{
4096 	  re_sub_match_last_t *last = top->lasts[sl_idx];
4097 	  re_free (last->path.array);
4098 	  re_free (last);
4099 	}
4100       re_free (top->lasts);
4101       if (top->path)
4102 	{
4103 	  re_free (top->path->array);
4104 	  re_free (top->path);
4105 	}
4106       re_free (top);
4107     }
4108 
4109   mctx->nsub_tops = 0;
4110   mctx->nbkref_ents = 0;
4111 }
4112 
4113 /* Free all the memory associated with MCTX.  */
4114 
4115 static void
match_ctx_free(re_match_context_t * mctx)4116 match_ctx_free (re_match_context_t *mctx)
4117 {
4118   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4119   match_ctx_clean (mctx);
4120   re_free (mctx->sub_tops);
4121   re_free (mctx->bkref_ents);
4122 }
4123 
4124 /* Add a new backreference entry to MCTX.
4125    Note that we assume that caller never call this function with duplicate
4126    entry, and call with STR_IDX which isn't smaller than any existing entry.
4127 */
4128 
4129 static reg_errcode_t
4130 __attribute_warn_unused_result__
match_ctx_add_entry(re_match_context_t * mctx,Idx node,Idx str_idx,Idx from,Idx to)4131 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4132 		     Idx to)
4133 {
4134   if (mctx->nbkref_ents >= mctx->abkref_ents)
4135     {
4136       struct re_backref_cache_entry* new_entry;
4137       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4138 			      mctx->abkref_ents * 2);
4139       if (__glibc_unlikely (new_entry == NULL))
4140 	{
4141 	  re_free (mctx->bkref_ents);
4142 	  return REG_ESPACE;
4143 	}
4144       mctx->bkref_ents = new_entry;
4145       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4146 	      sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4147       mctx->abkref_ents *= 2;
4148     }
4149   if (mctx->nbkref_ents > 0
4150       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4151     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4152 
4153   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4154   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4155   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4156   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4157 
4158   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4159      If bit N is clear, means that this entry won't epsilon-transition to
4160      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4161      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4162      such node.
4163 
4164      A backreference does not epsilon-transition unless it is empty, so set
4165      to all zeros if FROM != TO.  */
4166   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4167     = (from == to ? -1 : 0);
4168 
4169   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4170   if (mctx->max_mb_elem_len < to - from)
4171     mctx->max_mb_elem_len = to - from;
4172   return REG_NOERROR;
4173 }
4174 
4175 /* Return the first entry with the same str_idx, or -1 if none is
4176    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4177 
4178 static Idx
search_cur_bkref_entry(const re_match_context_t * mctx,Idx str_idx)4179 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4180 {
4181   Idx left, right, mid, last;
4182   last = right = mctx->nbkref_ents;
4183   for (left = 0; left < right;)
4184     {
4185       mid = (left + right) / 2;
4186       if (mctx->bkref_ents[mid].str_idx < str_idx)
4187 	left = mid + 1;
4188       else
4189 	right = mid;
4190     }
4191   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4192     return left;
4193   else
4194     return -1;
4195 }
4196 
4197 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4198    at STR_IDX.  */
4199 
4200 static reg_errcode_t
4201 __attribute_warn_unused_result__
match_ctx_add_subtop(re_match_context_t * mctx,Idx node,Idx str_idx)4202 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4203 {
4204   DEBUG_ASSERT (mctx->sub_tops != NULL);
4205   DEBUG_ASSERT (mctx->asub_tops > 0);
4206   if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4207     {
4208       Idx new_asub_tops = mctx->asub_tops * 2;
4209       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4210 						   re_sub_match_top_t *,
4211 						   new_asub_tops);
4212       if (__glibc_unlikely (new_array == NULL))
4213 	return REG_ESPACE;
4214       mctx->sub_tops = new_array;
4215       mctx->asub_tops = new_asub_tops;
4216     }
4217   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4218   if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4219     return REG_ESPACE;
4220   mctx->sub_tops[mctx->nsub_tops]->node = node;
4221   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4222   return REG_NOERROR;
4223 }
4224 
4225 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4226    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4227    Return the new entry if successful, NULL if memory is exhausted.  */
4228 
4229 static re_sub_match_last_t *
match_ctx_add_sublast(re_sub_match_top_t * subtop,Idx node,Idx str_idx)4230 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4231 {
4232   re_sub_match_last_t *new_entry;
4233   if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4234     {
4235       Idx new_alasts = 2 * subtop->alasts + 1;
4236       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4237 						    re_sub_match_last_t *,
4238 						    new_alasts);
4239       if (__glibc_unlikely (new_array == NULL))
4240 	return NULL;
4241       subtop->lasts = new_array;
4242       subtop->alasts = new_alasts;
4243     }
4244   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4245   if (__glibc_likely (new_entry != NULL))
4246     {
4247       subtop->lasts[subtop->nlasts] = new_entry;
4248       new_entry->node = node;
4249       new_entry->str_idx = str_idx;
4250       ++subtop->nlasts;
4251     }
4252   return new_entry;
4253 }
4254 
4255 static void
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,Idx last_node,Idx last_str_idx)4256 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4257 	       re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4258 {
4259   sctx->sifted_states = sifted_sts;
4260   sctx->limited_states = limited_sts;
4261   sctx->last_node = last_node;
4262   sctx->last_str_idx = last_str_idx;
4263   re_node_set_init_empty (&sctx->limits);
4264 }
4265