1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static reg_errcode_t init_word_char (re_dfa_t *dfa);
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static reg_errcode_t analyze (re_dfa_t *dfa);
34 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
35 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
36 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
37 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
38 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
39 int top_clone_node, int root_node,
40 unsigned int constraint);
41 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
42 unsigned int constraint);
43 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
44 unsigned int constraint);
45 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
46 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
48 static void calc_inveclosure (re_dfa_t *dfa);
49 static int fetch_number (re_string_t *input, re_token_t *token,
51 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
52 static int peek_token (re_token_t *token, re_string_t *input,
54 static int peek_token_bracket (re_token_t *token, re_string_t *input,
56 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
57 reg_syntax_t syntax, reg_errcode_t *err);
58 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
59 re_token_t *token, reg_syntax_t syntax,
60 int nest, reg_errcode_t *err);
61 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
71 re_dfa_t *dfa, re_token_t *token,
72 reg_syntax_t syntax, reg_errcode_t *err);
73 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
74 re_token_t *token, reg_syntax_t syntax,
76 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
78 re_token_t *token, int token_len,
81 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
85 # ifdef RE_ENABLE_I18N
86 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
87 re_charset_t *mbcset, int *range_alloc,
88 bracket_elem_t *start_elem,
89 bracket_elem_t *end_elem);
90 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
93 const unsigned char *name);
94 # else /* not RE_ENABLE_I18N */
95 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
96 bracket_elem_t *start_elem,
97 bracket_elem_t *end_elem);
98 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
99 const unsigned char *name);
100 # endif /* not RE_ENABLE_I18N */
101 #endif /* not _LIBC */
102 #ifdef RE_ENABLE_I18N
103 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
104 re_charset_t *mbcset,
105 int *equiv_class_alloc,
106 const unsigned char *name);
107 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
108 re_charset_t *mbcset,
109 int *char_class_alloc,
110 const unsigned char *class_name,
111 reg_syntax_t syntax);
112 #else /* not RE_ENABLE_I18N */
113 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
114 const unsigned char *name);
115 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
116 const unsigned char *class_name,
117 reg_syntax_t syntax);
118 #endif /* not RE_ENABLE_I18N */
119 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
120 static void free_bin_tree (bin_tree_t *tree);
121 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
122 re_token_type_t type, int index);
123 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
125 /* This table gives an error message for each of the error codes listed
126 in regex.h. Obviously the order here has to be same as there.
127 POSIX doesn't require that we do anything for REG_NOERROR,
128 but why not be nice? */
130 const char __re_error_msgid[] attribute_hidden =
132 #define REG_NOERROR_IDX 0
133 gettext_noop ("Success") /* REG_NOERROR */
135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136 gettext_noop ("No match") /* REG_NOMATCH */
138 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
156 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166 gettext_noop ("Invalid range end") /* REG_ERANGE */
168 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169 gettext_noop ("Memory exhausted") /* REG_ESPACE */
171 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175 gettext_noop ("Premature end of regular expression") /* REG_EEND */
177 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178 gettext_noop ("Regular expression too big") /* REG_ESIZE */
180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 const size_t __re_error_msgid_idx[] attribute_hidden =
205 /* Entry points for GNU code. */
207 /* re_compile_pattern is the GNU regular expression compiler: it
208 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209 Returns 0 if the pattern was valid, otherwise an error string.
211 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212 are set in BUFP on entry. */
215 re_compile_pattern (pattern, length, bufp)
218 struct re_pattern_buffer *bufp;
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
237 weak_alias (__re_compile_pattern, re_compile_pattern)
240 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options;
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
256 re_set_syntax (syntax)
259 reg_syntax_t ret = re_syntax_options;
261 re_syntax_options = syntax;
265 weak_alias (__re_set_syntax, re_set_syntax)
269 re_compile_fastmap (bufp)
270 struct re_pattern_buffer *bufp;
272 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
273 char *fastmap = bufp->fastmap;
275 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
276 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
277 if (dfa->init_state != dfa->init_state_word)
278 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
279 if (dfa->init_state != dfa->init_state_nl)
280 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
281 if (dfa->init_state != dfa->init_state_begbuf)
282 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
283 bufp->fastmap_accurate = 1;
287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
291 re_set_fastmap (char *fastmap, int icase, int ch)
295 fastmap[tolower (ch)] = 1;
298 /* Helper function for re_compile_fastmap.
299 Compile fastmap for the initial_state INIT_STATE. */
302 re_compile_fastmap_iter (bufp, init_state, fastmap)
304 const re_dfastate_t *init_state;
307 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
309 int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
310 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
312 int node = init_state->nodes.elems[node_cnt];
313 re_token_type_t type = dfa->nodes[node].type;
315 if (type == CHARACTER)
316 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
317 else if (type == SIMPLE_BRACKET)
320 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
321 for (j = 0; j < UINT_BITS; ++j, ++ch)
322 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
323 re_set_fastmap (fastmap, icase, ch);
325 #ifdef RE_ENABLE_I18N
326 else if (type == COMPLEX_BRACKET)
329 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
330 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
331 || cset->nranges || cset->nchar_classes)
334 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
336 /* In this case we want to catch the bytes which are
337 the first byte of any collation elements.
338 e.g. In da_DK, we want to catch 'a' since "aa"
339 is a valid collation element, and don't catch
340 'b' since 'b' is the only collation element
341 which starts from 'b'. */
343 const int32_t *table = (const int32_t *)
344 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
345 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
346 for (j = 0; j < UINT_BITS; ++j, ++ch)
348 re_set_fastmap (fastmap, icase, ch);
352 for (i = 0; i < SBC_MAX; ++i)
353 if (__btowc (i) == WEOF)
354 re_set_fastmap (fastmap, icase, i);
355 # endif /* not _LIBC */
357 for (i = 0; i < cset->nmbchars; ++i)
361 memset (&state, '\0', sizeof (state));
362 __wcrtomb (buf, cset->mbchars[i], &state);
363 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
366 #endif /* RE_ENABLE_I18N */
367 else if (type == END_OF_RE || type == OP_PERIOD)
369 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
370 if (type == END_OF_RE)
371 bufp->can_be_null = 1;
377 /* Entry point for POSIX code. */
378 /* regcomp takes a regular expression as a string and compiles it.
380 PREG is a regex_t *. We do not expect any fields to be initialized,
381 since POSIX says we shouldn't. Thus, we set
383 `buffer' to the compiled pattern;
384 `used' to the length of the compiled pattern;
385 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
386 REG_EXTENDED bit in CFLAGS is set; otherwise, to
387 RE_SYNTAX_POSIX_BASIC;
388 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
389 `fastmap' to an allocated space for the fastmap;
390 `fastmap_accurate' to zero;
391 `re_nsub' to the number of subexpressions in PATTERN.
393 PATTERN is the address of the pattern string.
395 CFLAGS is a series of bits which affect compilation.
397 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
398 use POSIX basic syntax.
400 If REG_NEWLINE is set, then . and [^...] don't match newline.
401 Also, regexec will try a match beginning after every newline.
403 If REG_ICASE is set, then we considers upper- and lowercase
404 versions of letters to be equivalent when matching.
406 If REG_NOSUB is set, then when PREG is passed to regexec, that
407 routine will report only success or failure, and nothing about the
410 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
411 the return codes and their meanings.) */
414 regcomp (preg, pattern, cflags)
415 regex_t *__restrict preg;
416 const char *__restrict pattern;
420 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
421 : RE_SYNTAX_POSIX_BASIC);
427 /* Try to allocate space for the fastmap. */
428 preg->fastmap = re_malloc (char, SBC_MAX);
429 if (BE (preg->fastmap == NULL, 0))
432 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
434 /* If REG_NEWLINE is set, newlines are treated differently. */
435 if (cflags & REG_NEWLINE)
436 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
437 syntax &= ~RE_DOT_NEWLINE;
438 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
439 /* It also changes the matching behavior. */
440 preg->newline_anchor = 1;
443 preg->newline_anchor = 0;
444 preg->no_sub = !!(cflags & REG_NOSUB);
445 preg->translate = NULL;
447 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
449 /* POSIX doesn't distinguish between an unmatched open-group and an
450 unmatched close-group: both are REG_EPAREN. */
451 if (ret == REG_ERPAREN)
454 /* We have already checked preg->fastmap != NULL. */
455 if (BE (ret == REG_NOERROR, 1))
456 /* Compute the fastmap now, since regexec cannot modify the pattern
457 buffer. This function nevers fails in this implementation. */
458 (void) re_compile_fastmap (preg);
461 /* Some error occurred while compiling the expression. */
462 re_free (preg->fastmap);
463 preg->fastmap = NULL;
469 weak_alias (__regcomp, regcomp)
472 /* Returns a message corresponding to an error code, ERRCODE, returned
473 from either regcomp or regexec. We don't use PREG here. */
476 regerror (errcode, preg, errbuf, errbuf_size)
486 || errcode >= (int) (sizeof (__re_error_msgid_idx)
487 / sizeof (__re_error_msgid_idx[0])), 0))
488 /* Only error codes returned by the rest of the code should be passed
489 to this routine. If we are given anything else, or if other regex
490 code generates an invalid error code, then the program has a bug.
491 Dump core so we can fix it. */
494 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
496 msg_size = strlen (msg) + 1; /* Includes the null. */
498 if (BE (errbuf_size != 0, 1))
500 if (BE (msg_size > errbuf_size, 0))
502 #if defined HAVE_MEMPCPY || defined _LIBC
503 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
505 memcpy (errbuf, msg, errbuf_size - 1);
506 errbuf[errbuf_size - 1] = 0;
510 memcpy (errbuf, msg, msg_size);
516 weak_alias (__regerror, regerror)
521 free_dfa_content (re_dfa_t *dfa)
525 re_free (dfa->subexps);
527 for (i = 0; i < dfa->nodes_len; ++i)
529 re_token_t *node = dfa->nodes + i;
530 #ifdef RE_ENABLE_I18N
531 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
532 free_charset (node->opr.mbcset);
534 #endif /* RE_ENABLE_I18N */
535 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
536 re_free (node->opr.sbcset);
538 re_free (dfa->nexts);
539 for (i = 0; i < dfa->nodes_len; ++i)
541 if (dfa->eclosures != NULL)
542 re_node_set_free (dfa->eclosures + i);
543 if (dfa->inveclosures != NULL)
544 re_node_set_free (dfa->inveclosures + i);
545 if (dfa->edests != NULL)
546 re_node_set_free (dfa->edests + i);
548 re_free (dfa->edests);
549 re_free (dfa->eclosures);
550 re_free (dfa->inveclosures);
551 re_free (dfa->nodes);
553 for (i = 0; i <= dfa->state_hash_mask; ++i)
555 struct re_state_table_entry *entry = dfa->state_table + i;
556 for (j = 0; j < entry->num; ++j)
558 re_dfastate_t *state = entry->array[j];
561 re_free (entry->array);
563 re_free (dfa->state_table);
565 if (dfa->word_char != NULL)
566 re_free (dfa->word_char);
568 re_free (dfa->re_str);
575 /* Free dynamically allocated space used by PREG. */
581 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
582 if (BE (dfa != NULL, 1))
583 free_dfa_content (dfa);
585 re_free (preg->fastmap);
588 weak_alias (__regfree, regfree)
591 /* Entry points compatible with 4.2 BSD regex library. We don't define
592 them unless specifically requested. */
594 #if defined _REGEX_RE_COMP || defined _LIBC
596 /* BSD has one and only one pattern buffer. */
597 static struct re_pattern_buffer re_comp_buf;
601 /* Make these definitions weak in libc, so POSIX programs can redefine
602 these names if they don't use our functions, and still use
603 regcomp/regexec above without link errors. */
614 if (!re_comp_buf.buffer)
615 return gettext ("No previous regular expression");
619 if (re_comp_buf.buffer)
621 fastmap = re_comp_buf.fastmap;
622 re_comp_buf.fastmap = NULL;
623 __regfree (&re_comp_buf);
624 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
625 re_comp_buf.fastmap = fastmap;
628 if (re_comp_buf.fastmap == NULL)
630 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
631 if (re_comp_buf.fastmap == NULL)
632 return (char *) gettext (__re_error_msgid
633 + __re_error_msgid_idx[(int) REG_ESPACE]);
636 /* Since `re_exec' always passes NULL for the `regs' argument, we
637 don't need to initialize the pattern buffer fields which affect it. */
639 /* Match anchors at newlines. */
640 re_comp_buf.newline_anchor = 1;
642 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
647 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
648 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
652 libc_freeres_fn (free_mem)
654 __regfree (&re_comp_buf);
658 #endif /* _REGEX_RE_COMP */
660 /* Internal entry point.
661 Compile the regular expression PATTERN, whose length is LENGTH.
662 SYNTAX indicate regular expression's syntax. */
665 re_compile_internal (preg, pattern, length, syntax)
667 const char * pattern;
671 reg_errcode_t err = REG_NOERROR;
675 /* Initialize the pattern buffer. */
676 preg->fastmap_accurate = 0;
677 preg->syntax = syntax;
678 preg->not_bol = preg->not_eol = 0;
681 preg->can_be_null = 0;
682 preg->regs_allocated = REGS_UNALLOCATED;
684 /* Initialize the dfa. */
685 dfa = (re_dfa_t *) preg->buffer;
686 if (preg->allocated < sizeof (re_dfa_t))
688 /* If zero allocated, but buffer is non-null, try to realloc
689 enough space. This loses if buffer's address is bogus, but
690 that is the user's responsibility. If ->buffer is NULL this
691 is a simple allocation. */
692 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
695 preg->allocated = sizeof (re_dfa_t);
697 preg->buffer = (unsigned char *) dfa;
698 preg->used = sizeof (re_dfa_t);
700 err = init_dfa (dfa, length);
701 if (BE (err != REG_NOERROR, 0))
709 dfa->re_str = re_malloc (char, length + 1);
710 strncpy (dfa->re_str, pattern, length + 1);
713 err = re_string_construct (®exp, pattern, length, preg->translate,
715 if (BE (err != REG_NOERROR, 0))
723 /* Parse the regular expression, and build a structure tree. */
725 dfa->str_tree = parse (®exp, preg, syntax, &err);
726 if (BE (dfa->str_tree == NULL, 0))
727 goto re_compile_internal_free_return;
729 /* Analyze the tree and collect information which is necessary to
732 if (BE (err != REG_NOERROR, 0))
733 goto re_compile_internal_free_return;
735 /* Then create the initial state of the dfa. */
736 err = create_initial_state (dfa);
738 /* Release work areas. */
739 free_workarea_compile (preg);
740 re_string_destruct (®exp);
742 if (BE (err != REG_NOERROR, 0))
744 re_compile_internal_free_return:
745 free_dfa_content (dfa);
753 /* Initialize DFA. We use the length of the regular expression PAT_LEN
754 as the initial length of some arrays. */
757 init_dfa (dfa, pat_len)
763 memset (dfa, '\0', sizeof (re_dfa_t));
765 dfa->nodes_alloc = pat_len + 1;
766 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
768 dfa->states_alloc = pat_len + 1;
770 /* table_size = 2 ^ ceil(log pat_len) */
771 for (table_size = 1; table_size > 0; table_size <<= 1)
772 if (table_size > pat_len)
775 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
776 dfa->state_hash_mask = table_size - 1;
778 dfa->subexps_alloc = 1;
779 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
780 dfa->word_char = NULL;
782 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
783 || dfa->subexps == NULL, 0))
785 /* We don't bother to free anything which was allocated. Very
786 soon the process will go down anyway. */
788 dfa->state_table = NULL;
795 /* Initialize WORD_CHAR table, which indicate which character is
796 "word". In this case "word" means that it is the word construction
797 character used by some operators like "\<", "\>", etc. */
804 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
805 if (BE (dfa->word_char == NULL, 0))
807 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
808 for (j = 0; j < UINT_BITS; ++j, ++ch)
809 if (isalnum (ch) || ch == '_')
810 dfa->word_char[i] |= 1 << j;
814 /* Free the work area which are only used while compiling. */
817 free_workarea_compile (preg)
820 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
821 free_bin_tree (dfa->str_tree);
822 dfa->str_tree = NULL;
823 re_free (dfa->org_indices);
824 dfa->org_indices = NULL;
827 /* Create initial states for all contexts. */
830 create_initial_state (dfa)
835 re_node_set init_nodes;
837 /* Initial states have the epsilon closure of the node which is
838 the first node of the regular expression. */
839 first = dfa->str_tree->first;
840 dfa->init_node = first;
841 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
842 if (BE (err != REG_NOERROR, 0))
845 /* The back-references which are in initial states can epsilon transit,
846 since in this case all of the subexpressions can be null.
847 Then we add epsilon closures of the nodes which are the next nodes of
848 the back-references. */
849 if (dfa->nbackref > 0)
850 for (i = 0; i < init_nodes.nelem; ++i)
852 int node_idx = init_nodes.elems[i];
853 re_token_type_t type = dfa->nodes[node_idx].type;
856 if (type != OP_BACK_REF)
858 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
860 re_token_t *clexp_node;
861 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
862 if (clexp_node->type == OP_CLOSE_SUBEXP
863 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
866 if (clexp_idx == init_nodes.nelem)
869 if (type == OP_BACK_REF)
871 int dest_idx = dfa->edests[node_idx].elems[0];
872 if (!re_node_set_contains (&init_nodes, dest_idx))
874 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
880 /* It must be the first time to invoke acquire_state. */
881 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
882 /* We don't check ERR here, since the initial state must not be NULL. */
883 if (BE (dfa->init_state == NULL, 0))
885 if (dfa->init_state->has_constraint)
887 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
889 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
891 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
895 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
896 || dfa->init_state_begbuf == NULL, 0))
900 dfa->init_state_word = dfa->init_state_nl
901 = dfa->init_state_begbuf = dfa->init_state;
903 re_node_set_free (&init_nodes);
907 /* Analyze the structure tree, and calculate "first", "next", "edest",
908 "eclosure", and "inveclosure". */
917 /* Allocate arrays. */
918 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
919 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
920 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
921 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
922 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
923 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
924 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
926 /* Initialize them. */
927 for (i = 0; i < dfa->nodes_len; ++i)
930 re_node_set_init_empty (dfa->edests + i);
931 re_node_set_init_empty (dfa->eclosures + i);
932 re_node_set_init_empty (dfa->inveclosures + i);
935 ret = analyze_tree (dfa, dfa->str_tree);
936 if (BE (ret == REG_NOERROR, 1))
938 ret = calc_eclosure (dfa);
939 if (ret == REG_NOERROR)
940 calc_inveclosure (dfa);
945 /* Helper functions for analyze.
946 This function calculate "first", "next", and "edest" for the subtree
947 whose root is NODE. */
950 analyze_tree (dfa, node)
955 if (node->first == -1)
956 calc_first (dfa, node);
957 if (node->next == -1)
958 calc_next (dfa, node);
959 if (node->eclosure.nelem == 0)
960 calc_epsdest (dfa, node);
961 /* Calculate "first" etc. for the left child. */
962 if (node->left != NULL)
964 ret = analyze_tree (dfa, node->left);
965 if (BE (ret != REG_NOERROR, 0))
968 /* Calculate "first" etc. for the right child. */
969 if (node->right != NULL)
971 ret = analyze_tree (dfa, node->right);
972 if (BE (ret != REG_NOERROR, 0))
978 /* Calculate "first" for the node NODE. */
980 calc_first (dfa, node)
985 idx = node->node_idx;
986 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
991 case OP_OPEN_BRACKET:
992 case OP_CLOSE_BRACKET:
993 case OP_OPEN_DUP_NUM:
994 case OP_CLOSE_DUP_NUM:
995 case OP_NON_MATCH_LIST:
996 case OP_OPEN_COLL_ELEM:
997 case OP_CLOSE_COLL_ELEM:
998 case OP_OPEN_EQUIV_CLASS:
999 case OP_CLOSE_EQUIV_CLASS:
1000 case OP_OPEN_CHAR_CLASS:
1001 case OP_CLOSE_CHAR_CLASS:
1002 /* These must not be appeared here. */
1008 case OP_DUP_ASTERISK:
1009 case OP_DUP_QUESTION:
1010 #ifdef RE_ENABLE_I18N
1011 case COMPLEX_BRACKET:
1012 #endif /* RE_ENABLE_I18N */
1013 case SIMPLE_BRACKET:
1016 case OP_OPEN_SUBEXP:
1017 case OP_CLOSE_SUBEXP:
1022 assert (node->left != NULL);
1024 if (node->left->first == -1)
1025 calc_first (dfa, node->left);
1026 node->first = node->left->first;
1031 /* else fall through */
1034 assert (node->left != NULL);
1036 if (node->left->first == -1)
1037 calc_first (dfa, node->left);
1038 node->first = node->left->first;
1043 /* Calculate "next" for the node NODE. */
1046 calc_next (dfa, node)
1051 bin_tree_t *parent = node->parent;
1055 idx = node->node_idx;
1056 if (node->type == 0)
1057 dfa->nexts[idx] = node->next;
1061 idx = parent->node_idx;
1062 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1066 case OP_DUP_ASTERISK:
1071 if (parent->left == node)
1073 if (parent->right->first == -1)
1074 calc_first (dfa, parent->right);
1075 node->next = parent->right->first;
1078 /* else fall through */
1080 if (parent->next == -1)
1081 calc_next (dfa, parent);
1082 node->next = parent->next;
1085 idx = node->node_idx;
1086 if (node->type == 0)
1087 dfa->nexts[idx] = node->next;
1090 /* Calculate "edest" for the node NODE. */
1093 calc_epsdest (dfa, node)
1098 idx = node->node_idx;
1099 if (node->type == 0)
1101 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1102 || dfa->nodes[idx].type == OP_DUP_PLUS
1103 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1105 if (node->left->first == -1)
1106 calc_first (dfa, node->left);
1107 if (node->next == -1)
1108 calc_next (dfa, node);
1109 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1112 else if (dfa->nodes[idx].type == OP_ALT)
1115 if (node->left != NULL)
1117 if (node->left->first == -1)
1118 calc_first (dfa, node->left);
1119 left = node->left->first;
1123 if (node->next == -1)
1124 calc_next (dfa, node);
1127 if (node->right != NULL)
1129 if (node->right->first == -1)
1130 calc_first (dfa, node->right);
1131 right = node->right->first;
1135 if (node->next == -1)
1136 calc_next (dfa, node);
1139 re_node_set_init_2 (dfa->edests + idx, left, right);
1141 else if (dfa->nodes[idx].type == ANCHOR
1142 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1143 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1144 || dfa->nodes[idx].type == OP_BACK_REF)
1145 re_node_set_init_1 (dfa->edests + idx, node->next);
1149 /* Duplicate the epsilon closure of the node ROOT_NODE.
1150 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1151 to their own constraint. */
1153 static reg_errcode_t
1154 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1157 int top_org_node, top_clone_node, root_node;
1158 unsigned int init_constraint;
1161 int org_node, clone_node, ret;
1162 unsigned int constraint = init_constraint;
1163 for (org_node = top_org_node, clone_node = top_clone_node;;)
1165 int org_dest, clone_dest;
1166 if (dfa->nodes[org_node].type == OP_BACK_REF)
1168 /* If the back reference epsilon-transit, its destination must
1169 also have the constraint. Then duplicate the epsilon closure
1170 of the destination of the back reference, and store it in
1171 edests of the back reference. */
1172 org_dest = dfa->nexts[org_node];
1173 re_node_set_empty (dfa->edests + clone_node);
1174 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1175 if (BE (err != REG_NOERROR, 0))
1177 dfa->nexts[clone_node] = dfa->nexts[org_node];
1178 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1179 if (BE (ret < 0, 0))
1182 else if (dfa->edests[org_node].nelem == 0)
1184 /* In case of the node can't epsilon-transit, don't duplicate the
1185 destination and store the original destination as the
1186 destination of the node. */
1187 dfa->nexts[clone_node] = dfa->nexts[org_node];
1190 else if (dfa->edests[org_node].nelem == 1)
1192 /* In case of the node can epsilon-transit, and it has only one
1194 org_dest = dfa->edests[org_node].elems[0];
1195 re_node_set_empty (dfa->edests + clone_node);
1196 if (dfa->nodes[org_node].type == ANCHOR)
1198 /* In case of the node has another constraint, append it. */
1199 if (org_node == root_node && clone_node != org_node)
1201 /* ...but if the node is root_node itself, it means the
1202 epsilon closure have a loop, then tie it to the
1203 destination of the root_node. */
1204 ret = re_node_set_insert (dfa->edests + clone_node,
1206 if (BE (ret < 0, 0))
1210 constraint |= dfa->nodes[org_node].opr.ctx_type;
1212 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1213 if (BE (err != REG_NOERROR, 0))
1215 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1216 if (BE (ret < 0, 0))
1219 else /* dfa->edests[org_node].nelem == 2 */
1221 /* In case of the node can epsilon-transit, and it has two
1222 destinations. E.g. '|', '*', '+', '?'. */
1223 org_dest = dfa->edests[org_node].elems[0];
1224 re_node_set_empty (dfa->edests + clone_node);
1225 /* Search for a duplicated node which satisfies the constraint. */
1226 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1227 if (clone_dest == -1)
1229 /* There are no such a duplicated node, create a new one. */
1230 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1231 if (BE (err != REG_NOERROR, 0))
1233 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1234 if (BE (ret < 0, 0))
1236 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1237 root_node, constraint);
1238 if (BE (err != REG_NOERROR, 0))
1243 /* There are a duplicated node which satisfy the constraint,
1244 use it to avoid infinite loop. */
1245 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1246 if (BE (ret < 0, 0))
1250 org_dest = dfa->edests[org_node].elems[1];
1251 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1252 if (BE (err != REG_NOERROR, 0))
1254 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1255 if (BE (ret < 0, 0))
1258 org_node = org_dest;
1259 clone_node = clone_dest;
1264 /* Search for a node which is duplicated from the node ORG_NODE, and
1265 satisfies the constraint CONSTRAINT. */
1268 search_duplicated_node (dfa, org_node, constraint)
1271 unsigned int constraint;
1274 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1276 if (org_node == dfa->org_indices[idx]
1277 && constraint == dfa->nodes[idx].constraint)
1278 return idx; /* Found. */
1280 return -1; /* Not found. */
1283 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1284 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1285 otherwise return the error code. */
1287 static reg_errcode_t
1288 duplicate_node (new_idx, dfa, org_idx, constraint)
1290 int *new_idx, org_idx;
1291 unsigned int constraint;
1296 dup = dfa->nodes[org_idx];
1297 dup_idx = re_dfa_add_node (dfa, dup, 1);
1298 if (BE (dup_idx == -1, 0))
1300 dfa->nodes[dup_idx].constraint = constraint;
1301 if (dfa->nodes[org_idx].type == ANCHOR)
1302 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1303 dfa->nodes[dup_idx].duplicated = 1;
1304 re_node_set_init_empty (dfa->edests + dup_idx);
1305 re_node_set_init_empty (dfa->eclosures + dup_idx);
1306 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1308 /* Store the index of the original node. */
1309 dfa->org_indices[dup_idx] = org_idx;
1315 calc_inveclosure (dfa)
1319 for (src = 0; src < dfa->nodes_len; ++src)
1321 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1323 dest = dfa->eclosures[src].elems[idx];
1324 re_node_set_insert (dfa->inveclosures + dest, src);
1329 /* Calculate "eclosure" for all the node in DFA. */
1331 static reg_errcode_t
1335 int node_idx, incomplete;
1337 assert (dfa->nodes_len > 0);
1340 /* For each nodes, calculate epsilon closure. */
1341 for (node_idx = 0; ; ++node_idx)
1344 re_node_set eclosure_elem;
1345 if (node_idx == dfa->nodes_len)
1354 assert (dfa->eclosures[node_idx].nelem != -1);
1356 /* If we have already calculated, skip it. */
1357 if (dfa->eclosures[node_idx].nelem != 0)
1359 /* Calculate epsilon closure of `node_idx'. */
1360 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1361 if (BE (err != REG_NOERROR, 0))
1364 if (dfa->eclosures[node_idx].nelem == 0)
1367 re_node_set_free (&eclosure_elem);
1373 /* Calculate epsilon closure of NODE. */
1375 static reg_errcode_t
1376 calc_eclosure_iter (new_set, dfa, node, root)
1377 re_node_set *new_set;
1382 unsigned int constraint;
1384 re_node_set eclosure;
1386 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1387 if (BE (err != REG_NOERROR, 0))
1390 /* This indicates that we are calculating this node now.
1391 We reference this value to avoid infinite loop. */
1392 dfa->eclosures[node].nelem = -1;
1394 constraint = ((dfa->nodes[node].type == ANCHOR)
1395 ? dfa->nodes[node].opr.ctx_type : 0);
1396 /* If the current node has constraints, duplicate all nodes.
1397 Since they must inherit the constraints. */
1398 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1400 int org_node, cur_node;
1401 org_node = cur_node = node;
1402 err = duplicate_node_closure (dfa, node, node, node, constraint);
1403 if (BE (err != REG_NOERROR, 0))
1407 /* Expand each epsilon destination nodes. */
1408 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1409 for (i = 0; i < dfa->edests[node].nelem; ++i)
1411 re_node_set eclosure_elem;
1412 int edest = dfa->edests[node].elems[i];
1413 /* If calculating the epsilon closure of `edest' is in progress,
1414 return intermediate result. */
1415 if (dfa->eclosures[edest].nelem == -1)
1420 /* If we haven't calculated the epsilon closure of `edest' yet,
1421 calculate now. Otherwise use calculated epsilon closure. */
1422 if (dfa->eclosures[edest].nelem == 0)
1424 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1425 if (BE (err != REG_NOERROR, 0))
1429 eclosure_elem = dfa->eclosures[edest];
1430 /* Merge the epsilon closure of `edest'. */
1431 re_node_set_merge (&eclosure, &eclosure_elem);
1432 /* If the epsilon closure of `edest' is incomplete,
1433 the epsilon closure of this node is also incomplete. */
1434 if (dfa->eclosures[edest].nelem == 0)
1437 re_node_set_free (&eclosure_elem);
1441 /* Epsilon closures include itself. */
1442 re_node_set_insert (&eclosure, node);
1443 if (incomplete && !root)
1444 dfa->eclosures[node].nelem = 0;
1446 dfa->eclosures[node] = eclosure;
1447 *new_set = eclosure;
1451 /* Functions for token which are used in the parser. */
1453 /* Fetch a token from INPUT.
1454 We must not use this function inside bracket expressions. */
1457 fetch_token (input, syntax)
1459 reg_syntax_t syntax;
1463 consumed_byte = peek_token (&token, input, syntax);
1464 re_string_skip_bytes (input, consumed_byte);
1468 /* Peek a token from INPUT, and return the length of the token.
1469 We must not use this function inside bracket expressions. */
1472 peek_token (token, input, syntax)
1475 reg_syntax_t syntax;
1479 if (re_string_eoi (input))
1481 token->type = END_OF_RE;
1485 c = re_string_peek_byte (input, 0);
1488 #ifdef RE_ENABLE_I18N
1489 token->mb_partial = 0;
1490 if (MB_CUR_MAX > 1 &&
1491 !re_string_first_byte (input, re_string_cur_idx (input)))
1493 token->type = CHARACTER;
1494 token->mb_partial = 1;
1501 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1503 token->type = BACK_SLASH;
1507 c2 = re_string_peek_byte_case (input, 1);
1509 token->type = CHARACTER;
1513 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1514 token->type = OP_ALT;
1516 case '1': case '2': case '3': case '4': case '5':
1517 case '6': case '7': case '8': case '9':
1518 if (!(syntax & RE_NO_BK_REFS))
1520 token->type = OP_BACK_REF;
1521 token->opr.idx = c2 - '0';
1525 if (!(syntax & RE_NO_GNU_OPS))
1527 token->type = ANCHOR;
1528 token->opr.idx = WORD_FIRST;
1532 if (!(syntax & RE_NO_GNU_OPS))
1534 token->type = ANCHOR;
1535 token->opr.idx = WORD_LAST;
1539 if (!(syntax & RE_NO_GNU_OPS))
1541 token->type = ANCHOR;
1542 token->opr.idx = WORD_DELIM;
1546 if (!(syntax & RE_NO_GNU_OPS))
1548 token->type = ANCHOR;
1549 token->opr.idx = INSIDE_WORD;
1553 if (!(syntax & RE_NO_GNU_OPS))
1554 token->type = OP_WORD;
1557 if (!(syntax & RE_NO_GNU_OPS))
1558 token->type = OP_NOTWORD;
1561 if (!(syntax & RE_NO_GNU_OPS))
1563 token->type = ANCHOR;
1564 token->opr.idx = BUF_FIRST;
1568 if (!(syntax & RE_NO_GNU_OPS))
1570 token->type = ANCHOR;
1571 token->opr.idx = BUF_LAST;
1575 if (!(syntax & RE_NO_BK_PARENS))
1576 token->type = OP_OPEN_SUBEXP;
1579 if (!(syntax & RE_NO_BK_PARENS))
1580 token->type = OP_CLOSE_SUBEXP;
1583 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1584 token->type = OP_DUP_PLUS;
1587 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1588 token->type = OP_DUP_QUESTION;
1591 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1592 token->type = OP_OPEN_DUP_NUM;
1595 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1596 token->type = OP_CLOSE_DUP_NUM;
1604 token->type = CHARACTER;
1608 if (syntax & RE_NEWLINE_ALT)
1609 token->type = OP_ALT;
1612 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1613 token->type = OP_ALT;
1616 token->type = OP_DUP_ASTERISK;
1619 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1620 token->type = OP_DUP_PLUS;
1623 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1624 token->type = OP_DUP_QUESTION;
1627 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1628 token->type = OP_OPEN_DUP_NUM;
1631 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1632 token->type = OP_CLOSE_DUP_NUM;
1635 if (syntax & RE_NO_BK_PARENS)
1636 token->type = OP_OPEN_SUBEXP;
1639 if (syntax & RE_NO_BK_PARENS)
1640 token->type = OP_CLOSE_SUBEXP;
1643 token->type = OP_OPEN_BRACKET;
1646 token->type = OP_PERIOD;
1649 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1650 re_string_cur_idx (input) != 0)
1652 char prev = re_string_peek_byte (input, -1);
1653 if (prev != '|' && prev != '(' &&
1654 (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1657 token->type = ANCHOR;
1658 token->opr.idx = LINE_FIRST;
1661 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1662 re_string_cur_idx (input) + 1 != re_string_length (input))
1665 re_string_skip_bytes (input, 1);
1666 peek_token (&next, input, syntax);
1667 re_string_skip_bytes (input, -1);
1668 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1671 token->type = ANCHOR;
1672 token->opr.idx = LINE_LAST;
1680 /* Peek a token from INPUT, and return the length of the token.
1681 We must not use this function out of bracket expressions. */
1684 peek_token_bracket (token, input, syntax)
1687 reg_syntax_t syntax;
1690 if (re_string_eoi (input))
1692 token->type = END_OF_RE;
1695 c = re_string_peek_byte (input, 0);
1698 #ifdef RE_ENABLE_I18N
1699 if (MB_CUR_MAX > 1 &&
1700 !re_string_first_byte (input, re_string_cur_idx (input)))
1702 token->type = CHARACTER;
1705 #endif /* RE_ENABLE_I18N */
1707 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1709 /* In this case, '\' escape a character. */
1711 re_string_skip_bytes (input, 1);
1712 c2 = re_string_peek_byte (input, 0);
1714 token->type = CHARACTER;
1717 if (c == '[') /* '[' is a special char in a bracket exps. */
1721 c2 = re_string_peek_byte (input, 1);
1727 token->type = OP_OPEN_COLL_ELEM;
1730 token->type = OP_OPEN_EQUIV_CLASS;
1733 if (syntax & RE_CHAR_CLASSES)
1735 token->type = OP_OPEN_CHAR_CLASS;
1738 /* else fall through. */
1740 token->type = CHARACTER;
1750 token->type = OP_CHARSET_RANGE;
1753 token->type = OP_CLOSE_BRACKET;
1756 token->type = OP_NON_MATCH_LIST;
1759 token->type = CHARACTER;
1764 /* Functions for parser. */
1766 /* Entry point of the parser.
1767 Parse the regular expression REGEXP and return the structure tree.
1768 If an error is occured, ERR is set by error code, and return NULL.
1769 This function build the following tree, from regular expression <reg_exp>:
1775 CAT means concatenation.
1776 EOR means end of regular expression. */
1779 parse (regexp, preg, syntax, err)
1780 re_string_t *regexp;
1782 reg_syntax_t syntax;
1785 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1786 bin_tree_t *tree, *eor, *root;
1787 re_token_t current_token;
1789 current_token = fetch_token (regexp, syntax);
1790 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
1791 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1793 new_idx = re_dfa_add_node (dfa, current_token, 0);
1794 eor = create_tree (NULL, NULL, 0, new_idx);
1796 root = create_tree (tree, eor, CONCAT, 0);
1799 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1807 /* This function build the following tree, from regular expression
1808 <branch1>|<branch2>:
1814 ALT means alternative, which represents the operator `|'. */
1817 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1818 re_string_t *regexp;
1821 reg_syntax_t syntax;
1825 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1826 bin_tree_t *tree, *branch = NULL;
1828 tree = parse_branch (regexp, preg, token, syntax, nest, err);
1829 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1832 while (token->type == OP_ALT)
1834 re_token_t alt_token = *token;
1835 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1836 *token = fetch_token (regexp, syntax);
1837 if (token->type != OP_ALT && token->type != END_OF_RE
1838 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1840 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1841 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1843 free_bin_tree (tree);
1849 tree = create_tree (tree, branch, 0, new_idx);
1850 if (BE (new_idx == -1 || tree == NULL, 0))
1855 dfa->has_plural_match = 1;
1860 /* This function build the following tree, from regular expression
1867 CAT means concatenation. */
1870 parse_branch (regexp, preg, token, syntax, nest, err)
1871 re_string_t *regexp;
1874 reg_syntax_t syntax;
1878 bin_tree_t *tree, *exp;
1879 tree = parse_expression (regexp, preg, token, syntax, nest, err);
1880 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1883 while (token->type != OP_ALT && token->type != END_OF_RE
1884 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1886 exp = parse_expression (regexp, preg, token, syntax, nest, err);
1887 if (BE (*err != REG_NOERROR && exp == NULL, 0))
1889 free_bin_tree (tree);
1892 if (tree != NULL && exp != NULL)
1894 tree = create_tree (tree, exp, CONCAT, 0);
1901 else if (tree == NULL)
1903 /* Otherwise exp == NULL, we don't need to create new tree. */
1908 /* This function build the following tree, from regular expression a*:
1915 parse_expression (regexp, preg, token, syntax, nest, err)
1916 re_string_t *regexp;
1919 reg_syntax_t syntax;
1923 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1926 switch (token->type)
1929 new_idx = re_dfa_add_node (dfa, *token, 0);
1930 tree = create_tree (NULL, NULL, 0, new_idx);
1931 if (BE (new_idx == -1 || tree == NULL, 0))
1936 #ifdef RE_ENABLE_I18N
1939 while (!re_string_eoi (regexp)
1940 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1942 bin_tree_t *mbc_remain;
1943 *token = fetch_token (regexp, syntax);
1944 new_idx = re_dfa_add_node (dfa, *token, 0);
1945 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1946 tree = create_tree (tree, mbc_remain, CONCAT, 0);
1947 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1956 case OP_OPEN_SUBEXP:
1957 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1958 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1961 case OP_OPEN_BRACKET:
1962 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1963 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1967 if (BE (preg->re_nsub < token->opr.idx
1968 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1973 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
1974 new_idx = re_dfa_add_node (dfa, *token, 0);
1975 tree = create_tree (NULL, NULL, 0, new_idx);
1976 if (BE (new_idx == -1 || tree == NULL, 0))
1982 dfa->has_mb_node = 1;
1984 case OP_DUP_ASTERISK:
1986 case OP_DUP_QUESTION:
1987 case OP_OPEN_DUP_NUM:
1988 if (syntax & RE_CONTEXT_INVALID_OPS)
1993 else if (syntax & RE_CONTEXT_INDEP_OPS)
1995 *token = fetch_token (regexp, syntax);
1996 return parse_expression (regexp, preg, token, syntax, nest, err);
1998 /* else fall through */
1999 case OP_CLOSE_SUBEXP:
2000 if ((token->type == OP_CLOSE_SUBEXP) &&
2001 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2006 /* else fall through */
2007 case OP_CLOSE_DUP_NUM:
2008 /* We treat it as a normal character. */
2010 /* Then we can these characters as normal characters. */
2011 token->type = CHARACTER;
2012 new_idx = re_dfa_add_node (dfa, *token, 0);
2013 tree = create_tree (NULL, NULL, 0, new_idx);
2014 if (BE (new_idx == -1 || tree == NULL, 0))
2021 if (dfa->word_char == NULL)
2023 *err = init_word_char (dfa);
2024 if (BE (*err != REG_NOERROR, 0))
2027 if (token->opr.ctx_type == WORD_DELIM)
2029 bin_tree_t *tree_first, *tree_last;
2030 int idx_first, idx_last;
2031 token->opr.ctx_type = WORD_FIRST;
2032 idx_first = re_dfa_add_node (dfa, *token, 0);
2033 tree_first = create_tree (NULL, NULL, 0, idx_first);
2034 token->opr.ctx_type = WORD_LAST;
2035 idx_last = re_dfa_add_node (dfa, *token, 0);
2036 tree_last = create_tree (NULL, NULL, 0, idx_last);
2037 token->type = OP_ALT;
2038 new_idx = re_dfa_add_node (dfa, *token, 0);
2039 tree = create_tree (tree_first, tree_last, 0, new_idx);
2040 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2041 || tree_first == NULL || tree_last == NULL
2042 || tree == NULL, 0))
2050 new_idx = re_dfa_add_node (dfa, *token, 0);
2051 tree = create_tree (NULL, NULL, 0, new_idx);
2052 if (BE (new_idx == -1 || tree == NULL, 0))
2058 /* We must return here, since ANCHORs can't be followed
2059 by repetition operators.
2060 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2061 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2062 *token = fetch_token (regexp, syntax);
2065 new_idx = re_dfa_add_node (dfa, *token, 0);
2066 tree = create_tree (NULL, NULL, 0, new_idx);
2067 if (BE (new_idx == -1 || tree == NULL, 0))
2073 dfa->has_mb_node = 1;
2076 tree = build_word_op (dfa, 0, err);
2077 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2081 tree = build_word_op (dfa, 1, err);
2082 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2092 /* Must not happen? */
2098 *token = fetch_token (regexp, syntax);
2100 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2101 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2103 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2104 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2106 dfa->has_plural_match = 1;
2112 /* This function build the following tree, from regular expression
2120 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2121 re_string_t *regexp;
2124 reg_syntax_t syntax;
2128 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2129 bin_tree_t *tree, *left_par, *right_par;
2132 cur_nsub = preg->re_nsub++;
2133 if (dfa->subexps_alloc < preg->re_nsub)
2135 re_subexp_t *new_array;
2136 dfa->subexps_alloc *= 2;
2137 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2138 if (BE (new_array == NULL, 0))
2140 dfa->subexps_alloc /= 2;
2144 dfa->subexps = new_array;
2146 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2147 dfa->subexps[cur_nsub].end = -1;
2149 new_idx = re_dfa_add_node (dfa, *token, 0);
2150 left_par = create_tree (NULL, NULL, 0, new_idx);
2151 if (BE (new_idx == -1 || left_par == NULL, 0))
2156 dfa->nodes[new_idx].opr.idx = cur_nsub;
2157 *token = fetch_token (regexp, syntax);
2159 /* The subexpression may be a null string. */
2160 if (token->type == OP_CLOSE_SUBEXP)
2164 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2165 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2168 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2170 free_bin_tree (tree);
2174 new_idx = re_dfa_add_node (dfa, *token, 0);
2175 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2176 right_par = create_tree (NULL, NULL, 0, new_idx);
2177 tree = ((tree == NULL) ? right_par
2178 : create_tree (tree, right_par, CONCAT, 0));
2179 tree = create_tree (left_par, tree, CONCAT, 0);
2180 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2185 dfa->nodes[new_idx].opr.idx = cur_nsub;
2190 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2193 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2194 bin_tree_t *dup_elem;
2195 re_string_t *regexp;
2198 reg_syntax_t syntax;
2201 re_token_t dup_token;
2202 bin_tree_t *tree = dup_elem, *work_tree;
2203 int new_idx, start_idx = re_string_cur_idx (regexp);
2204 re_token_t start_token = *token;
2205 if (token->type == OP_OPEN_DUP_NUM)
2209 int start = fetch_number (regexp, token, syntax);
2213 if (token->type == CHARACTER && token->opr.c == ',')
2214 start = 0; /* We treat "{,m}" as "{0,m}". */
2217 *err = REG_BADBR; /* <re>{} is invalid. */
2221 if (BE (start != -2, 1))
2223 /* We treat "{n}" as "{n,n}". */
2224 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2225 : ((token->type == CHARACTER && token->opr.c == ',')
2226 ? fetch_number (regexp, token, syntax) : -2));
2228 if (BE (start == -2 || end == -2, 0))
2230 /* Invalid sequence. */
2231 if (token->type == OP_CLOSE_DUP_NUM)
2232 goto parse_dup_op_invalid_interval;
2234 goto parse_dup_op_ebrace;
2236 if (BE (start == 0 && end == 0, 0))
2238 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2239 *token = fetch_token (regexp, syntax);
2240 free_bin_tree (dup_elem);
2244 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2246 for (i = 0; i < start; ++i)
2249 work_tree = duplicate_tree (elem, dfa);
2250 tree = create_tree (tree, work_tree, CONCAT, 0);
2251 if (BE (work_tree == NULL || tree == NULL, 0))
2252 goto parse_dup_op_espace;
2257 /* We treat "<re>{0,}" as "<re>*". */
2258 dup_token.type = OP_DUP_ASTERISK;
2261 elem = duplicate_tree (elem, dfa);
2262 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2263 work_tree = create_tree (elem, NULL, 0, new_idx);
2264 tree = create_tree (tree, work_tree, CONCAT, 0);
2265 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2266 || tree == NULL, 0))
2267 goto parse_dup_op_espace;
2271 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2272 tree = create_tree (elem, NULL, 0, new_idx);
2273 if (BE (new_idx == -1 || tree == NULL, 0))
2274 goto parse_dup_op_espace;
2277 else if (end - start > 0)
2279 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2280 dup_token.type = OP_DUP_QUESTION;
2283 elem = duplicate_tree (elem, dfa);
2284 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2285 elem = create_tree (elem, NULL, 0, new_idx);
2286 tree = create_tree (tree, elem, CONCAT, 0);
2287 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2288 goto parse_dup_op_espace;
2292 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2293 tree = elem = create_tree (elem, NULL, 0, new_idx);
2294 if (BE (new_idx == -1 || tree == NULL, 0))
2295 goto parse_dup_op_espace;
2297 for (i = 1; i < end - start; ++i)
2299 work_tree = duplicate_tree (elem, dfa);
2300 tree = create_tree (tree, work_tree, CONCAT, 0);
2301 if (BE (work_tree == NULL || tree == NULL, 0))
2311 new_idx = re_dfa_add_node (dfa, *token, 0);
2312 tree = create_tree (tree, NULL, 0, new_idx);
2313 if (BE (new_idx == -1 || tree == NULL, 0))
2319 *token = fetch_token (regexp, syntax);
2322 parse_dup_op_espace:
2323 free_bin_tree (tree);
2327 parse_dup_op_ebrace:
2328 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2333 goto parse_dup_op_rollback;
2334 parse_dup_op_invalid_interval:
2335 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2340 parse_dup_op_rollback:
2341 re_string_set_index (regexp, start_idx);
2342 *token = start_token;
2343 token->type = CHARACTER;
2347 /* Size of the names for collating symbol/equivalence_class/character_class.
2348 I'm not sure, but maybe enough. */
2349 #define BRACKET_NAME_BUF_SIZE 32
2352 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2353 Build the range expression which starts from START_ELEM, and ends
2354 at END_ELEM. The result are written to MBCSET and SBCSET.
2355 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2356 mbcset->range_ends, is a pointer argument sinse we may
2359 static reg_errcode_t
2360 # ifdef RE_ENABLE_I18N
2361 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2362 re_charset_t *mbcset;
2364 # else /* not RE_ENABLE_I18N */
2365 build_range_exp (sbcset, start_elem, end_elem)
2366 # endif /* not RE_ENABLE_I18N */
2367 re_bitset_ptr_t sbcset;
2368 bracket_elem_t *start_elem, *end_elem;
2370 unsigned int start_ch, end_ch;
2371 /* Equivalence Classes and Character Classes can't be a range start/end. */
2372 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2373 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2377 /* We can handle no multi character collating elements without libc
2379 if (BE ((start_elem->type == COLL_SYM
2380 && strlen ((char *) start_elem->opr.name) > 1)
2381 || (end_elem->type == COLL_SYM
2382 && strlen ((char *) end_elem->opr.name) > 1), 0))
2383 return REG_ECOLLATE;
2385 # ifdef RE_ENABLE_I18N
2387 wchar_t wc, start_wc, end_wc;
2388 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2390 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2391 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2393 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2394 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2396 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2397 ? __btowc (start_ch) : start_elem->opr.wch);
2398 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2399 ? __btowc (end_ch) : end_elem->opr.wch);
2400 cmp_buf[0] = start_wc;
2401 cmp_buf[4] = end_wc;
2402 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2405 /* Check the space of the arrays. */
2406 if (*range_alloc == mbcset->nranges)
2408 /* There are not enough space, need realloc. */
2409 wchar_t *new_array_start, *new_array_end;
2412 /* +1 in case of mbcset->nranges is 0. */
2413 new_nranges = 2 * mbcset->nranges + 1;
2414 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2415 are NULL if *range_alloc == 0. */
2416 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2418 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2421 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2424 mbcset->range_starts = new_array_start;
2425 mbcset->range_ends = new_array_end;
2426 *range_alloc = new_nranges;
2429 mbcset->range_starts[mbcset->nranges] = start_wc;
2430 mbcset->range_ends[mbcset->nranges++] = end_wc;
2432 /* Build the table for single byte characters. */
2433 for (wc = 0; wc <= SBC_MAX; ++wc)
2436 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2437 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2438 bitset_set (sbcset, wc);
2441 # else /* not RE_ENABLE_I18N */
2444 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2445 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2447 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2448 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2450 if (start_ch > end_ch)
2452 /* Build the table for single byte characters. */
2453 for (ch = 0; ch <= SBC_MAX; ++ch)
2454 if (start_ch <= ch && ch <= end_ch)
2455 bitset_set (sbcset, ch);
2457 # endif /* not RE_ENABLE_I18N */
2460 #endif /* not _LIBC */
2463 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2464 Build the collating element which is represented by NAME.
2465 The result are written to MBCSET and SBCSET.
2466 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2467 pointer argument since we may update it. */
2469 static reg_errcode_t
2470 # ifdef RE_ENABLE_I18N
2471 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2472 re_charset_t *mbcset;
2473 int *coll_sym_alloc;
2474 # else /* not RE_ENABLE_I18N */
2475 build_collating_symbol (sbcset, name)
2476 # endif /* not RE_ENABLE_I18N */
2477 re_bitset_ptr_t sbcset;
2478 const unsigned char *name;
2480 size_t name_len = strlen ((const char *) name);
2481 if (BE (name_len != 1, 0))
2482 return REG_ECOLLATE;
2485 bitset_set (sbcset, name[0]);
2489 #endif /* not _LIBC */
2491 /* This function parse bracket expression like "[abc]", "[a-c]",
2495 parse_bracket_exp (regexp, dfa, token, syntax, err)
2496 re_string_t *regexp;
2499 reg_syntax_t syntax;
2503 const unsigned char *collseqmb;
2504 const char *collseqwc;
2507 const int32_t *symb_table;
2508 const unsigned char *extra;
2510 /* Local function for parse_bracket_exp used in _LIBC environement.
2511 Seek the collating symbol entry correspondings to NAME.
2512 Return the index of the symbol in the SYMB_TABLE. */
2514 static inline int32_t
2515 seek_collating_symbol_entry (name, name_len)
2516 const unsigned char *name;
2519 int32_t hash = elem_hash ((const char *) name, name_len);
2520 int32_t elem = hash % table_size;
2521 int32_t second = hash % (table_size - 2);
2522 while (symb_table[2 * elem] != 0)
2524 /* First compare the hashing value. */
2525 if (symb_table[2 * elem] == hash
2526 /* Compare the length of the name. */
2527 && name_len == extra[symb_table[2 * elem + 1]]
2528 /* Compare the name. */
2529 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2532 /* Yep, this is the entry. */
2542 /* Local function for parse_bracket_exp used in _LIBC environement.
2543 Look up the collation sequence value of BR_ELEM.
2544 Return the value if succeeded, UINT_MAX otherwise. */
2546 static inline unsigned int
2547 lookup_collation_sequence_value (br_elem)
2548 bracket_elem_t *br_elem;
2550 if (br_elem->type == SB_CHAR)
2553 if (MB_CUR_MAX == 1)
2556 return collseqmb[br_elem->opr.ch];
2559 wint_t wc = __btowc (br_elem->opr.ch);
2560 return collseq_table_lookup (collseqwc, wc);
2563 else if (br_elem->type == MB_CHAR)
2565 return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2567 else if (br_elem->type == COLL_SYM)
2569 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2573 elem = seek_collating_symbol_entry (br_elem->opr.name,
2575 if (symb_table[2 * elem] != 0)
2577 /* We found the entry. */
2578 idx = symb_table[2 * elem + 1];
2579 /* Skip the name of collating element name. */
2580 idx += 1 + extra[idx];
2581 /* Skip the byte sequence of the collating element. */
2582 idx += 1 + extra[idx];
2583 /* Adjust for the alignment. */
2584 idx = (idx + 3) & ~3;
2585 /* Skip the multibyte collation sequence value. */
2586 idx += sizeof (unsigned int);
2587 /* Skip the wide char sequence of the collating element. */
2588 idx += sizeof (unsigned int) *
2589 (1 + *(unsigned int *) (extra + idx));
2590 /* Return the collation sequence value. */
2591 return *(unsigned int *) (extra + idx);
2593 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2595 /* No valid character. Match it as a single byte
2597 return collseqmb[br_elem->opr.name[0]];
2600 else if (sym_name_len == 1)
2601 return collseqmb[br_elem->opr.name[0]];
2606 /* Local function for parse_bracket_exp used in _LIBC environement.
2607 Build the range expression which starts from START_ELEM, and ends
2608 at END_ELEM. The result are written to MBCSET and SBCSET.
2609 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2610 mbcset->range_ends, is a pointer argument sinse we may
2613 static inline reg_errcode_t
2614 # ifdef RE_ENABLE_I18N
2615 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2616 re_charset_t *mbcset;
2618 # else /* not RE_ENABLE_I18N */
2619 build_range_exp (sbcset, start_elem, end_elem)
2620 # endif /* not RE_ENABLE_I18N */
2621 re_bitset_ptr_t sbcset;
2622 bracket_elem_t *start_elem, *end_elem;
2625 uint32_t start_collseq;
2626 uint32_t end_collseq;
2628 # ifdef RE_ENABLE_I18N
2629 /* Check the space of the arrays. */
2630 if (*range_alloc == mbcset->nranges)
2632 /* There are not enough space, need realloc. */
2633 uint32_t *new_array_start;
2634 uint32_t *new_array_end;
2637 /* +1 in case of mbcset->nranges is 0. */
2638 new_nranges = 2 * mbcset->nranges + 1;
2639 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2640 are NULL if *range_alloc == 0. */
2641 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2643 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2646 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2649 mbcset->range_starts = new_array_start;
2650 mbcset->range_ends = new_array_end;
2651 *range_alloc = new_nranges;
2653 # endif /* RE_ENABLE_I18N */
2655 /* Equivalence Classes and Character Classes can't be a range
2657 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2658 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2662 start_collseq = lookup_collation_sequence_value (start_elem);
2663 end_collseq = lookup_collation_sequence_value (end_elem);
2664 /* Check start/end collation sequence values. */
2665 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2666 return REG_ECOLLATE;
2667 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2670 # ifdef RE_ENABLE_I18N
2671 /* Got valid collation sequence values, add them as a new entry. */
2672 mbcset->range_starts[mbcset->nranges] = start_collseq;
2673 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2674 # endif /* RE_ENABLE_I18N */
2676 /* Build the table for single byte characters. */
2677 for (ch = 0; ch <= SBC_MAX; ch++)
2679 uint32_t ch_collseq;
2681 if (MB_CUR_MAX == 1)
2684 ch_collseq = collseqmb[ch];
2686 ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2687 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2688 bitset_set (sbcset, ch);
2693 /* Local function for parse_bracket_exp used in _LIBC environement.
2694 Build the collating element which is represented by NAME.
2695 The result are written to MBCSET and SBCSET.
2696 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2697 pointer argument sinse we may update it. */
2699 static inline reg_errcode_t
2700 # ifdef RE_ENABLE_I18N
2701 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2702 re_charset_t *mbcset;
2703 int *coll_sym_alloc;
2704 # else /* not RE_ENABLE_I18N */
2705 build_collating_symbol (sbcset, name)
2706 # endif /* not RE_ENABLE_I18N */
2707 re_bitset_ptr_t sbcset;
2708 const unsigned char *name;
2711 size_t name_len = strlen ((const char *) name);
2714 elem = seek_collating_symbol_entry (name, name_len);
2715 if (symb_table[2 * elem] != 0)
2717 /* We found the entry. */
2718 idx = symb_table[2 * elem + 1];
2719 /* Skip the name of collating element name. */
2720 idx += 1 + extra[idx];
2722 else if (symb_table[2 * elem] == 0 && name_len == 1)
2724 /* No valid character, treat it as a normal
2726 bitset_set (sbcset, name[0]);
2730 return REG_ECOLLATE;
2732 # ifdef RE_ENABLE_I18N
2733 /* Got valid collation sequence, add it as a new entry. */
2734 /* Check the space of the arrays. */
2735 if (*coll_sym_alloc == mbcset->ncoll_syms)
2737 /* Not enough, realloc it. */
2738 /* +1 in case of mbcset->ncoll_syms is 0. */
2739 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2740 /* Use realloc since mbcset->coll_syms is NULL
2742 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2744 if (BE (mbcset->coll_syms == NULL, 0))
2747 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2748 # endif /* RE_ENABLE_I18N */
2753 if (BE (name_len != 1, 0))
2754 return REG_ECOLLATE;
2757 bitset_set (sbcset, name[0]);
2764 re_token_t br_token;
2765 re_bitset_ptr_t sbcset;
2766 #ifdef RE_ENABLE_I18N
2767 re_charset_t *mbcset;
2768 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2769 int equiv_class_alloc = 0, char_class_alloc = 0;
2770 #else /* not RE_ENABLE_I18N */
2772 #endif /* not RE_ENABLE_I18N */
2773 bin_tree_t *work_tree;
2774 int token_len, new_idx;
2776 collseqmb = (const unsigned char *)
2777 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2778 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2784 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2785 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2786 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2787 _NL_COLLATE_SYMB_TABLEMB);
2788 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2789 _NL_COLLATE_SYMB_EXTRAMB);
2792 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2793 #ifdef RE_ENABLE_I18N
2794 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2795 #endif /* RE_ENABLE_I18N */
2796 #ifdef RE_ENABLE_I18N
2797 if (BE (sbcset == NULL || mbcset == NULL, 0))
2799 if (BE (sbcset == NULL, 0))
2800 #endif /* RE_ENABLE_I18N */
2806 token_len = peek_token_bracket (token, regexp, syntax);
2807 if (BE (token->type == END_OF_RE, 0))
2810 goto parse_bracket_exp_free_return;
2812 if (token->type == OP_NON_MATCH_LIST)
2814 #ifdef RE_ENABLE_I18N
2816 mbcset->non_match = 1;
2817 #else /* not RE_ENABLE_I18N */
2819 #endif /* not RE_ENABLE_I18N */
2820 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2821 bitset_set (sbcset, '\0');
2822 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2823 token_len = peek_token_bracket (token, regexp, syntax);
2824 if (BE (token->type == END_OF_RE, 0))
2827 goto parse_bracket_exp_free_return;
2829 #ifdef RE_ENABLE_I18N
2831 for (i = 0; i < SBC_MAX; ++i)
2832 if (__btowc (i) == WEOF)
2833 bitset_set (sbcset, i);
2834 #endif /* RE_ENABLE_I18N */
2837 /* We treat the first ']' as a normal character. */
2838 if (token->type == OP_CLOSE_BRACKET)
2839 token->type = CHARACTER;
2843 bracket_elem_t start_elem, end_elem;
2844 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2845 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2847 int token_len2 = 0, is_range_exp = 0;
2850 start_elem.opr.name = start_name_buf;
2851 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2853 if (BE (ret != REG_NOERROR, 0))
2856 goto parse_bracket_exp_free_return;
2859 token_len = peek_token_bracket (token, regexp, syntax);
2860 if (BE (token->type == END_OF_RE, 0))
2863 goto parse_bracket_exp_free_return;
2865 if (token->type == OP_CHARSET_RANGE)
2867 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2868 token_len2 = peek_token_bracket (&token2, regexp, syntax);
2869 if (BE (token->type == END_OF_RE, 0))
2872 goto parse_bracket_exp_free_return;
2874 if (token2.type == OP_CLOSE_BRACKET)
2876 /* We treat the last '-' as a normal character. */
2877 re_string_skip_bytes (regexp, -token_len);
2878 token->type = CHARACTER;
2884 if (is_range_exp == 1)
2886 end_elem.opr.name = end_name_buf;
2887 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2889 if (BE (ret != REG_NOERROR, 0))
2892 goto parse_bracket_exp_free_return;
2895 token_len = peek_token_bracket (token, regexp, syntax);
2896 if (BE (token->type == END_OF_RE, 0))
2899 goto parse_bracket_exp_free_return;
2901 *err = build_range_exp (sbcset,
2902 #ifdef RE_ENABLE_I18N
2903 mbcset, &range_alloc,
2904 #endif /* RE_ENABLE_I18N */
2905 &start_elem, &end_elem);
2906 if (BE (*err != REG_NOERROR, 0))
2907 goto parse_bracket_exp_free_return;
2911 switch (start_elem.type)
2914 bitset_set (sbcset, start_elem.opr.ch);
2916 #ifdef RE_ENABLE_I18N
2918 /* Check whether the array has enough space. */
2919 if (mbchar_alloc == mbcset->nmbchars)
2921 /* Not enough, realloc it. */
2922 /* +1 in case of mbcset->nmbchars is 0. */
2923 mbchar_alloc = 2 * mbcset->nmbchars + 1;
2924 /* Use realloc since array is NULL if *alloc == 0. */
2925 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2927 if (BE (mbcset->mbchars == NULL, 0))
2928 goto parse_bracket_exp_espace;
2930 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2932 #endif /* RE_ENABLE_I18N */
2934 *err = build_equiv_class (sbcset,
2935 #ifdef RE_ENABLE_I18N
2936 mbcset, &equiv_class_alloc,
2937 #endif /* RE_ENABLE_I18N */
2938 start_elem.opr.name);
2939 if (BE (*err != REG_NOERROR, 0))
2940 goto parse_bracket_exp_free_return;
2943 *err = build_collating_symbol (sbcset,
2944 #ifdef RE_ENABLE_I18N
2945 mbcset, &coll_sym_alloc,
2946 #endif /* RE_ENABLE_I18N */
2947 start_elem.opr.name);
2948 if (BE (*err != REG_NOERROR, 0))
2949 goto parse_bracket_exp_free_return;
2952 *err = build_charclass (sbcset,
2953 #ifdef RE_ENABLE_I18N
2954 mbcset, &char_class_alloc,
2955 #endif /* RE_ENABLE_I18N */
2956 start_elem.opr.name, syntax);
2957 if (BE (*err != REG_NOERROR, 0))
2958 goto parse_bracket_exp_free_return;
2965 if (token->type == OP_CLOSE_BRACKET)
2969 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2971 /* If it is non-matching list. */
2972 #ifdef RE_ENABLE_I18N
2973 if (mbcset->non_match)
2974 #else /* not RE_ENABLE_I18N */
2976 #endif /* not RE_ENABLE_I18N */
2977 bitset_not (sbcset);
2979 /* Build a tree for simple bracket. */
2980 br_token.type = SIMPLE_BRACKET;
2981 br_token.opr.sbcset = sbcset;
2982 new_idx = re_dfa_add_node (dfa, br_token, 0);
2983 work_tree = create_tree (NULL, NULL, 0, new_idx);
2984 if (BE (new_idx == -1 || work_tree == NULL, 0))
2985 goto parse_bracket_exp_espace;
2987 #ifdef RE_ENABLE_I18N
2988 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2989 || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2990 || mbcset->non_match)))
2992 re_token_t alt_token;
2993 bin_tree_t *mbc_tree;
2994 /* Build a tree for complex bracket. */
2995 br_token.type = COMPLEX_BRACKET;
2996 br_token.opr.mbcset = mbcset;
2997 dfa->has_mb_node = 1;
2998 new_idx = re_dfa_add_node (dfa, br_token, 0);
2999 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3000 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3001 goto parse_bracket_exp_espace;
3002 /* Then join them by ALT node. */
3003 dfa->has_plural_match = 1;
3004 alt_token.type = OP_ALT;
3005 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3006 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3007 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3012 free_charset (mbcset);
3015 #else /* not RE_ENABLE_I18N */
3017 #endif /* not RE_ENABLE_I18N */
3019 parse_bracket_exp_espace:
3021 parse_bracket_exp_free_return:
3023 #ifdef RE_ENABLE_I18N
3024 free_charset (mbcset);
3025 #endif /* RE_ENABLE_I18N */
3029 /* Parse an element in the bracket expression. */
3031 static reg_errcode_t
3032 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3033 bracket_elem_t *elem;
3034 re_string_t *regexp;
3038 reg_syntax_t syntax;
3040 #ifdef RE_ENABLE_I18N
3042 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3043 if (cur_char_size > 1)
3045 elem->type = MB_CHAR;
3046 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3047 re_string_skip_bytes (regexp, cur_char_size);
3050 #endif /* RE_ENABLE_I18N */
3051 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3052 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3053 || token->type == OP_OPEN_EQUIV_CLASS)
3054 return parse_bracket_symbol (elem, regexp, token);
3055 elem->type = SB_CHAR;
3056 elem->opr.ch = token->opr.c;
3060 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3061 such as [:<character_class>:], [.<collating_element>.], and
3062 [=<equivalent_class>=]. */
3064 static reg_errcode_t
3065 parse_bracket_symbol (elem, regexp, token)
3066 bracket_elem_t *elem;
3067 re_string_t *regexp;
3070 unsigned char ch, delim = token->opr.c;
3074 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3076 if (token->type == OP_OPEN_CHAR_CLASS)
3077 ch = re_string_fetch_byte_case (regexp);
3079 ch = re_string_fetch_byte (regexp);
3080 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3082 elem->opr.name[i] = ch;
3084 re_string_skip_bytes (regexp, 1);
3085 elem->opr.name[i] = '\0';
3086 switch (token->type)
3088 case OP_OPEN_COLL_ELEM:
3089 elem->type = COLL_SYM;
3091 case OP_OPEN_EQUIV_CLASS:
3092 elem->type = EQUIV_CLASS;
3094 case OP_OPEN_CHAR_CLASS:
3095 elem->type = CHAR_CLASS;
3103 /* Helper function for parse_bracket_exp.
3104 Build the equivalence class which is represented by NAME.
3105 The result are written to MBCSET and SBCSET.
3106 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3107 is a pointer argument sinse we may update it. */
3109 static reg_errcode_t
3110 #ifdef RE_ENABLE_I18N
3111 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3112 re_charset_t *mbcset;
3113 int *equiv_class_alloc;
3114 #else /* not RE_ENABLE_I18N */
3115 build_equiv_class (sbcset, name)
3116 #endif /* not RE_ENABLE_I18N */
3117 re_bitset_ptr_t sbcset;
3118 const unsigned char *name;
3120 #if defined _LIBC && defined RE_ENABLE_I18N
3121 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3124 const int32_t *table, *indirect;
3125 const unsigned char *weights, *extra, *cp;
3126 unsigned char char_buf[2];
3130 /* This #include defines a local function! */
3131 # include <locale/weight.h>
3132 /* Calculate the index for equivalence class. */
3134 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3135 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3136 _NL_COLLATE_WEIGHTMB);
3137 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3138 _NL_COLLATE_EXTRAMB);
3139 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3140 _NL_COLLATE_INDIRECTMB);
3141 idx1 = findidx (&cp);
3142 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3143 /* This isn't a valid character. */
3144 return REG_ECOLLATE;
3146 /* Build single byte matcing table for this equivalence class. */
3147 char_buf[1] = (unsigned char) '\0';
3148 len = weights[idx1];
3149 for (ch = 0; ch < SBC_MAX; ++ch)
3153 idx2 = findidx (&cp);
3158 /* This isn't a valid character. */
3160 if (len == weights[idx2])
3163 while (cnt <= len &&
3164 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3168 bitset_set (sbcset, ch);
3171 /* Check whether the array has enough space. */
3172 if (*equiv_class_alloc == mbcset->nequiv_classes)
3174 /* Not enough, realloc it. */
3175 /* +1 in case of mbcset->nequiv_classes is 0. */
3176 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3177 /* Use realloc since the array is NULL if *alloc == 0. */
3178 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3179 *equiv_class_alloc);
3180 if (BE (mbcset->equiv_classes == NULL, 0))
3183 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3186 #endif /* _LIBC && RE_ENABLE_I18N */
3188 if (BE (strlen ((const char *) name) != 1, 0))
3189 return REG_ECOLLATE;
3190 bitset_set (sbcset, *name);
3195 /* Helper function for parse_bracket_exp.
3196 Build the character class which is represented by NAME.
3197 The result are written to MBCSET and SBCSET.
3198 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3199 is a pointer argument sinse we may update it. */
3201 static reg_errcode_t
3202 #ifdef RE_ENABLE_I18N
3203 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3204 re_charset_t *mbcset;
3205 int *char_class_alloc;
3206 #else /* not RE_ENABLE_I18N */
3207 build_charclass (sbcset, class_name, syntax)
3208 #endif /* not RE_ENABLE_I18N */
3209 re_bitset_ptr_t sbcset;
3210 const unsigned char *class_name;
3211 reg_syntax_t syntax;
3214 const char *name = (const char *) class_name;
3216 /* In case of REG_ICASE "upper" and "lower" match the both of
3217 upper and lower cases. */
3218 if ((syntax & RE_ICASE)
3219 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3222 #ifdef RE_ENABLE_I18N
3223 /* Check the space of the arrays. */
3224 if (*char_class_alloc == mbcset->nchar_classes)
3226 /* Not enough, realloc it. */
3227 /* +1 in case of mbcset->nchar_classes is 0. */
3228 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3229 /* Use realloc since array is NULL if *alloc == 0. */
3230 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3232 if (BE (mbcset->char_classes == NULL, 0))
3235 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3236 #endif /* RE_ENABLE_I18N */
3238 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3239 for (i = 0; i < SBC_MAX; ++i) \
3241 if (ctype_func (i)) \
3242 bitset_set (sbcset, i); \
3245 if (strcmp (name, "alnum") == 0)
3246 BUILD_CHARCLASS_LOOP (isalnum)
3247 else if (strcmp (name, "cntrl") == 0)
3248 BUILD_CHARCLASS_LOOP (iscntrl)
3249 else if (strcmp (name, "lower") == 0)
3250 BUILD_CHARCLASS_LOOP (islower)
3251 else if (strcmp (name, "space") == 0)
3252 BUILD_CHARCLASS_LOOP (isspace)
3253 else if (strcmp (name, "alpha") == 0)
3254 BUILD_CHARCLASS_LOOP (isalpha)
3255 else if (strcmp (name, "digit") == 0)
3256 BUILD_CHARCLASS_LOOP (isdigit)
3257 else if (strcmp (name, "print") == 0)
3258 BUILD_CHARCLASS_LOOP (isprint)
3259 else if (strcmp (name, "upper") == 0)
3260 BUILD_CHARCLASS_LOOP (isupper)
3261 else if (strcmp (name, "blank") == 0)
3262 BUILD_CHARCLASS_LOOP (isblank)
3263 else if (strcmp (name, "graph") == 0)
3264 BUILD_CHARCLASS_LOOP (isgraph)
3265 else if (strcmp (name, "punct") == 0)
3266 BUILD_CHARCLASS_LOOP (ispunct)
3267 else if (strcmp (name, "xdigit") == 0)
3268 BUILD_CHARCLASS_LOOP (isxdigit)
3276 build_word_op (dfa, not, err)
3281 re_bitset_ptr_t sbcset;
3282 #ifdef RE_ENABLE_I18N
3283 re_charset_t *mbcset;
3285 #else /* not RE_ENABLE_I18N */
3287 #endif /* not RE_ENABLE_I18N */
3289 re_token_t br_token;
3293 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3294 #ifdef RE_ENABLE_I18N
3295 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3296 #endif /* RE_ENABLE_I18N */
3298 #ifdef RE_ENABLE_I18N
3299 if (BE (sbcset == NULL || mbcset == NULL, 0))
3300 #else /* not RE_ENABLE_I18N */
3301 if (BE (sbcset == NULL, 0))
3302 #endif /* not RE_ENABLE_I18N */
3310 #ifdef RE_ENABLE_I18N
3313 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3314 bitset_set(cset->sbcset, '\0');
3316 mbcset->non_match = 1;
3318 for (i = 0; i < SBC_MAX; ++i)
3319 if (__btowc (i) == WEOF)
3320 bitset_set (sbcset, i);
3321 #else /* not RE_ENABLE_I18N */
3323 #endif /* not RE_ENABLE_I18N */
3326 /* We don't care the syntax in this case. */
3327 ret = build_charclass (sbcset,
3328 #ifdef RE_ENABLE_I18N
3330 #endif /* RE_ENABLE_I18N */
3331 (const unsigned char *) "alpha", 0);
3333 if (BE (ret != REG_NOERROR, 0))
3336 #ifdef RE_ENABLE_I18N
3337 free_charset (mbcset);
3338 #endif /* RE_ENABLE_I18N */
3342 /* \w match '_' also. */
3343 bitset_set (sbcset, '_');
3345 /* If it is non-matching list. */
3346 #ifdef RE_ENABLE_I18N
3347 if (mbcset->non_match)
3348 #else /* not RE_ENABLE_I18N */
3350 #endif /* not RE_ENABLE_I18N */
3351 bitset_not (sbcset);
3353 /* Build a tree for simple bracket. */
3354 br_token.type = SIMPLE_BRACKET;
3355 br_token.opr.sbcset = sbcset;
3356 new_idx = re_dfa_add_node (dfa, br_token, 0);
3357 tree = create_tree (NULL, NULL, 0, new_idx);
3358 if (BE (new_idx == -1 || tree == NULL, 0))
3359 goto build_word_op_espace;
3361 #ifdef RE_ENABLE_I18N
3364 re_token_t alt_token;
3365 bin_tree_t *mbc_tree;
3366 /* Build a tree for complex bracket. */
3367 br_token.type = COMPLEX_BRACKET;
3368 br_token.opr.mbcset = mbcset;
3369 dfa->has_mb_node = 1;
3370 new_idx = re_dfa_add_node (dfa, br_token, 0);
3371 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3372 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3373 goto build_word_op_espace;
3374 /* Then join them by ALT node. */
3375 alt_token.type = OP_ALT;
3376 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3377 tree = create_tree (tree, mbc_tree, 0, new_idx);
3378 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3383 free_charset (mbcset);
3386 #else /* not RE_ENABLE_I18N */
3388 #endif /* not RE_ENABLE_I18N */
3390 build_word_op_espace:
3392 #ifdef RE_ENABLE_I18N
3393 free_charset (mbcset);
3394 #endif /* RE_ENABLE_I18N */
3399 /* This is intended for the expressions like "a{1,3}".
3400 Fetch a number from `input', and return the number.
3401 Return -1, if the number field is empty like "{,1}".
3402 Return -2, If an error is occured. */
3405 fetch_number (input, token, syntax)
3408 reg_syntax_t syntax;
3414 *token = fetch_token (input, syntax);
3416 if (BE (token->type == END_OF_RE, 0))
3418 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3420 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3421 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3422 num = (num > RE_DUP_MAX) ? -2 : num;
3427 #ifdef RE_ENABLE_I18N
3429 free_charset (re_charset_t *cset)
3431 re_free (cset->mbchars);
3433 re_free (cset->coll_syms);
3434 re_free (cset->equiv_classes);
3435 re_free (cset->range_starts);
3436 re_free (cset->range_ends);
3438 re_free (cset->char_classes);
3441 #endif /* RE_ENABLE_I18N */
3443 /* Functions for binary tree operation. */
3445 /* Create a node of tree.
3446 Note: This function automatically free left and right if malloc fails. */
3449 create_tree (left, right, type, index)
3452 re_token_type_t type;
3456 tree = re_malloc (bin_tree_t, 1);
3457 if (BE (tree == NULL, 0))
3459 free_bin_tree (left);
3460 free_bin_tree (right);
3463 tree->parent = NULL;
3465 tree->right = right;
3467 tree->node_idx = index;
3470 re_node_set_init_empty (&tree->eclosure);
3473 left->parent = tree;
3475 right->parent = tree;
3479 /* Free the sub tree pointed by TREE. */
3482 free_bin_tree (tree)
3487 /*re_node_set_free (&tree->eclosure);*/
3488 free_bin_tree (tree->left);
3489 free_bin_tree (tree->right);
3493 /* Duplicate the node SRC, and return new node. */
3496 duplicate_tree (src, dfa)
3497 const bin_tree_t *src;
3500 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3502 /* Since node indies must be according to Post-order of the tree,
3503 we must duplicate the left at first. */
3504 if (src->left != NULL)
3506 left = duplicate_tree (src->left, dfa);
3511 /* Secondaly, duplicate the right. */
3512 if (src->right != NULL)
3514 right = duplicate_tree (src->right, dfa);
3517 free_bin_tree (left);
3522 /* At last, duplicate itself. */
3523 if (src->type == NON_TYPE)
3525 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3526 dfa->nodes[new_node_idx].duplicated = 1;
3527 if (BE (new_node_idx == -1, 0))
3529 free_bin_tree (left);
3530 free_bin_tree (right);
3535 new_node_idx = src->type;
3537 new_tree = create_tree (left, right, src->type, new_node_idx);
3538 if (BE (new_tree == NULL, 0))
3540 free_bin_tree (left);
3541 free_bin_tree (right);