1 /* dfa.c - deterministic extended regexp routines for GNU
   2    Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2025 Free Software
   3    Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  17
  18 /* Written June, 1988 by Mike Haertel
  19    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  20
  21 #include <config.h>
  22
  23 #include "dfa.h"
  24
  25 #include "flexmember.h"
  26 #include "idx.h"
  27 #include "verify.h"
  28
  29 #include <assert.h>
  30 #include <ctype.h>
  31 #include <stdint.h>
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <limits.h>
  35 #include <string.h>
  36 #include <wchar.h>
  37
  38 #include "xalloc.h"
  39 #include "localeinfo.h"
  40
  41 #include "gettext.h"
  42 #define _(msgid) dgettext ("gnulib", msgid)
  43
  44 #if GAWK
  45 /* Use ISO C 99 API.  */
  46 # include <wctype.h>
  47 # define char32_t wchar_t
  48 # define mbrtoc32 mbrtowc
  49 # define c32rtomb wcrtomb
  50 # define c32tob wctob
  51 # define c32isprint iswprint
  52 # define c32isspace iswspace
  53 # define mbszero(p) memset ((p), 0, sizeof (mbstate_t))
  54 #else
  55 /* Use ISO C 11 + gnulib API.  */
  56 # include <uchar.h>
  57 #endif
  58
  59 /* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
  60    understandably cannot deduce that the input comes from a
  61    well-formed regular expression.  There's little point to the
  62    runtime overhead of 'assert' instead of 'assume_nonnull' when the
  63    MMU will check anyway.  */
  64 #define assume_nonnull(x) assume ((x) != NULL)
  65
  66 static bool
  67 str_eq (char const *a, char const *b)
  68 {
  69   return strcmp (a, b) == 0;
  70 }
  71
  72 static bool
  73 c_isdigit (char c)
  74 {
  75   return '0' <= c && c <= '9';
  76 }
  77
  78 #ifndef FALLTHROUGH
  79 # if 201710L < __STDC_VERSION__
  80 #  define FALLTHROUGH [[__fallthrough__]]
  81 # elif ((__GNUC__ >= 7 && !defined __clang__) \
  82         || (defined __apple_build_version__ \
  83             ? __apple_build_version__ >= 12000000 \
  84             : __clang_major__ >= 10))
  85 #  define FALLTHROUGH __attribute__ ((__fallthrough__))
  86 # else
  87 #  define FALLTHROUGH ((void) 0)
  88 # endif
  89 #endif
  90
  91 #ifndef MIN
  92 # define MIN(a,b) ((a) < (b) ? (a) : (b))
  93 #endif
  94
  95 /* HPUX defines these as macros in sys/param.h.  */
  96 #ifdef setbit
  97 # undef setbit
  98 #endif
  99 #ifdef clrbit
 100 # undef clrbit
 101 #endif
 102
 103 /* For code that does not use Gnulib’s isblank module.  */
 104 #if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
 105 # define isblank dfa_isblank
 106 static int
 107 isblank (int c)
 108 {
 109   return c == ' ' || c == '\t';
 110 }
 111 #endif
 112
 113 /* First integer value that is greater than any character code.  */
 114 enum { NOTCHAR = 1 << CHAR_BIT };
 115
 116 #ifdef UINT_LEAST64_MAX
 117
 118 /* Number of bits used in a charclass word.  */
 119 enum { CHARCLASS_WORD_BITS = 64 };
 120
 121 /* This represents part of a character class.  It must be unsigned and
 122    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
 123 typedef uint_least64_t charclass_word;
 124
 125 /* Part of a charclass initializer that represents 64 bits' worth of a
 126    charclass, where LO and HI are the low and high-order 32 bits of
 127    the 64-bit quantity.  */
 128 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
 129
 130 #else
 131 /* Fallbacks for pre-C99 hosts that lack 64-bit integers.  */
 132 enum { CHARCLASS_WORD_BITS = 32 };
 133 typedef unsigned long charclass_word;
 134 # define CHARCLASS_PAIR(lo, hi) lo, hi
 135 #endif
 136
 137 /* An initializer for a charclass whose 32-bit words are A through H.  */
 138 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h)          \
 139    {{                                                   \
 140       CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d),     \
 141       CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h)      \
 142    }}
 143
 144 /* The maximum useful value of a charclass_word; all used bits are 1.  */
 145 static charclass_word const CHARCLASS_WORD_MASK
 146   = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
 147
 148 /* Number of words required to hold a bit for every character.  */
 149 enum
 150 {
 151   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
 152 };
 153
 154 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
 155 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
 156
 157 /* Convert a possibly-signed character to an unsigned character.  This is
 158    a bit safer than casting to unsigned char, since it catches some type
 159    errors that the cast doesn't.  */
 160 static unsigned char
 161 to_uchar (char ch)
 162 {
 163   return ch;
 164 }
 165
 166 /* Contexts tell us whether a character is a newline or a word constituent.
 167    Word-constituent characters are those that satisfy iswalnum, plus '_'.
 168    Each character has a single CTX_* value; bitmasks of CTX_* values denote
 169    a particular character class.
 170
 171    A state also stores a context value, which is a bitmask of CTX_* values.
 172    A state's context represents a set of characters that the state's
 173    predecessors must match.  For example, a state whose context does not
 174    include CTX_LETTER will never have transitions where the previous
 175    character is a word constituent.  A state whose context is CTX_ANY
 176    might have transitions from any character.  */
 177
 178 enum
 179   {
 180     CTX_NONE = 1,
 181     CTX_LETTER = 2,
 182     CTX_NEWLINE = 4,
 183     CTX_ANY = 7
 184   };
 185
 186 /* Sometimes characters can only be matched depending on the surrounding
 187    context.  Such context decisions depend on what the previous character
 188    was, and the value of the current (lookahead) character.  Context
 189    dependent constraints are encoded as 9-bit integers.  Each bit that
 190    is set indicates that the constraint succeeds in the corresponding
 191    context.
 192
 193    bit 6-8  - valid contexts when next character is CTX_NEWLINE
 194    bit 3-5  - valid contexts when next character is CTX_LETTER
 195    bit 0-2  - valid contexts when next character is CTX_NONE
 196
 197    succeeds_in_context determines whether a given constraint
 198    succeeds in a particular context.  Prev is a bitmask of possible
 199    context values for the previous character, curr is the (single-bit)
 200    context value for the lookahead character.  */
 201 static int
 202 newline_constraint (int constraint)
 203 {
 204   return (constraint >> 6) & 7;
 205 }
 206 static int
 207 letter_constraint (int constraint)
 208 {
 209   return (constraint >> 3) & 7;
 210 }
 211 static int
 212 other_constraint (int constraint)
 213 {
 214   return constraint & 7;
 215 }
 216
 217 static bool
 218 succeeds_in_context (int constraint, int prev, int curr)
 219 {
 220   return !! (((curr & CTX_NONE      ? other_constraint (constraint) : 0) \
 221               | (curr & CTX_LETTER  ? letter_constraint (constraint) : 0) \
 222               | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
 223              & prev);
 224 }
 225
 226 /* The following describe what a constraint depends on.  */
 227 static bool
 228 prev_newline_dependent (int constraint)
 229 {
 230   return ((constraint ^ constraint >> 2) & 0111) != 0;
 231 }
 232 static bool
 233 prev_letter_dependent (int constraint)
 234 {
 235   return ((constraint ^ constraint >> 1) & 0111) != 0;
 236 }
 237
 238 /* Tokens that match the empty string subject to some constraint actually
 239    work by applying that constraint to determine what may follow them,
 240    taking into account what has gone before.  The following values are
 241    the constraints corresponding to the special tokens previously defined.  */
 242 enum
 243   {
 244     NO_CONSTRAINT = 0777,
 245     BEGLINE_CONSTRAINT = 0444,
 246     ENDLINE_CONSTRAINT = 0700,
 247     BEGWORD_CONSTRAINT = 0050,
 248     ENDWORD_CONSTRAINT = 0202,
 249     LIMWORD_CONSTRAINT = 0252,
 250     NOTLIMWORD_CONSTRAINT = 0525
 251   };
 252
 253 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
 254    are operators and others are terminal symbols.  Most (but not all) of these
 255    codes are returned by the lexical analyzer.  */
 256
 257 typedef ptrdiff_t token;
 258 static token const TOKEN_MAX = PTRDIFF_MAX;
 259
 260 /* States are indexed by state_num values.  These are normally
 261    nonnegative but -1 is used as a special value.  */
 262 typedef ptrdiff_t state_num;
 263
 264 /* Predefined token values.  */
 265 enum
 266 {
 267   END = -1,                     /* END is a terminal symbol that matches the
 268                                    end of input; any value of END or less in
 269                                    the parse tree is such a symbol.  Accepting
 270                                    states of the DFA are those that would have
 271                                    a transition on END.  This is -1, not some
 272                                    more-negative value, to tweak the speed of
 273                                    comparisons to END.  */
 274
 275   /* Ordinary character values are terminal symbols that match themselves.  */
 276
 277   /* CSET must come last in the following list of special tokens.  Otherwise,
 278      the list order matters only for performance.  Related special tokens
 279      should have nearby values so that code like (t == ANYCHAR || t == MBCSET
 280      || CSET <= t) can be done with a single machine-level comparison.  */
 281
 282   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
 283                                    the empty string.  */
 284
 285   QMARK,                        /* QMARK is an operator of one argument that
 286                                    matches zero or one occurrences of its
 287                                    argument.  */
 288
 289   STAR,                         /* STAR is an operator of one argument that
 290                                    matches the Kleene closure (zero or more
 291                                    occurrences) of its argument.  */
 292
 293   PLUS,                         /* PLUS is an operator of one argument that
 294                                    matches the positive closure (one or more
 295                                    occurrences) of its argument.  */
 296
 297   REPMN,                        /* REPMN is a lexical token corresponding
 298                                    to the {m,n} construct.  REPMN never
 299                                    appears in the compiled token vector.  */
 300
 301   CAT,                          /* CAT is an operator of two arguments that
 302                                    matches the concatenation of its
 303                                    arguments.  CAT is never returned by the
 304                                    lexical analyzer.  */
 305
 306   OR,                           /* OR is an operator of two arguments that
 307                                    matches either of its arguments.  */
 308
 309   LPAREN,                       /* LPAREN never appears in the parse tree,
 310                                    it is only a lexeme.  */
 311
 312   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 313
 314   WCHAR,                        /* Only returned by lex.  wctok contains the
 315                                    32-bit wide character representation.  */
 316
 317   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
 318                                    a valid multibyte (or single byte) character.
 319                                    It is used only if MB_CUR_MAX > 1.  */
 320
 321   BEG,                          /* BEG is an initial symbol that matches the
 322                                    beginning of input.  */
 323
 324   BEGLINE,                      /* BEGLINE is a terminal symbol that matches
 325                                    the empty string at the beginning of a
 326                                    line.  */
 327
 328   ENDLINE,                      /* ENDLINE is a terminal symbol that matches
 329                                    the empty string at the end of a line.  */
 330
 331   BEGWORD,                      /* BEGWORD is a terminal symbol that matches
 332                                    the empty string at the beginning of a
 333                                    word.  */
 334
 335   ENDWORD,                      /* ENDWORD is a terminal symbol that matches
 336                                    the empty string at the end of a word.  */
 337
 338   LIMWORD,                      /* LIMWORD is a terminal symbol that matches
 339                                    the empty string at the beginning or the
 340                                    end of a word.  */
 341
 342   NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
 343                                    matches the empty string not at
 344                                    the beginning or end of a word.  */
 345
 346   BACKREF,                      /* BACKREF is generated by \<digit>
 347                                    or by any other construct that
 348                                    is not completely handled.  If the scanner
 349                                    detects a transition on backref, it returns
 350                                    a kind of "semi-success" indicating that
 351                                    the match will have to be verified with
 352                                    a backtracking matcher.  */
 353
 354   MBCSET,                       /* MBCSET is similar to CSET, but for
 355                                    multibyte characters.  */
 356
 357   CSET                          /* CSET and (and any value greater) is a
 358                                    terminal symbol that matches any of a
 359                                    class of characters.  */
 360 };
 361
 362
 363 /* States of the recognizer correspond to sets of positions in the parse
 364    tree, together with the constraints under which they may be matched.
 365    So a position is encoded as an index into the parse tree together with
 366    a constraint.  */
 367 typedef struct
 368 {
 369   idx_t index;                  /* Index into the parse array.  */
 370   unsigned int constraint;      /* Constraint for matching this position.  */
 371 } position;
 372
 373 /* Sets of positions are stored as arrays.  */
 374 typedef struct
 375 {
 376   position *elems;              /* Elements of this position set.  */
 377   idx_t nelem;                  /* Number of elements in this set.  */
 378   idx_t alloc;                  /* Number of elements allocated in ELEMS.  */
 379 } position_set;
 380
 381 /* A state of the dfa consists of a set of positions, some flags,
 382    and the token value of the lowest-numbered position of the state that
 383    contains an END token.  */
 384 typedef struct
 385 {
 386   size_t hash;                  /* Hash of the positions of this state.  */
 387   position_set elems;           /* Positions this state could match.  */
 388   unsigned char context;        /* Context from previous state.  */
 389   unsigned short constraint;    /* Constraint for this state to accept.  */
 390   position_set mbps;            /* Positions which can match multibyte
 391                                    characters or the follows, e.g., period.
 392                                    Used only if MB_CUR_MAX > 1.  */
 393   state_num mb_trindex;         /* Index of this state in MB_TRANS, or
 394                                    negative if the state does not have
 395                                    ANYCHAR.  */
 396 } dfa_state;
 397
 398 /* Maximum for any transition table count.  This should be at least 3,
 399    for the initial state setup.  */
 400 enum { MAX_TRCOUNT = 1024 };
 401
 402 /* A bracket operator.
 403    e.g., [a-c], [[:alpha:]], etc.  */
 404 struct mb_char_classes
 405 {
 406   ptrdiff_t cset;
 407   bool invert;
 408   char32_t *chars;              /* Normal characters.  */
 409   idx_t nchars;
 410   idx_t nchars_alloc;
 411 };
 412
 413 struct regex_syntax
 414 {
 415   /* Syntax bits controlling the behavior of the lexical analyzer.  */
 416   reg_syntax_t syntax_bits;
 417   int dfaopts;
 418   bool syntax_bits_set;
 419
 420   /* Flag for case-folding letters into sets.  */
 421   bool case_fold;
 422
 423   /* End-of-line byte in data.  */
 424   unsigned char eolbyte;
 425
 426   /* Cache of char-context values.  */
 427   char sbit[NOTCHAR];
 428
 429   /* If never_trail[B], the byte B cannot be a non-initial byte in a
 430      multibyte character.  */
 431   bool never_trail[NOTCHAR];
 432
 433   /* Set of characters considered letters.  */
 434   charclass letters;
 435
 436   /* Set of characters that are newline.  */
 437   charclass newline;
 438 };
 439
 440 /* Lexical analyzer.  All the dross that deals with the obnoxious
 441    GNU Regex syntax bits is located here.  The poor, suffering
 442    reader is referred to the GNU Regex documentation for the
 443    meaning of the @#%!@#%^!@ syntax bits.  */
 444 struct lexer_state
 445 {
 446   char const *ptr;      /* Pointer to next input character.  */
 447   idx_t left;           /* Number of characters remaining.  */
 448   token lasttok;        /* Previous token returned; initially END.  */
 449   idx_t parens;         /* Count of outstanding left parens.  */
 450   int minrep, maxrep;   /* Repeat counts for {m,n}.  */
 451
 452   /* 32-bit wide character representation of the current multibyte character,
 453      or WEOF if there was an encoding error.  Used only if
 454      MB_CUR_MAX > 1.  */
 455   wint_t wctok;
 456
 457   /* The most recently analyzed multibyte bracket expression.  */
 458   struct mb_char_classes brack;
 459
 460   /* We're separated from beginning or (, | only by zero-width characters.  */
 461   bool laststart;
 462 };
 463
 464 /* Recursive descent parser for regular expressions.  */
 465
 466 struct parser_state
 467 {
 468   token tok;               /* Lookahead token.  */
 469   idx_t depth;             /* Current depth of a hypothetical stack
 470                               holding deferred productions.  This is
 471                               used to determine the depth that will be
 472                               required of the real stack later on in
 473                               dfaanalyze.  */
 474 };
 475
 476 /* A compiled regular expression.  */
 477 struct dfa
 478 {
 479   /* Fields filled by the scanner.  */
 480   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
 481   idx_t cindex;                 /* Index for adding new charclasses.  */
 482   idx_t calloc;                 /* Number of charclasses allocated.  */
 483   ptrdiff_t canychar;           /* Index of anychar class, or -1.  */
 484
 485   /* Scanner state */
 486   struct lexer_state lex;
 487
 488   /* Parser state */
 489   struct parser_state parse;
 490
 491   /* Fields filled by the parser.  */
 492   token *tokens;                /* Postfix parse array.  */
 493   idx_t tindex;                 /* Index for adding new tokens.  */
 494   idx_t talloc;                 /* Number of tokens currently allocated.  */
 495   idx_t depth;                  /* Depth required of an evaluation stack
 496                                    used for depth-first traversal of the
 497                                    parse tree.  */
 498   idx_t nleaves;                /* Number of non-EMPTY leaves
 499                                    in the parse tree.  */
 500   idx_t nregexps;               /* Count of parallel regexps being built
 501                                    with dfaparse.  */
 502   bool fast;                    /* The DFA is fast.  */
 503   bool epsilon;                 /* Does a token match only the empty string?  */
 504   token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales.  */
 505   mbstate_t mbs;                /* Multibyte conversion state.  */
 506
 507   /* The following are valid only if MB_CUR_MAX > 1.  */
 508
 509   /* The value of multibyte_prop[i] is defined by following rule.
 510      if tokens[i] < NOTCHAR
 511      bit 0 : tokens[i] is the first byte of a character, including
 512      single-byte characters.
 513      bit 1 : tokens[i] is the last byte of a character, including
 514      single-byte characters.
 515
 516      e.g.
 517      tokens
 518      = 'single_byte_a', 'multi_byte_A', single_byte_b'
 519      = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
 520      multibyte_prop
 521      = 3     , 1               ,  0              ,  2              , 3
 522    */
 523   char *multibyte_prop;
 524
 525   /* Fields filled by the superset.  */
 526   struct dfa *superset;             /* Hint of the dfa.  */
 527
 528   /* Fields filled by the state builder.  */
 529   dfa_state *states;            /* States of the dfa.  */
 530   state_num sindex;             /* Index for adding new states.  */
 531   idx_t salloc;                 /* Number of states currently allocated.  */
 532
 533   /* Fields filled by the parse tree->NFA conversion.  */
 534   position_set *follows;        /* Array of follow sets, indexed by position
 535                                    index.  The follow of a position is the set
 536                                    of positions containing characters that
 537                                    could conceivably follow a character
 538                                    matching the given position in a string
 539                                    matching the regexp.  Allocated to the
 540                                    maximum possible position index.  */
 541   bool searchflag;              /* We are supposed to build a searching
 542                                    as opposed to an exact matcher.  A searching
 543                                    matcher finds the first and shortest string
 544                                    matching a regexp anywhere in the buffer,
 545                                    whereas an exact matcher finds the longest
 546                                    string matching, but anchored to the
 547                                    beginning of the buffer.  */
 548
 549   /* Fields filled by dfaanalyze.  */
 550   int *constraints;             /* Array of union of accepting constraints
 551                                    in the follow of a position.  */
 552   int *separates;               /* Array of contexts on follow of a
 553                                    position.  */
 554
 555   /* Fields filled by dfaexec.  */
 556   state_num tralloc;            /* Number of transition tables that have
 557                                    slots so far, not counting trans[-1] and
 558                                    trans[-2].  */
 559   int trcount;                  /* Number of transition tables that have
 560                                    been built, other than for initial
 561                                    states.  */
 562   int min_trcount;              /* Number of initial states.  Equivalently,
 563                                    the minimum state number for which trcount
 564                                    counts transitions.  */
 565   state_num **trans;            /* Transition tables for states that can
 566                                    never accept.  If the transitions for a
 567                                    state have not yet been computed, or the
 568                                    state could possibly accept, its entry in
 569                                    this table is NULL.  This points to two
 570                                    past the start of the allocated array,
 571                                    and trans[-1] and trans[-2] are always
 572                                    NULL.  */
 573   state_num **fails;            /* Transition tables after failing to accept
 574                                    on a state that potentially could do so.
 575                                    If trans[i] is non-null, fails[i] must
 576                                    be null.  */
 577   char *success;                /* Table of acceptance conditions used in
 578                                    dfaexec and computed in build_state.  */
 579   state_num *newlines;          /* Transitions on newlines.  The entry for a
 580                                    newline in any transition table is always
 581                                    -1 so we can count lines without wasting
 582                                    too many cycles.  The transition for a
 583                                    newline is stored separately and handled
 584                                    as a special case.  Newline is also used
 585                                    as a sentinel at the end of the buffer.  */
 586   state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
 587                                    context in multibyte locales, in which we
 588                                    do not distinguish between their contexts,
 589                                    as not supported word.  */
 590   position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
 591   state_num **mb_trans;         /* Transition tables for states with
 592                                    ANYCHAR.  */
 593   state_num mb_trcount;         /* Number of transition tables for states with
 594                                    ANYCHAR that have actually been built.  */
 595
 596   /* Syntax configuration.  This is near the end so that dfacopysyntax
 597      can memset up to here.  */
 598   struct regex_syntax syntax;
 599
 600   /* Information derived from the locale.  This is at the end so that
 601      a quick memset need not clear it specially.  */
 602
 603   /* dfaexec implementation.  */
 604   char *(*dfaexec) (struct dfa *, char const *, char *,
 605                     bool, idx_t *, bool *);
 606
 607   /* Other cached information derived from the locale.  */
 608   struct localeinfo localeinfo;
 609 };
 610
 611 /* User access to dfa internals.  */
 612
 613 /* S could possibly be an accepting state of R.  */
 614 static bool
 615 accepting (state_num s, struct dfa const *r)
 616 {
 617   return r->states[s].constraint != 0;
 618 }
 619
 620 /* STATE accepts in the specified context.  */
 621 static bool
 622 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
 623 {
 624   return succeeds_in_context (dfa->states[state].constraint, prev, curr);
 625 }
 626
 627 static void regexp (struct dfa *dfa);
 628
 629 /* Store into *PWC the result of converting the leading bytes of the
 630    multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
 631    and updating the conversion state in *D.  On conversion error,
 632    convert just a single byte, to WEOF.  Return the number of bytes
 633    converted.
 634
 635    This differs from mbrtoc32 (PWC, S, N, &D->mbs) as follows:
 636
 637    * PWC points to wint_t, not to char32_t.
 638    * The last arg is a dfa *D instead of merely a multibyte conversion
 639      state D->mbs.
 640    * N is idx_t not size_t, and must be at least 1.
 641    * S[N - 1] must be a sentinel byte.
 642    * Shift encodings are not supported.
 643    * The return value is always in the range 1..N.
 644    * D->mbs is always valid afterwards.
 645    * *PWC is always set to something.  */
 646 static int
 647 mbs_to_wchar (wint_t *pwc, char const *s, idx_t n, struct dfa *d)
 648 {
 649   unsigned char uc = s[0];
 650   wint_t wc = d->localeinfo.sbctowc[uc];
 651
 652   if (wc == WEOF)
 653     {
 654       char32_t wch;
 655       size_t nbytes = mbrtoc32 (&wch, s, n, &d->mbs);
 656       if (0 < nbytes && nbytes < (size_t) -2)
 657         {
 658           *pwc = wch;
 659           /* nbytes cannot be == (size) -3 here, since we rely on the
 660              'mbrtoc32-regular' module.  */
 661           return nbytes;
 662         }
 663       mbszero (&d->mbs);
 664     }
 665
 666   *pwc = wc;
 667   return 1;
 668 }
 669
 670 #ifdef DEBUG
 671
 672 static void
 673 prtok (token t)
 674 {
 675   if (t <= END)
 676     fprintf (stderr, "END");
 677   else if (0 <= t && t < NOTCHAR)
 678     {
 679       unsigned int ch = t;
 680       fprintf (stderr, "0x%02x", ch);
 681     }
 682   else
 683     {
 684       char const *s;
 685       switch (t)
 686         {
 687         case BEG:
 688           s = "BEG";
 689           break;
 690         case EMPTY:
 691           s = "EMPTY";
 692           break;
 693         case BACKREF:
 694           s = "BACKREF";
 695           break;
 696         case BEGLINE:
 697           s = "BEGLINE";
 698           break;
 699         case ENDLINE:
 700           s = "ENDLINE";
 701           break;
 702         case BEGWORD:
 703           s = "BEGWORD";
 704           break;
 705         case ENDWORD:
 706           s = "ENDWORD";
 707           break;
 708         case LIMWORD:
 709           s = "LIMWORD";
 710           break;
 711         case NOTLIMWORD:
 712           s = "NOTLIMWORD";
 713           break;
 714         case QMARK:
 715           s = "QMARK";
 716           break;
 717         case STAR:
 718           s = "STAR";
 719           break;
 720         case PLUS:
 721           s = "PLUS";
 722           break;
 723         case CAT:
 724           s = "CAT";
 725           break;
 726         case OR:
 727           s = "OR";
 728           break;
 729         case LPAREN:
 730           s = "LPAREN";
 731           break;
 732         case RPAREN:
 733           s = "RPAREN";
 734           break;
 735         case ANYCHAR:
 736           s = "ANYCHAR";
 737           break;
 738         case MBCSET:
 739           s = "MBCSET";
 740           break;
 741         default:
 742           s = "CSET";
 743           break;
 744         }
 745       fprintf (stderr, "%s", s);
 746     }
 747 }
 748 #endif /* DEBUG */
 749
 750 /* Stuff pertaining to charclasses.  */
 751
 752 static bool
 753 tstbit (unsigned int b, charclass const *c)
 754 {
 755   return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
 756 }
 757
 758 static void
 759 setbit (unsigned int b, charclass *c)
 760 {
 761   charclass_word one = 1;
 762   c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
 763 }
 764
 765 static void
 766 clrbit (unsigned int b, charclass *c)
 767 {
 768   charclass_word one = 1;
 769   c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
 770 }
 771
 772 static void
 773 zeroset (charclass *s)
 774 {
 775   memset (s, 0, sizeof *s);
 776 }
 777
 778 static void
 779 fillset (charclass *s)
 780 {
 781   for (int i = 0; i < CHARCLASS_WORDS; i++)
 782     s->w[i] = CHARCLASS_WORD_MASK;
 783 }
 784
 785 static void
 786 notset (charclass *s)
 787 {
 788   for (int i = 0; i < CHARCLASS_WORDS; ++i)
 789     s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
 790 }
 791
 792 static bool
 793 equal (charclass const *s1, charclass const *s2)
 794 {
 795   charclass_word w = 0;
 796   for (int i = 0; i < CHARCLASS_WORDS; i++)
 797     w |= s1->w[i] ^ s2->w[i];
 798   return w == 0;
 799 }
 800
 801 static bool
 802 emptyset (charclass const *s)
 803 {
 804   charclass_word w = 0;
 805   for (int i = 0; i < CHARCLASS_WORDS; i++)
 806     w |= s->w[i];
 807   return w == 0;
 808 }
 809
 810 /* Ensure that the array addressed by PA holds at least I + 1 items.
 811    Either return PA, or reallocate the array and return its new address.
 812    Although PA may be null, the returned value is never null.
 813
 814    The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
 815    is updated on reallocation.  If PA is null, *NITEMS must be zero.
 816    Do not allocate more than NITEMS_MAX items total; -1 means no limit.
 817    ITEM_SIZE is the size of one item; it must be positive.
 818    Avoid O(N**2) behavior on arrays growing linearly.  */
 819 static void *
 820 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
 821                ptrdiff_t nitems_max, idx_t item_size)
 822 {
 823   if (i < *nitems)
 824     return pa;
 825   return xpalloc (pa, nitems, 1, nitems_max, item_size);
 826 }
 827
 828 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 829 static idx_t
 830 charclass_index (struct dfa *d, charclass const *s)
 831 {
 832   idx_t i;
 833
 834   for (i = 0; i < d->cindex; ++i)
 835     if (equal (s, &d->charclasses[i]))
 836       return i;
 837   d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
 838                                   TOKEN_MAX - CSET, sizeof *d->charclasses);
 839   ++d->cindex;
 840   d->charclasses[i] = *s;
 841   return i;
 842 }
 843
 844 static bool
 845 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 846 {
 847   return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 848 }
 849
 850 static int
 851 char_context (struct dfa const *dfa, unsigned char c)
 852 {
 853   if (c == dfa->syntax.eolbyte && !(dfa->syntax.dfaopts & DFA_ANCHOR))
 854     return CTX_NEWLINE;
 855   if (unibyte_word_constituent (dfa, c))
 856     return CTX_LETTER;
 857   return CTX_NONE;
 858 }
 859
 860 /* Set a bit in the charclass for the given char32_t.  Do nothing if WC
 861    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
 862    this may happen when folding case in weird Turkish locales where
 863    dotless i/dotted I are not included in the chosen character set.
 864    Return whether a bit was set in the charclass.  */
 865 static bool
 866 setbit_wc (char32_t wc, charclass *c)
 867 {
 868   int b = c32tob (wc);
 869   if (b < 0)
 870     return false;
 871
 872   setbit (b, c);
 873   return true;
 874 }
 875
 876 /* Set a bit for B and its case variants in the charclass C.
 877    MB_CUR_MAX must be 1.  */
 878 static void
 879 setbit_case_fold_c (int b, charclass *c)
 880 {
 881   int ub = toupper (b);
 882   for (int i = 0; i < NOTCHAR; i++)
 883     if (toupper (i) == ub)
 884       setbit (i, c);
 885 }
 886
 887 /* Fetch the next lexical input character from the pattern.  There
 888    must at least one byte of pattern input.  Set DFA->lex.wctok to the
 889    value of the character or to WEOF depending on whether the input is
 890    a valid multibyte character (possibly of length 1).  Then return
 891    the next input byte value, except return EOF if the input is a
 892    multibyte character of length greater than 1.  */
 893 static int
 894 fetch_wc (struct dfa *dfa)
 895 {
 896   int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
 897                              dfa);
 898   int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
 899   dfa->lex.ptr += nbytes;
 900   dfa->lex.left -= nbytes;
 901   return c;
 902 }
 903
 904 /* If there is no more input, report an error about unbalanced brackets.
 905    Otherwise, behave as with fetch_wc (DFA).  */
 906 static int
 907 bracket_fetch_wc (struct dfa *dfa)
 908 {
 909   if (! dfa->lex.left)
 910     dfaerror (_("unbalanced ["));
 911   return fetch_wc (dfa);
 912 }
 913
 914 typedef int predicate (int);
 915
 916 /* The following list maps the names of the Posix named character classes
 917    to predicate functions that determine whether a given character is in
 918    the class.  The leading [ has already been eaten by the lexical
 919    analyzer.  */
 920 struct dfa_ctype
 921 {
 922   const char *name;
 923   predicate *func;
 924   bool single_byte_only;
 925 };
 926
 927 static const struct dfa_ctype prednames[] = {
 928   {"alpha", isalpha, false},
 929   {"upper", isupper, false},
 930   {"lower", islower, false},
 931   {"digit", isdigit, true},
 932   {"xdigit", isxdigit, false},
 933   {"space", isspace, false},
 934   {"punct", ispunct, false},
 935   {"alnum", isalnum, false},
 936   {"print", isprint, false},
 937   {"graph", isgraph, false},
 938   {"cntrl", iscntrl, false},
 939   {"blank", isblank, false},
 940   {NULL, NULL, false}
 941 };
 942
 943 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
 944 find_pred (const char *str)
 945 {
 946   for (int i = 0; prednames[i].name; i++)
 947     if (str_eq (str, prednames[i].name))
 948       return &prednames[i];
 949   return NULL;
 950 }
 951
 952 /* Parse a bracket expression, which possibly includes multibyte
 953    characters.  */
 954 static token
 955 parse_bracket_exp (struct dfa *dfa)
 956 {
 957   /* This is a bracket expression that dfaexec is known to
 958      process correctly.  */
 959   bool known_bracket_exp = true;
 960
 961   /* Used to warn about [:space:].
 962      Bit 0 = first character is a colon.
 963      Bit 1 = last character is a colon.
 964      Bit 2 = includes any other character but a colon.
 965      Bit 3 = includes ranges, char/equiv classes or collation elements.  */
 966   int colon_warning_state;
 967
 968   dfa->lex.brack.nchars = 0;
 969   charclass ccl;
 970   zeroset (&ccl);
 971   int c = bracket_fetch_wc (dfa);
 972   bool invert = c == '^';
 973   if (invert)
 974     {
 975       c = bracket_fetch_wc (dfa);
 976       known_bracket_exp = dfa->localeinfo.simple;
 977     }
 978   wint_t wc = dfa->lex.wctok;
 979   int c1;
 980   wint_t wc1;
 981   colon_warning_state = (c == ':');
 982   do
 983     {
 984       c1 = NOTCHAR;     /* Mark c1 as not initialized.  */
 985       colon_warning_state &= ~2;
 986
 987       /* Note that if we're looking at some other [:...:] construct,
 988          we just treat it as a bunch of ordinary characters.  We can do
 989          this because we assume regex has checked for syntax errors before
 990          dfa is ever called.  */
 991       if (c == '[')
 992         {
 993           c1 = bracket_fetch_wc (dfa);
 994           wc1 = dfa->lex.wctok;
 995
 996           if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
 997               || c1 == '.' || c1 == '=')
 998             {
 999               enum { MAX_BRACKET_STRING_LEN = 32 };
1000               char str[MAX_BRACKET_STRING_LEN + 1];
1001               int len = 0;
1002               for (;;)
1003                 {
1004                   c = bracket_fetch_wc (dfa);
1005                   if (dfa->lex.left == 0
1006                       || (c == c1 && dfa->lex.ptr[0] == ']'))
1007                     break;
1008                   if (len < MAX_BRACKET_STRING_LEN)
1009                     str[len++] = c;
1010                   else
1011                     /* This is in any case an invalid class name.  */
1012                     str[0] = '\0';
1013                 }
1014               str[len] = '\0';
1015
1016               /* Fetch bracket.  */
1017               c = bracket_fetch_wc (dfa);
1018               wc = dfa->lex.wctok;
1019               if (c1 == ':')
1020                 /* Build character class.  POSIX allows character
1021                    classes to match multicharacter collating elements,
1022                    but the regex code does not support that, so do not
1023                    worry about that possibility.  */
1024                 {
1025                   char const *class
1026                     = (dfa->syntax.case_fold && (str_eq (str, "upper")
1027                                                  || str_eq (str, "lower"))
1028                        ? "alpha" : str);
1029                   const struct dfa_ctype *pred = find_pred (class);
1030                   if (!pred)
1031                     dfaerror (_("invalid character class"));
1032
1033                   if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1034                     known_bracket_exp = false;
1035                   else
1036                     for (int c2 = 0; c2 < NOTCHAR; ++c2)
1037                       if (pred->func (c2))
1038                         setbit (c2, &ccl);
1039                 }
1040               else
1041                 known_bracket_exp = false;
1042
1043               colon_warning_state |= 8;
1044
1045               /* Fetch new lookahead character.  */
1046               c1 = bracket_fetch_wc (dfa);
1047               wc1 = dfa->lex.wctok;
1048               continue;
1049             }
1050
1051           /* We treat '[' as a normal character here.  c/c1/wc/wc1
1052              are already set up.  */
1053         }
1054
1055       if (c == '\\'
1056           && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1057         {
1058           c = bracket_fetch_wc (dfa);
1059           wc = dfa->lex.wctok;
1060         }
1061
1062       if (c1 == NOTCHAR)
1063         {
1064           c1 = bracket_fetch_wc (dfa);
1065           wc1 = dfa->lex.wctok;
1066         }
1067
1068       if (c1 == '-')
1069         /* build range characters.  */
1070         {
1071           int c2 = bracket_fetch_wc (dfa);
1072           wint_t wc2 = dfa->lex.wctok;
1073
1074           /* A bracket expression like [a-[.aa.]] matches an unknown set.
1075              Treat it like [-a[.aa.]] while parsing it, and
1076              remember that the set is unknown.  */
1077           if (c2 == '[' && dfa->lex.ptr[0] == '.')
1078             {
1079               known_bracket_exp = false;
1080               c2 = ']';
1081             }
1082
1083           if (c2 == ']')
1084             {
1085               /* In the case [x-], the - is an ordinary hyphen,
1086                  which is left in c1, the lookahead character.  */
1087               dfa->lex.ptr--;
1088               dfa->lex.left++;
1089             }
1090           else
1091             {
1092               if (c2 == '\\' && (dfa->syntax.syntax_bits
1093                                  & RE_BACKSLASH_ESCAPE_IN_LISTS))
1094                 {
1095                   c2 = bracket_fetch_wc (dfa);
1096                   wc2 = dfa->lex.wctok;
1097                 }
1098
1099               colon_warning_state |= 8;
1100               c1 = bracket_fetch_wc (dfa);
1101               wc1 = dfa->lex.wctok;
1102
1103               /* Treat [x-y] as a range if x != y.  */
1104               if (wc != wc2 || wc == WEOF)
1105                 {
1106                   if (dfa->localeinfo.simple
1107                       || (c_isdigit (c) & c_isdigit (c2)))
1108                     {
1109                       for (int ci = c; ci <= c2; ci++)
1110                         if (dfa->syntax.case_fold && isalpha (ci))
1111                           setbit_case_fold_c (ci, &ccl);
1112                         else
1113                           setbit (ci, &ccl);
1114                     }
1115                   else
1116                     known_bracket_exp = false;
1117
1118                   continue;
1119                 }
1120             }
1121         }
1122
1123       colon_warning_state |= (c == ':') ? 2 : 4;
1124
1125       if (!dfa->localeinfo.multibyte)
1126         {
1127           if (dfa->syntax.case_fold && isalpha (c))
1128             setbit_case_fold_c (c, &ccl);
1129           else
1130             setbit (c, &ccl);
1131           continue;
1132         }
1133
1134       if (wc == WEOF)
1135         known_bracket_exp = false;
1136       else
1137         {
1138           char32_t folded[CASE_FOLDED_BUFSIZE + 1];
1139           int n = (dfa->syntax.case_fold
1140                    ? case_folded_counterparts (wc, folded + 1) + 1
1141                    : 1);
1142           folded[0] = wc;
1143           for (int i = 0; i < n; i++)
1144             if (!setbit_wc (folded[i], &ccl))
1145               {
1146                 dfa->lex.brack.chars
1147                   = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1148                                    &dfa->lex.brack.nchars_alloc, -1,
1149                                    sizeof *dfa->lex.brack.chars);
1150                 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1151               }
1152         }
1153     }
1154   while ((wc = wc1, (c = c1) != ']'));
1155
1156   if (colon_warning_state == 7)
1157     {
1158       char const *msg
1159         = _("character class syntax is [[:space:]], not [:space:]");
1160       if (dfa->syntax.dfaopts & DFA_CONFUSING_BRACKETS_ERROR)
1161         dfaerror (msg);
1162       else
1163         dfawarn (msg);
1164     }
1165
1166   if (! known_bracket_exp)
1167     return BACKREF;
1168
1169   if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1170     {
1171       dfa->lex.brack.invert = invert;
1172       dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1173       return MBCSET;
1174     }
1175
1176   if (invert)
1177     {
1178       notset (&ccl);
1179       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1180         clrbit ('\n', &ccl);
1181     }
1182
1183   return CSET + charclass_index (dfa, &ccl);
1184 }
1185
1186 struct lexptr
1187 {
1188   char const *ptr;
1189   idx_t left;
1190 };
1191
1192 static void
1193 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1194 {
1195   ls->ptr = dfa->lex.ptr;
1196   ls->left = dfa->lex.left;
1197   dfa->lex.ptr = s;
1198   dfa->lex.left = strlen (s);
1199 }
1200
1201 static void
1202 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1203 {
1204   dfa->lex.ptr = ls->ptr;
1205   dfa->lex.left = ls->left;
1206 }
1207
1208 static token
1209 lex (struct dfa *dfa)
1210 {
1211   bool backslash = false;
1212
1213   /* Basic plan: We fetch a character.  If it's a backslash,
1214      we set the backslash flag and go through the loop again.
1215      On the plus side, this avoids having a duplicate of the
1216      main switch inside the backslash case.  On the minus side,
1217      it means that just about every case tests the backslash flag.  */
1218   for (int i = 0; ; i++)
1219     {
1220       /* This loop should consume at most a backslash and some other
1221          character.  */
1222       assume (i < 2);
1223
1224       if (! dfa->lex.left)
1225         return dfa->lex.lasttok = END;
1226       int c = fetch_wc (dfa);
1227
1228       switch (c)
1229         {
1230         case '\\':
1231           if (backslash)
1232             goto normal_char;
1233           if (dfa->lex.left == 0)
1234             dfaerror (_("unfinished \\ escape"));
1235           backslash = true;
1236           break;
1237
1238         case '^':
1239           if (backslash)
1240             goto normal_char;
1241           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1242               || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1243               || dfa->lex.lasttok == OR)
1244             return dfa->lex.lasttok = BEGLINE;
1245           goto normal_char;
1246
1247         case '$':
1248           if (backslash)
1249             goto normal_char;
1250           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1251               || dfa->lex.left == 0
1252               || ((dfa->lex.left
1253                    > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1254                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1255                                    & (dfa->lex.ptr[0] == '\\')]
1256                       == ')'))
1257               || ((dfa->lex.left
1258                    > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1259                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1260                                    & (dfa->lex.ptr[0] == '\\')]
1261                       == '|'))
1262               || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1263                   && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1264             return dfa->lex.lasttok = ENDLINE;
1265           goto normal_char;
1266
1267         case '1':
1268         case '2':
1269         case '3':
1270         case '4':
1271         case '5':
1272         case '6':
1273         case '7':
1274         case '8':
1275         case '9':
1276           if (!backslash)
1277             goto normal_char;
1278           if (dfa->syntax.syntax_bits & RE_NO_BK_REFS)
1279             goto stray_backslash;
1280
1281           dfa->lex.laststart = false;
1282           return dfa->lex.lasttok = BACKREF;
1283
1284         case '`':
1285           if (!backslash)
1286             goto normal_char;
1287           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1288             goto stray_backslash;
1289
1290           /* FIXME: should be beginning of string */
1291           return dfa->lex.lasttok = BEGLINE;
1292
1293         case '\'':
1294           if (!backslash)
1295             goto normal_char;
1296           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1297             goto stray_backslash;
1298
1299           /* FIXME: should be end of string */
1300           return dfa->lex.lasttok = ENDLINE;
1301
1302         case '<':
1303           if (!backslash)
1304             goto normal_char;
1305           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1306             goto stray_backslash;
1307
1308           return dfa->lex.lasttok = BEGWORD;
1309
1310         case '>':
1311           if (!backslash)
1312             goto normal_char;
1313           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1314             goto stray_backslash;
1315
1316           return dfa->lex.lasttok = ENDWORD;
1317
1318         case 'b':
1319           if (!backslash)
1320             goto normal_char;
1321           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1322             goto stray_backslash;
1323
1324           return dfa->lex.lasttok = LIMWORD;
1325
1326         case 'B':
1327           if (!backslash)
1328             goto normal_char;
1329           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1330             goto stray_backslash;
1331
1332           return dfa->lex.lasttok = NOTLIMWORD;
1333
1334         case '?':
1335           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1336             goto default_case;
1337           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1338             goto normal_char;
1339           if (dfa->lex.laststart)
1340             {
1341               if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1342                 goto default_case;
1343               if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1344                 dfawarn (_("? at start of expression"));
1345             }
1346           return dfa->lex.lasttok = QMARK;
1347
1348         case '*':
1349           if (backslash)
1350             goto normal_char;
1351           if (dfa->lex.laststart)
1352             {
1353               if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1354                 goto default_case;
1355               if (dfa->syntax.dfaopts & DFA_STAR_WARN)
1356                 dfawarn (_("* at start of expression"));
1357             }
1358           return dfa->lex.lasttok = STAR;
1359
1360         case '+':
1361           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1362             goto default_case;
1363           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1364             goto normal_char;
1365           if (dfa->lex.laststart)
1366             {
1367               if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1368                 goto default_case;
1369               if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1370                 dfawarn (_("+ at start of expression"));
1371             }
1372           return dfa->lex.lasttok = PLUS;
1373
1374         case '{':
1375           if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1376             goto default_case;
1377           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1378             goto normal_char;
1379
1380           /* Cases:
1381              {M} - exact count
1382              {M,} - minimum count, maximum is infinity
1383              {,N} - 0 through N
1384              {,} - 0 to infinity (same as '*')
1385              {M,N} - M through N */
1386           {
1387             char const *p = dfa->lex.ptr;
1388             char const *lim = p + dfa->lex.left;
1389             dfa->lex.minrep = dfa->lex.maxrep = -1;
1390             for (; p != lim && c_isdigit (*p); p++)
1391               dfa->lex.minrep = (dfa->lex.minrep < 0
1392                                  ? *p - '0'
1393                                  : MIN (RE_DUP_MAX + 1,
1394                                         dfa->lex.minrep * 10 + *p - '0'));
1395             if (p != lim)
1396               {
1397                 if (*p != ',')
1398                   dfa->lex.maxrep = dfa->lex.minrep;
1399                 else
1400                   {
1401                     if (dfa->lex.minrep < 0)
1402                       dfa->lex.minrep = 0;
1403                     while (++p != lim && c_isdigit (*p))
1404                       dfa->lex.maxrep
1405                         = (dfa->lex.maxrep < 0
1406                            ? *p - '0'
1407                            : MIN (RE_DUP_MAX + 1,
1408                                   dfa->lex.maxrep * 10 + *p - '0'));
1409                   }
1410               }
1411             bool invalid_content
1412               = ! ((! backslash || (p != lim && *p++ == '\\'))
1413                    && p != lim && *p++ == '}'
1414                    && 0 <= dfa->lex.minrep
1415                    && (dfa->lex.maxrep < 0
1416                        || dfa->lex.minrep <= dfa->lex.maxrep));
1417             if (invalid_content
1418                 && (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD))
1419               goto normal_char;
1420             if (dfa->lex.laststart)
1421               {
1422                 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1423                   goto default_case;
1424                 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1425                   dfawarn (_("{...} at start of expression"));
1426               }
1427             if (invalid_content)
1428               dfaerror (_("invalid content of \\{\\}"));
1429             if (RE_DUP_MAX < dfa->lex.maxrep)
1430               dfaerror (_("regular expression too big"));
1431             dfa->lex.ptr = p;
1432             dfa->lex.left = lim - p;
1433           }
1434           dfa->lex.laststart = false;
1435           return dfa->lex.lasttok = REPMN;
1436
1437         case '|':
1438           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1439             goto default_case;
1440           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1441             goto normal_char;
1442           dfa->lex.laststart = true;
1443           return dfa->lex.lasttok = OR;
1444
1445         case '\n':
1446           if (!(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1447             goto default_case;
1448           if (backslash)
1449             goto normal_char;
1450           dfa->lex.laststart = true;
1451           return dfa->lex.lasttok = OR;
1452
1453         case '(':
1454           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1455             goto normal_char;
1456           dfa->lex.parens++;
1457           dfa->lex.laststart = true;
1458           return dfa->lex.lasttok = LPAREN;
1459
1460         case ')':
1461           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1462             goto normal_char;
1463           if (dfa->lex.parens == 0
1464               && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1465             goto normal_char;
1466           dfa->lex.parens--;
1467           dfa->lex.laststart = false;
1468           return dfa->lex.lasttok = RPAREN;
1469
1470         case '.':
1471           if (backslash)
1472             goto normal_char;
1473           if (dfa->canychar < 0)
1474             {
1475               charclass ccl;
1476               fillset (&ccl);
1477               if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1478                 clrbit ('\n', &ccl);
1479               if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1480                 clrbit ('\0', &ccl);
1481               if (dfa->localeinfo.multibyte)
1482                 for (int c2 = 0; c2 < NOTCHAR; c2++)
1483                   if (dfa->localeinfo.sbctowc[c2] == WEOF)
1484                     clrbit (c2, &ccl);
1485               dfa->canychar = charclass_index (dfa, &ccl);
1486             }
1487           dfa->lex.laststart = false;
1488           return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1489                                      ? ANYCHAR
1490                                      : CSET + dfa->canychar);
1491
1492         case 's':
1493         case 'S':
1494           if (!backslash)
1495             goto normal_char;
1496           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1497             goto stray_backslash;
1498
1499           if (!dfa->localeinfo.multibyte)
1500             {
1501               charclass ccl;
1502               zeroset (&ccl);
1503               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1504                 if (isspace (c2))
1505                   setbit (c2, &ccl);
1506               if (c == 'S')
1507                 notset (&ccl);
1508               dfa->lex.laststart = false;
1509               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1510             }
1511
1512           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1513              add_utf8_anychar, makes sense.  */
1514
1515           /* \s and \S are documented to be equivalent to [[:space:]] and
1516              [^[:space:]] respectively, so tell the lexer to process those
1517              strings, each minus its "already processed" '['.  */
1518           {
1519             struct lexptr ls;
1520             push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1521             dfa->lex.lasttok = parse_bracket_exp (dfa);
1522             pop_lex_state (dfa, &ls);
1523           }
1524
1525           dfa->lex.laststart = false;
1526           return dfa->lex.lasttok;
1527
1528         case 'w':
1529         case 'W':
1530           if (!backslash)
1531             goto normal_char;
1532           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1533             goto stray_backslash;
1534
1535           if (!dfa->localeinfo.multibyte)
1536             {
1537               charclass ccl;
1538               zeroset (&ccl);
1539               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1540                 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1541                   setbit (c2, &ccl);
1542               if (c == 'W')
1543                 notset (&ccl);
1544               dfa->lex.laststart = false;
1545               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1546             }
1547
1548           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1549              add_utf8_anychar, makes sense.  */
1550
1551           /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1552              [^_[:alnum:]] respectively, so tell the lexer to process those
1553              strings, each minus its "already processed" '['.  */
1554           {
1555             struct lexptr ls;
1556             push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1557             dfa->lex.lasttok = parse_bracket_exp (dfa);
1558             pop_lex_state (dfa, &ls);
1559           }
1560
1561           dfa->lex.laststart = false;
1562           return dfa->lex.lasttok;
1563
1564         case '[':
1565           if (backslash)
1566             goto normal_char;
1567           dfa->lex.laststart = false;
1568           return dfa->lex.lasttok = parse_bracket_exp (dfa);
1569
1570         default:
1571         default_case:
1572           if (!backslash)
1573             goto normal_char;
1574         stray_backslash:
1575           if (dfa->syntax.dfaopts & DFA_STRAY_BACKSLASH_WARN)
1576             {
1577               char const *msg;
1578               char msgbuf[100];
1579               if (!c32isprint (dfa->lex.wctok))
1580                 msg = _("stray \\ before unprintable character");
1581               else if (c32isspace (dfa->lex.wctok))
1582                 msg = _("stray \\ before white space");
1583               else
1584                 {
1585                   char buf[MB_LEN_MAX + 1];
1586                   mbstate_t s;
1587                   mbszero (&s);
1588                   size_t stored_bytes = c32rtomb (buf, dfa->lex.wctok, &s);
1589                   if (stored_bytes < (size_t) -1)
1590                     {
1591                       buf[stored_bytes] = '\0';
1592                       int n = snprintf (msgbuf, sizeof msgbuf,
1593                                         _("stray \\ before %s"), buf);
1594                       msg = 0 <= n && n < sizeof msgbuf ? msgbuf : _("stray \\");
1595                     }
1596                   else
1597                     msg = _("stray \\");
1598                 }
1599               dfawarn (msg);
1600             }
1601           FALLTHROUGH;
1602         case ']': case '}':
1603         normal_char:
1604           dfa->lex.laststart = false;
1605           /* For multibyte character sets, folding is done in atom.  Always
1606              return WCHAR.  */
1607           if (dfa->localeinfo.multibyte)
1608             return dfa->lex.lasttok = WCHAR;
1609
1610           if (dfa->syntax.case_fold && isalpha (c))
1611             {
1612               charclass ccl;
1613               zeroset (&ccl);
1614               setbit_case_fold_c (c, &ccl);
1615               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1616             }
1617
1618           return dfa->lex.lasttok = c;
1619         }
1620     }
1621 }
1622
1623 static void
1624 addtok_mb (struct dfa *dfa, token t, char mbprop)
1625 {
1626   if (dfa->talloc == dfa->tindex)
1627     {
1628       dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1629                              sizeof *dfa->tokens);
1630       if (dfa->localeinfo.multibyte)
1631         dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1632                                              sizeof *dfa->multibyte_prop);
1633     }
1634   if (dfa->localeinfo.multibyte)
1635     dfa->multibyte_prop[dfa->tindex] = mbprop;
1636   dfa->tokens[dfa->tindex++] = t;
1637
1638   switch (t)
1639     {
1640     case QMARK:
1641     case STAR:
1642     case PLUS:
1643       break;
1644
1645     case CAT:
1646     case OR:
1647       dfa->parse.depth--;
1648       break;
1649
1650     case EMPTY:
1651       dfa->epsilon = true;
1652       goto increment_depth;
1653
1654     case BACKREF:
1655       dfa->fast = false;
1656       goto increment_nleaves;
1657
1658     case BEGLINE:
1659     case ENDLINE:
1660     case BEGWORD:
1661     case ENDWORD:
1662     case LIMWORD:
1663     case NOTLIMWORD:
1664       dfa->epsilon = true;
1665       FALLTHROUGH;
1666     default:
1667     increment_nleaves:
1668       dfa->nleaves++;
1669     increment_depth:
1670       dfa->parse.depth++;
1671       if (dfa->depth < dfa->parse.depth)
1672         dfa->depth = dfa->parse.depth;
1673       break;
1674     }
1675 }
1676
1677 static void addtok_wc (struct dfa *dfa, wint_t wc);
1678
1679 /* Add the given token to the parse tree, maintaining the depth count and
1680    updating the maximum depth if necessary.  */
1681 static void
1682 addtok (struct dfa *dfa, token t)
1683 {
1684   if (dfa->localeinfo.multibyte && t == MBCSET)
1685     {
1686       bool need_or = false;
1687
1688       /* Extract wide characters into alternations for better performance.
1689          This does not require UTF-8.  */
1690       for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1691         {
1692           addtok_wc (dfa, dfa->lex.brack.chars[i]);
1693           if (need_or)
1694             addtok (dfa, OR);
1695           need_or = true;
1696         }
1697       dfa->lex.brack.nchars = 0;
1698
1699       /* Wide characters have been handled above, so it is possible
1700          that the set is empty now.  Do nothing in that case.  */
1701       if (dfa->lex.brack.cset != -1)
1702         {
1703           addtok (dfa, CSET + dfa->lex.brack.cset);
1704           if (need_or)
1705             addtok (dfa, OR);
1706         }
1707     }
1708   else
1709     {
1710       addtok_mb (dfa, t, 3);
1711     }
1712 }
1713
1714 /* We treat a multibyte character as a single atom, so that DFA
1715    can treat a multibyte character as a single expression.
1716
1717    e.g., we construct the following tree from "<mb1><mb2>".
1718    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1719    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1720 static void
1721 addtok_wc (struct dfa *dfa, wint_t wc)
1722 {
1723   unsigned char buf[MB_LEN_MAX];
1724   mbstate_t s;
1725   mbszero (&s);
1726   size_t stored_bytes = c32rtomb ((char *) buf, wc, &s);
1727   int buflen;
1728
1729   if (stored_bytes != (size_t) -1)
1730     buflen = stored_bytes;
1731   else
1732     {
1733       /* This is merely stop-gap.  buf[0] is undefined, yet skipping
1734          the addtok_mb call altogether can corrupt the heap.  */
1735       buflen = 1;
1736       buf[0] = 0;
1737     }
1738
1739   addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1740   for (int i = 1; i < buflen; i++)
1741     {
1742       addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1743       addtok (dfa, CAT);
1744     }
1745 }
1746
1747 static void
1748 add_utf8_anychar (struct dfa *dfa)
1749 {
1750   /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1751      UTF-8 byte sequence has been defined as follows:
1752
1753      ([\x00-\x7f]
1754      |[\xc2-\xdf][\x80-\xbf]
1755      |[\xe0][\xa0-\xbf][\x80-\xbf]
1756      |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1757      |[\xed][\x80-\x9f][\x80-\xbf]
1758      |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1759      |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1760      |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1761
1762      which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1763      where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1764      D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1765      H = [\x80-\x9f], I = [\xf0],
1766      J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1767
1768      This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C".  */
1769
1770   /* Mnemonics for classes containing two or more bytes.  */
1771   enum { A, B, C, E, F, H, J, K, M };
1772
1773   /* Mnemonics for single-byte tokens.  */
1774   enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1775
1776   static charclass const utf8_classes[] = {
1777     /* A. 00-7f: 1-byte sequence.  */
1778     CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1779
1780     /* B. c2-df: 1st byte of a 2-byte sequence.  */
1781     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1782
1783     /* C. 80-bf: non-leading bytes.  */
1784     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1785
1786     /* D. e0 (just a token).  */
1787
1788     /* E. a0-bf: 2nd byte of a "DEC" sequence.  */
1789     CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1790
1791     /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence.  */
1792     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1793
1794     /* G. ed (just a token).  */
1795
1796     /* H. 80-9f: 2nd byte of a "GHC" sequence.  */
1797     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0, 0, 0),
1798
1799     /* I. f0 (just a token).  */
1800
1801     /* J. 90-bf: 2nd byte of an "IJCC" sequence.  */
1802     CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1803
1804     /* K. f1-f3: 1st byte of a "KCCC" sequence.  */
1805     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1806
1807     /* L. f4 (just a token).  */
1808
1809     /* M. 80-8f: 2nd byte of a "LMCC" sequence.  */
1810     CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1811   };
1812
1813   /* Define the character classes that are needed below.  */
1814   if (dfa->utf8_anychar_classes[0] == 0)
1815     {
1816       charclass c = utf8_classes[0];
1817       if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1818         clrbit ('\n', &c);
1819       if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1820         clrbit ('\0', &c);
1821       dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1822
1823       for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1824         dfa->utf8_anychar_classes[i]
1825           = CSET + charclass_index (dfa, &utf8_classes[i]);
1826     }
1827
1828   /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1829      The token buffer is in reverse Polish order, so we get
1830      "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1831       C CAT OR C CAT OR C CAT OR".  */
1832   addtok (dfa, dfa->utf8_anychar_classes[A]);
1833   addtok (dfa, dfa->utf8_anychar_classes[B]);
1834   addtok (dfa, D_token);
1835   addtok (dfa, dfa->utf8_anychar_classes[E]);
1836   addtok (dfa, CAT);
1837   addtok (dfa, OR);
1838   addtok (dfa, G_token);
1839   addtok (dfa, dfa->utf8_anychar_classes[H]);
1840   addtok (dfa, CAT);
1841   addtok (dfa, OR);
1842   addtok (dfa, dfa->utf8_anychar_classes[F]);
1843   addtok (dfa, I_token);
1844   addtok (dfa, dfa->utf8_anychar_classes[J]);
1845   addtok (dfa, CAT);
1846   addtok (dfa, OR);
1847   addtok (dfa, L_token);
1848   addtok (dfa, dfa->utf8_anychar_classes[M]);
1849   addtok (dfa, CAT);
1850   addtok (dfa, OR);
1851   addtok (dfa, dfa->utf8_anychar_classes[K]);
1852   for (int i = 0; i < 3; i++)
1853     {
1854       addtok (dfa, dfa->utf8_anychar_classes[C]);
1855       addtok (dfa, CAT);
1856       addtok (dfa, OR);
1857     }
1858 }
1859
1860 /* The grammar understood by the parser is as follows.
1861
1862    regexp:
1863      regexp OR branch
1864      branch
1865
1866    branch:
1867      branch closure
1868      closure
1869
1870    closure:
1871      closure QMARK
1872      closure STAR
1873      closure PLUS
1874      closure REPMN
1875      atom
1876
1877    atom:
1878      <normal character>
1879      <multibyte character>
1880      ANYCHAR
1881      MBCSET
1882      CSET
1883      BACKREF
1884      BEGLINE
1885      ENDLINE
1886      BEGWORD
1887      ENDWORD
1888      LIMWORD
1889      NOTLIMWORD
1890      LPAREN regexp RPAREN
1891      <empty>
1892
1893    The parser builds a parse tree in postfix form in an array of tokens.  */
1894
1895 static void
1896 atom (struct dfa *dfa)
1897 {
1898   if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1899       || dfa->parse.tok >= CSET
1900       || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1901       || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1902       || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1903       || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1904       || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1905     {
1906       if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1907         {
1908           /* For UTF-8 expand the period to a series of CSETs that define a
1909              valid UTF-8 character.  This avoids using the slow multibyte
1910              path.  I'm pretty sure it would be both profitable and correct to
1911              do it for any encoding; however, the optimization must be done
1912              manually as it is done above in add_utf8_anychar.  So, let's
1913              start with UTF-8: it is the most used, and the structure of the
1914              encoding makes the correctness more obvious.  */
1915           add_utf8_anychar (dfa);
1916         }
1917       else
1918         addtok (dfa, dfa->parse.tok);
1919       dfa->parse.tok = lex (dfa);
1920     }
1921   else if (dfa->parse.tok == WCHAR)
1922     {
1923       if (dfa->lex.wctok == WEOF)
1924         addtok (dfa, BACKREF);
1925       else
1926         {
1927           addtok_wc (dfa, dfa->lex.wctok);
1928
1929           if (dfa->syntax.case_fold)
1930             {
1931               char32_t folded[CASE_FOLDED_BUFSIZE];
1932               int n = case_folded_counterparts (dfa->lex.wctok, folded);
1933               for (int i = 0; i < n; i++)
1934                 {
1935                   addtok_wc (dfa, folded[i]);
1936                   addtok (dfa, OR);
1937                 }
1938             }
1939         }
1940
1941       dfa->parse.tok = lex (dfa);
1942     }
1943   else if (dfa->parse.tok == LPAREN)
1944     {
1945       dfa->parse.tok = lex (dfa);
1946       regexp (dfa);
1947       if (dfa->parse.tok != RPAREN)
1948         dfaerror (_("unbalanced ("));
1949       dfa->parse.tok = lex (dfa);
1950     }
1951   else
1952     addtok (dfa, EMPTY);
1953 }
1954
1955 /* Return the number of tokens in the given subexpression.  */
1956 static idx_t _GL_ATTRIBUTE_PURE
1957 nsubtoks (struct dfa const *dfa, idx_t tindex)
1958 {
1959   switch (dfa->tokens[tindex - 1])
1960     {
1961     default:
1962       return 1;
1963     case QMARK:
1964     case STAR:
1965     case PLUS:
1966       return 1 + nsubtoks (dfa, tindex - 1);
1967     case CAT:
1968     case OR:
1969       {
1970         idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1971         return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1972       }
1973     }
1974 }
1975
1976 /* Copy the given subexpression to the top of the tree.  */
1977 static void
1978 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1979 {
1980   if (dfa->localeinfo.multibyte)
1981     for (idx_t i = 0; i < ntokens; i++)
1982       addtok_mb (dfa, dfa->tokens[tindex + i],
1983                  dfa->multibyte_prop[tindex + i]);
1984   else
1985     for (idx_t i = 0; i < ntokens; i++)
1986       addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1987 }
1988
1989 static void
1990 closure (struct dfa *dfa)
1991 {
1992   atom (dfa);
1993   while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1994          || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1995     if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1996       {
1997         idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1998         idx_t tindex = dfa->tindex - ntokens;
1999         if (dfa->lex.maxrep < 0)
2000           addtok (dfa, PLUS);
2001         if (dfa->lex.minrep == 0)
2002           addtok (dfa, QMARK);
2003         int i;
2004         for (i = 1; i < dfa->lex.minrep; i++)
2005           {
2006             copytoks (dfa, tindex, ntokens);
2007             addtok (dfa, CAT);
2008           }
2009         for (; i < dfa->lex.maxrep; i++)
2010           {
2011             copytoks (dfa, tindex, ntokens);
2012             addtok (dfa, QMARK);
2013             addtok (dfa, CAT);
2014           }
2015         dfa->parse.tok = lex (dfa);
2016       }
2017     else if (dfa->parse.tok == REPMN)
2018       {
2019         dfa->tindex -= nsubtoks (dfa, dfa->tindex);
2020         dfa->parse.tok = lex (dfa);
2021         closure (dfa);
2022       }
2023     else
2024       {
2025         addtok (dfa, dfa->parse.tok);
2026         dfa->parse.tok = lex (dfa);
2027       }
2028 }
2029
2030 static void
2031 branch (struct dfa* dfa)
2032 {
2033   closure (dfa);
2034   while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
2035          && dfa->parse.tok >= 0)
2036     {
2037       closure (dfa);
2038       addtok (dfa, CAT);
2039     }
2040 }
2041
2042 static void
2043 regexp (struct dfa *dfa)
2044 {
2045   branch (dfa);
2046   while (dfa->parse.tok == OR)
2047     {
2048       dfa->parse.tok = lex (dfa);
2049       branch (dfa);
2050       addtok (dfa, OR);
2051     }
2052 }
2053
2054 /* Parse a string S of length LEN into D.  S can include NUL characters.
2055    This is the main entry point for the parser.  */
2056 void
2057 dfaparse (char const *s, idx_t len, struct dfa *d)
2058 {
2059   d->lex.ptr = s;
2060   d->lex.left = len;
2061   d->lex.lasttok = END;
2062   d->lex.laststart = true;
2063
2064   if (!d->syntax.syntax_bits_set)
2065     dfaerror (_("no syntax specified"));
2066
2067   if (!d->nregexps)
2068     addtok (d, BEG);
2069
2070   d->parse.tok = lex (d);
2071   d->parse.depth = d->depth;
2072
2073   regexp (d);
2074
2075   if (d->parse.tok != END)
2076     dfaerror (_("unbalanced )"));
2077
2078   addtok (d, END - d->nregexps);
2079   addtok (d, CAT);
2080
2081   if (d->nregexps)
2082     addtok (d, OR);
2083
2084   ++d->nregexps;
2085 }
2086
2087 /* Some primitives for operating on sets of positions.  */
2088
2089 /* Copy one set to another.  */
2090 static void
2091 copy (position_set const *src, position_set *dst)
2092 {
2093   if (dst->alloc < src->nelem)
2094     {
2095       free (dst->elems);
2096       dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2097                             sizeof *dst->elems);
2098     }
2099   dst->nelem = src->nelem;
2100   if (src->nelem != 0)
2101     memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2102 }
2103
2104 static void
2105 alloc_position_set (position_set *s, idx_t size)
2106 {
2107   s->elems = xnmalloc (size, sizeof *s->elems);
2108   s->alloc = size;
2109   s->nelem = 0;
2110 }
2111
2112 /* Insert position P in set S.  S is maintained in sorted order on
2113    decreasing index.  If there is already an entry in S with P.index
2114    then merge (logically-OR) P's constraints into the one in S.
2115    S->elems must point to an array large enough to hold the resulting set.  */
2116 static void
2117 insert (position p, position_set *s)
2118 {
2119   idx_t count = s->nelem;
2120   idx_t lo = 0, hi = count;
2121   while (lo < hi)
2122     {
2123       idx_t mid = (lo + hi) >> 1;
2124       if (s->elems[mid].index < p.index)
2125         lo = mid + 1;
2126       else if (s->elems[mid].index == p.index)
2127         {
2128           s->elems[mid].constraint |= p.constraint;
2129           return;
2130         }
2131       else
2132         hi = mid;
2133     }
2134
2135   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2136   for (idx_t i = count; i > lo; i--)
2137     s->elems[i] = s->elems[i - 1];
2138   s->elems[lo] = p;
2139   ++s->nelem;
2140 }
2141
2142 static void
2143 append (position p, position_set *s)
2144 {
2145   idx_t count = s->nelem;
2146   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2147   s->elems[s->nelem++] = p;
2148 }
2149
2150 /* Merge S1 and S2 (with the additional constraint C2) into M.  The
2151    result is as if the positions of S1, and of S2 with the additional
2152    constraint C2, were inserted into an initially empty set.  */
2153 static void
2154 merge_constrained (position_set const *s1, position_set const *s2,
2155                    unsigned int c2, position_set *m)
2156 {
2157   idx_t i = 0, j = 0;
2158
2159   if (m->alloc - s1->nelem < s2->nelem)
2160     {
2161       free (m->elems);
2162       m->alloc = s1->nelem;
2163       m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2164     }
2165   m->nelem = 0;
2166   while (i < s1->nelem || j < s2->nelem)
2167     if (! (j < s2->nelem)
2168         || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2169       {
2170         unsigned int c = ((i < s1->nelem && j < s2->nelem
2171                            && s1->elems[i].index == s2->elems[j].index)
2172                           ? s2->elems[j++].constraint & c2
2173                           : 0);
2174         m->elems[m->nelem].index = s1->elems[i].index;
2175         m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2176       }
2177     else
2178       {
2179         if (s2->elems[j].constraint & c2)
2180           {
2181             m->elems[m->nelem].index = s2->elems[j].index;
2182             m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2183           }
2184         j++;
2185       }
2186 }
2187
2188 /* Merge two sets of positions into a third.  The result is exactly as if
2189    the positions of both sets were inserted into an initially empty set.  */
2190 static void
2191 merge (position_set const *s1, position_set const *s2, position_set *m)
2192 {
2193   merge_constrained (s1, s2, -1, m);
2194 }
2195
2196 /* Merge into DST all the elements of SRC, possibly destroying
2197    the contents of the temporary M.  */
2198 static void
2199 merge2 (position_set *dst, position_set const *src, position_set *m)
2200 {
2201   if (src->nelem < 4)
2202     {
2203       for (idx_t i = 0; i < src->nelem; i++)
2204         insert (src->elems[i], dst);
2205     }
2206    else
2207     {
2208       merge (src, dst, m);
2209       copy (m, dst);
2210     }
2211 }
2212
2213 /* Delete a position from a set.  Return the nonzero constraint of the
2214    deleted position, or zero if there was no such position.  */
2215 static unsigned int
2216 delete (idx_t del, position_set *s)
2217 {
2218   idx_t count = s->nelem;
2219   idx_t lo = 0, hi = count;
2220   while (lo < hi)
2221     {
2222       idx_t mid = (lo + hi) >> 1;
2223       if (s->elems[mid].index < del)
2224         lo = mid + 1;
2225       else if (s->elems[mid].index == del)
2226         {
2227           unsigned int c = s->elems[mid].constraint;
2228           idx_t i;
2229           for (i = mid; i + 1 < count; i++)
2230             s->elems[i] = s->elems[i + 1];
2231           s->nelem = i;
2232           return c;
2233         }
2234       else
2235         hi = mid;
2236     }
2237   return 0;
2238 }
2239
2240 /* Replace a position with the followed set.  */
2241 static void
2242 replace (position_set *dst, idx_t del, position_set *add,
2243          unsigned int constraint, position_set *tmp)
2244 {
2245   unsigned int c = delete (del, dst) & constraint;
2246
2247   if (c)
2248     {
2249       copy (dst, tmp);
2250       merge_constrained (tmp, add, c, dst);
2251     }
2252 }
2253
2254 /* Find the index of the state corresponding to the given position set with
2255    the given preceding context, or create a new state if there is no such
2256    state.  Context tells whether we got here on a newline or letter.  */
2257 static state_num
2258 state_index (struct dfa *d, position_set const *s, int context)
2259 {
2260   size_t hash = 0;
2261   int constraint = 0;
2262   state_num i;
2263
2264   for (i = 0; i < s->nelem; ++i)
2265     {
2266       idx_t ind = s->elems[i].index;
2267       hash ^= ind + s->elems[i].constraint;
2268     }
2269
2270   /* Try to find a state that exactly matches the proposed one.  */
2271   for (i = 0; i < d->sindex; ++i)
2272     {
2273       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2274           || context != d->states[i].context)
2275         continue;
2276       state_num j;
2277       for (j = 0; j < s->nelem; ++j)
2278         if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2279             || s->elems[j].index != d->states[i].elems.elems[j].index)
2280           break;
2281       if (j == s->nelem)
2282         return i;
2283     }
2284
2285 #ifdef DEBUG
2286   fprintf (stderr, "new state %td\n nextpos:", i);
2287   for (state_num j = 0; j < s->nelem; j++)
2288     {
2289       fprintf (stderr, " %td:", s->elems[j].index);
2290       prtok (d->tokens[s->elems[j].index]);
2291     }
2292   fprintf (stderr, "\n context:");
2293   if (context ^ CTX_ANY)
2294     {
2295       if (context & CTX_NONE)
2296         fprintf (stderr, " CTX_NONE");
2297       if (context & CTX_LETTER)
2298         fprintf (stderr, " CTX_LETTER");
2299       if (context & CTX_NEWLINE)
2300         fprintf (stderr, " CTX_NEWLINE");
2301     }
2302   else
2303     fprintf (stderr, " CTX_ANY");
2304   fprintf (stderr, "\n");
2305 #endif
2306
2307   for (state_num j = 0; j < s->nelem; j++)
2308     {
2309       int c = d->constraints[s->elems[j].index];
2310
2311       if (c != 0)
2312         {
2313           if (succeeds_in_context (c, context, CTX_ANY))
2314             constraint |= c;
2315         }
2316       else if (d->tokens[s->elems[j].index] == BACKREF)
2317         constraint = NO_CONSTRAINT;
2318     }
2319
2320
2321   /* Create a new state.  */
2322   d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2323                              sizeof *d->states);
2324   d->states[i].hash = hash;
2325   alloc_position_set (&d->states[i].elems, s->nelem);
2326   copy (s, &d->states[i].elems);
2327   d->states[i].context = context;
2328   d->states[i].constraint = constraint;
2329   d->states[i].mbps.nelem = 0;
2330   d->states[i].mbps.elems = NULL;
2331   d->states[i].mb_trindex = -1;
2332
2333   ++d->sindex;
2334
2335   return i;
2336 }
2337
2338 /* Find the epsilon closure of D's set of positions.  If any position of the set
2339    contains a symbol that matches the empty string in some context, replace
2340    that position with the elements of its follow labeled with an appropriate
2341    constraint.  Repeat exhaustively until no funny positions are left.
2342    S->elems must be large enough to hold the result.  BACKWARD is D's
2343    backward set; use and update it too.  */
2344 static void
2345 epsclosure (struct dfa const *d, position_set *backward)
2346 {
2347   position_set tmp;
2348   alloc_position_set (&tmp, d->nleaves);
2349   for (idx_t i = 0; i < d->tindex; i++)
2350     if (0 < d->follows[i].nelem)
2351       {
2352         unsigned int constraint;
2353         switch (d->tokens[i])
2354           {
2355           default:
2356             continue;
2357
2358           case BEGLINE:
2359             constraint = BEGLINE_CONSTRAINT;
2360             break;
2361           case ENDLINE:
2362             constraint = ENDLINE_CONSTRAINT;
2363             break;
2364           case BEGWORD:
2365             constraint = BEGWORD_CONSTRAINT;
2366             break;
2367           case ENDWORD:
2368             constraint = ENDWORD_CONSTRAINT;
2369             break;
2370           case LIMWORD:
2371             constraint = LIMWORD_CONSTRAINT;
2372             break;
2373           case NOTLIMWORD:
2374             constraint = NOTLIMWORD_CONSTRAINT;
2375             break;
2376           case EMPTY:
2377             constraint = NO_CONSTRAINT;
2378             break;
2379           }
2380
2381         delete (i, &d->follows[i]);
2382
2383         for (idx_t j = 0; j < backward[i].nelem; j++)
2384           replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2385                    constraint, &tmp);
2386         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2387           replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2388                    NO_CONSTRAINT, &tmp);
2389       }
2390   free (tmp.elems);
2391 }
2392
2393 /* Returns the set of contexts for which there is at least one
2394    character included in C.  */
2395
2396 static int
2397 charclass_context (struct dfa const *dfa, charclass const *c)
2398 {
2399   int context = 0;
2400
2401   for (int j = 0; j < CHARCLASS_WORDS; j++)
2402     {
2403       if (c->w[j] & dfa->syntax.newline.w[j])
2404         context |= CTX_NEWLINE;
2405       if (c->w[j] & dfa->syntax.letters.w[j])
2406         context |= CTX_LETTER;
2407       if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2408         context |= CTX_NONE;
2409     }
2410
2411   return context;
2412 }
2413
2414 /* Returns the contexts on which the position set S depends.  Each context
2415    in the set of returned contexts (let's call it SC) may have a different
2416    follow set than other contexts in SC, and also different from the
2417    follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
2418    in the complement set will have the same follow set.  */
2419
2420 static int _GL_ATTRIBUTE_PURE
2421 state_separate_contexts (struct dfa *d, position_set const *s)
2422 {
2423   int separate_contexts = 0;
2424
2425   for (idx_t j = 0; j < s->nelem; j++)
2426     separate_contexts |= d->separates[s->elems[j].index];
2427
2428   return separate_contexts;
2429 }
2430
2431 enum
2432 {
2433   /* Single token is repeated.  It is distinguished from non-repeated.  */
2434   OPT_REPEAT = (1 << 0),
2435
2436   /* Multiple tokens are repeated.  This flag is on at head of tokens.  The
2437      node is not merged.  */
2438   OPT_LPAREN = (1 << 1),
2439
2440   /* Multiple branches are joined.  The node is not merged.  */
2441   OPT_RPAREN = (1 << 2),
2442
2443   /* The node is walked.  If the node is found in walking again, OPT_RPAREN
2444      flag is turned on. */
2445   OPT_WALKED = (1 << 3),
2446
2447   /* The node is queued.  The node is not queued again.  */
2448   OPT_QUEUED = (1 << 4)
2449 };
2450
2451 static void
2452 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2453                  position_set *merged)
2454 {
2455   position_set *follows = d->follows;
2456   idx_t nelem = 0;
2457
2458   for (idx_t i = 0; i < follows[tindex].nelem; i++)
2459     {
2460       idx_t sindex = follows[tindex].elems[i].index;
2461
2462       /* Skip the node as pruned in future.  */
2463       unsigned int iconstraint = follows[tindex].elems[i].constraint;
2464       if (iconstraint == 0)
2465         continue;
2466
2467       if (d->tokens[follows[tindex].elems[i].index] <= END)
2468         {
2469           d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2470           continue;
2471         }
2472
2473       if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2474         {
2475           idx_t j;
2476
2477           for (j = 0; j < nelem; j++)
2478             {
2479               idx_t dindex = follows[tindex].elems[j].index;
2480
2481               if (dindex == tindex)
2482                 continue;
2483
2484               if (follows[tindex].elems[j].constraint != iconstraint)
2485                 continue;
2486
2487               if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2488                 continue;
2489
2490               if (d->tokens[sindex] != d->tokens[dindex])
2491                 continue;
2492
2493               if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2494                 continue;
2495
2496               if (flags[sindex] & OPT_REPEAT)
2497                 delete (sindex, &follows[sindex]);
2498
2499               merge2 (&follows[dindex], &follows[sindex], merged);
2500
2501               break;
2502             }
2503
2504           if (j < nelem)
2505             continue;
2506         }
2507
2508       follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2509       flags[sindex] |= OPT_QUEUED;
2510     }
2511
2512   follows[tindex].nelem = nelem;
2513 }
2514
2515 static int
2516 compare (const void *a, const void *b)
2517 {
2518   position const *p = a, *q = b;
2519   return (p->index > q->index) - (p->index < q->index);
2520 }
2521
2522 static void
2523 reorder_tokens (struct dfa *d)
2524 {
2525   idx_t nleaves = 0;
2526   ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2527   map[0] = nleaves++;
2528   for (idx_t i = 1; i < d->tindex; i++)
2529     map[i] = -1;
2530
2531   token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2532   position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2533   int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2534   char *multibyte_prop = (d->localeinfo.multibyte
2535                           ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2536                           : NULL);
2537
2538   for (idx_t i = 0; i < d->tindex; i++)
2539     {
2540       if (map[i] < 0)
2541         {
2542           free (d->follows[i].elems);
2543           d->follows[i].elems = NULL;
2544           d->follows[i].nelem = 0;
2545           continue;
2546         }
2547
2548       tokens[map[i]] = d->tokens[i];
2549       follows[map[i]] = d->follows[i];
2550       constraints[map[i]] = d->constraints[i];
2551
2552       if (multibyte_prop != NULL)
2553         multibyte_prop[map[i]] = d->multibyte_prop[i];
2554
2555       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2556         {
2557           if (map[d->follows[i].elems[j].index] == -1)
2558             map[d->follows[i].elems[j].index] = nleaves++;
2559
2560           d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2561         }
2562
2563       qsort (d->follows[i].elems, d->follows[i].nelem,
2564              sizeof *d->follows[i].elems, compare);
2565     }
2566
2567   for (idx_t i = 0; i < nleaves; i++)
2568     {
2569       d->tokens[i] = tokens[i];
2570       d->follows[i] = follows[i];
2571       d->constraints[i] = constraints[i];
2572
2573       if (multibyte_prop != NULL)
2574         d->multibyte_prop[i] = multibyte_prop[i];
2575     }
2576
2577   d->tindex = d->nleaves = nleaves;
2578
2579   free (tokens);
2580   free (follows);
2581   free (constraints);
2582   free (multibyte_prop);
2583   free (map);
2584 }
2585
2586 static void
2587 dfaoptimize (struct dfa *d)
2588 {
2589   char *flags = xizalloc (d->tindex);
2590
2591   for (idx_t i = 0; i < d->tindex; i++)
2592     {
2593       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2594         {
2595           if (d->follows[i].elems[j].index == i)
2596             flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2597           else if (d->follows[i].elems[j].index < i)
2598             flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2599           else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2600             flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2601           else
2602             flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2603         }
2604     }
2605
2606   flags[0] |= OPT_QUEUED;
2607
2608   position_set merged0;
2609   position_set *merged = &merged0;
2610   alloc_position_set (merged, d->nleaves);
2611
2612   d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2613
2614   for (idx_t i = 0; i < d->tindex; i++)
2615     if (flags[i] & OPT_QUEUED)
2616       merge_nfa_state (d, i, flags, merged);
2617
2618   reorder_tokens (d);
2619
2620   free (merged->elems);
2621   free (flags);
2622 }
2623
2624 /* Perform bottom-up analysis on the parse tree, computing various functions.
2625    Note that at this point, we're pretending constructs like \< are real
2626    characters rather than constraints on what can follow them.
2627
2628    Nullable:  A node is nullable if it is at the root of a regexp that can
2629    match the empty string.
2630    *  EMPTY leaves are nullable.
2631    * No other leaf is nullable.
2632    * A QMARK or STAR node is nullable.
2633    * A PLUS node is nullable if its argument is nullable.
2634    * A CAT node is nullable if both its arguments are nullable.
2635    * An OR node is nullable if either argument is nullable.
2636
2637    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
2638    that could correspond to the first character of a string matching the
2639    regexp rooted at the given node.
2640    * EMPTY leaves have empty firstpos.
2641    * The firstpos of a nonempty leaf is that leaf itself.
2642    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2643      argument.
2644    * The firstpos of a CAT node is the firstpos of the left argument, union
2645      the firstpos of the right if the left argument is nullable.
2646    * The firstpos of an OR node is the union of firstpos of each argument.
2647
2648    Lastpos:  The lastpos of a node is the set of positions that could
2649    correspond to the last character of a string matching the regexp at
2650    the given node.
2651    * EMPTY leaves have empty lastpos.
2652    * The lastpos of a nonempty leaf is that leaf itself.
2653    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2654      argument.
2655    * The lastpos of a CAT node is the lastpos of its right argument, union
2656      the lastpos of the left if the right argument is nullable.
2657    * The lastpos of an OR node is the union of the lastpos of each argument.
2658
2659    Follow:  The follow of a position is the set of positions that could
2660    correspond to the character following a character matching the node in
2661    a string matching the regexp.  At this point we consider special symbols
2662    that match the empty string in some context to be just normal characters.
2663    Later, if we find that a special symbol is in a follow set, we will
2664    replace it with the elements of its follow, labeled with an appropriate
2665    constraint.
2666    * Every node in the firstpos of the argument of a STAR or PLUS node is in
2667      the follow of every node in the lastpos.
2668    * Every node in the firstpos of the second argument of a CAT node is in
2669      the follow of every node in the lastpos of the first argument.
2670
2671    Because of the postfix representation of the parse tree, the depth-first
2672    analysis is conveniently done by a linear scan with the aid of a stack.
2673    Sets are stored as arrays of the elements, obeying a stack-like allocation
2674    scheme; the number of elements in each set deeper in the stack can be
2675    used to determine the address of a particular set's array.  */
2676 static void
2677 dfaanalyze (struct dfa *d, bool searchflag)
2678 {
2679   /* Array allocated to hold position sets.  */
2680   position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2681   /* Firstpos and lastpos elements.  */
2682   position *firstpos = posalloc;
2683   position *lastpos = firstpos + d->nleaves;
2684   position pos;
2685   position_set tmp;
2686
2687   /* Stack for element counts and nullable flags.  */
2688   struct
2689   {
2690     /* Whether the entry is nullable.  */
2691     bool nullable;
2692
2693     /* Counts of firstpos and lastpos sets.  */
2694     idx_t nfirstpos;
2695     idx_t nlastpos;
2696   } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2697
2698   position_set merged;          /* Result of merging sets.  */
2699
2700   addtok (d, CAT);
2701   idx_t tindex = d->tindex;
2702   assume (0 < tindex);
2703
2704 #ifdef DEBUG
2705   fprintf (stderr, "dfaanalyze:\n");
2706   for (idx_t i = 0; i < tindex; i++)
2707     {
2708       fprintf (stderr, " %td:", i);
2709       prtok (d->tokens[i]);
2710     }
2711   putc ('\n', stderr);
2712 #endif
2713
2714   d->searchflag = searchflag;
2715   alloc_position_set (&merged, d->nleaves);
2716   d->follows = xicalloc (tindex, sizeof *d->follows);
2717   position_set *backward
2718     = d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2719
2720   for (idx_t i = 0; i < tindex; i++)
2721     {
2722       switch (d->tokens[i])
2723         {
2724         case EMPTY:
2725           /* The empty set is nullable.  */
2726           stk->nullable = true;
2727
2728           /* The firstpos and lastpos of the empty leaf are both empty.  */
2729           stk->nfirstpos = stk->nlastpos = 0;
2730           stk++;
2731           break;
2732
2733         case STAR:
2734         case PLUS:
2735           /* Every element in the lastpos of the argument is in the backward
2736              set of every element in the firstpos.  */
2737           if (d->epsilon)
2738             {
2739               tmp.elems = lastpos - stk[-1].nlastpos;
2740               tmp.nelem = stk[-1].nlastpos;
2741               for (position *p = firstpos - stk[-1].nfirstpos;
2742                    p < firstpos; p++)
2743                 merge2 (&backward[p->index], &tmp, &merged);
2744             }
2745
2746           /* Every element in the firstpos of the argument is in the follow
2747              of every element in the lastpos.  */
2748           {
2749             tmp.elems = firstpos - stk[-1].nfirstpos;
2750             tmp.nelem = stk[-1].nfirstpos;
2751             for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2752               merge2 (&d->follows[p->index], &tmp, &merged);
2753           }
2754           FALLTHROUGH;
2755         case QMARK:
2756           /* A QMARK or STAR node is automatically nullable.  */
2757           if (d->tokens[i] != PLUS)
2758             stk[-1].nullable = true;
2759           break;
2760
2761         case CAT:
2762           /* Every element in the lastpos of the first argument is in
2763              the backward set of every element in the firstpos of the
2764              second argument.  */
2765           if (backward)
2766             {
2767               tmp.nelem = stk[-2].nlastpos;
2768               tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2769               for (position *p = firstpos - stk[-1].nfirstpos;
2770                    p < firstpos; p++)
2771                 merge2 (&backward[p->index], &tmp, &merged);
2772             }
2773
2774           /* Every element in the firstpos of the second argument is in the
2775              follow of every element in the lastpos of the first argument.  */
2776           {
2777             tmp.nelem = stk[-1].nfirstpos;
2778             tmp.elems = firstpos - stk[-1].nfirstpos;
2779             for (position *plim = lastpos - stk[-1].nlastpos,
2780                    *p = plim - stk[-2].nlastpos;
2781                  p < plim; p++)
2782               merge2 (&d->follows[p->index], &tmp, &merged);
2783           }
2784
2785           /* The firstpos of a CAT node is the firstpos of the first argument,
2786              union that of the second argument if the first is nullable.  */
2787           if (stk[-2].nullable)
2788             stk[-2].nfirstpos += stk[-1].nfirstpos;
2789           else
2790             firstpos -= stk[-1].nfirstpos;
2791
2792           /* The lastpos of a CAT node is the lastpos of the second argument,
2793              union that of the first argument if the second is nullable.  */
2794           if (stk[-1].nullable)
2795             stk[-2].nlastpos += stk[-1].nlastpos;
2796           else
2797             {
2798               position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2799               for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2800                 p[j] = p[j + stk[-2].nlastpos];
2801               lastpos -= stk[-2].nlastpos;
2802               stk[-2].nlastpos = stk[-1].nlastpos;
2803             }
2804
2805           /* A CAT node is nullable if both arguments are nullable.  */
2806           stk[-2].nullable &= stk[-1].nullable;
2807           stk--;
2808           break;
2809
2810         case OR:
2811           /* The firstpos is the union of the firstpos of each argument.  */
2812           stk[-2].nfirstpos += stk[-1].nfirstpos;
2813
2814           /* The lastpos is the union of the lastpos of each argument.  */
2815           stk[-2].nlastpos += stk[-1].nlastpos;
2816
2817           /* An OR node is nullable if either argument is nullable.  */
2818           stk[-2].nullable |= stk[-1].nullable;
2819           stk--;
2820           break;
2821
2822         default:
2823           /* Anything else is a nonempty position.  (Note that special
2824              constructs like \< are treated as nonempty strings here;
2825              an "epsilon closure" effectively makes them nullable later.
2826              Backreferences have to get a real position so we can detect
2827              transitions on them later.  But they are nullable.  */
2828           stk->nullable = d->tokens[i] == BACKREF;
2829
2830           /* This position is in its own firstpos and lastpos.  */
2831           stk->nfirstpos = stk->nlastpos = 1;
2832           stk++;
2833
2834           firstpos->index = lastpos->index = i;
2835           firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2836           firstpos++, lastpos++;
2837
2838           break;
2839         }
2840 #ifdef DEBUG
2841       /* ... balance the above nonsyntactic #ifdef goo...  */
2842       fprintf (stderr, "node %td:", i);
2843       prtok (d->tokens[i]);
2844       putc ('\n', stderr);
2845       fprintf (stderr,
2846                stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2847       fprintf (stderr, " firstpos:");
2848       for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2849         {
2850           fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2851           prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2852         }
2853       fprintf (stderr, "\n lastpos:");
2854       for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2855         {
2856           fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2857           prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2858         }
2859       putc ('\n', stderr);
2860 #endif
2861     }
2862
2863   if (backward)
2864     {
2865       /* For each follow set that is the follow set of a real position,
2866          replace it with its epsilon closure.  */
2867       epsclosure (d, backward);
2868
2869       for (idx_t i = 0; i < tindex; i++)
2870         free (backward[i].elems);
2871       free (backward);
2872     }
2873
2874   dfaoptimize (d);
2875
2876 #ifdef DEBUG
2877   for (idx_t i = 0; i < tindex; i++)
2878     if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2879         || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2880         || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2881       {
2882         fprintf (stderr, "follows(%td:", i);
2883         prtok (d->tokens[i]);
2884         fprintf (stderr, "):");
2885         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2886           {
2887             fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2888             prtok (d->tokens[d->follows[i].elems[j].index]);
2889           }
2890         putc ('\n', stderr);
2891       }
2892 #endif
2893
2894   pos.index = 0;
2895   pos.constraint = NO_CONSTRAINT;
2896
2897   alloc_position_set (&tmp, 1);
2898
2899   append (pos, &tmp);
2900
2901   d->separates = xicalloc (tindex, sizeof *d->separates);
2902
2903   for (idx_t i = 0; i < tindex; i++)
2904     {
2905       if (prev_newline_dependent (d->constraints[i]))
2906         d->separates[i] |= CTX_NEWLINE;
2907       if (prev_letter_dependent (d->constraints[i]))
2908         d->separates[i] |= CTX_LETTER;
2909
2910       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2911         {
2912           if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2913             d->separates[i] |= CTX_NEWLINE;
2914           if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2915             d->separates[i] |= CTX_LETTER;
2916         }
2917     }
2918
2919   /* Context wanted by some position.  */
2920   int separate_contexts = state_separate_contexts (d, &tmp);
2921
2922   /* Build the initial state.  */
2923   if (separate_contexts & CTX_NEWLINE)
2924     state_index (d, &tmp, CTX_NEWLINE);
2925   d->initstate_notbol = d->min_trcount
2926     = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2927   if (separate_contexts & CTX_LETTER)
2928     d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2929   d->min_trcount++;
2930   d->trcount = 0;
2931
2932   free (posalloc);
2933   free (stkalloc);
2934   free (merged.elems);
2935   free (tmp.elems);
2936 }
2937
2938 /* Make sure D's state arrays are large enough to hold NEW_STATE.  */
2939 static void
2940 realloc_trans_if_necessary (struct dfa *d)
2941 {
2942   state_num oldalloc = d->tralloc;
2943   if (oldalloc < d->sindex)
2944     {
2945       state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2946       idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2947       realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2948                            -1, sizeof *realtrans);
2949       realtrans[0] = realtrans[1] = NULL;
2950       d->trans = realtrans + 2;
2951       idx_t newalloc = d->tralloc = newalloc1 - 2;
2952       d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2953       d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2954       d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2955       if (d->localeinfo.multibyte)
2956         {
2957           realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2958           realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2959           if (oldalloc == 0)
2960             realtrans[0] = realtrans[1] = NULL;
2961           d->mb_trans = realtrans + 2;
2962         }
2963       for (; oldalloc < newalloc; oldalloc++)
2964         {
2965           d->trans[oldalloc] = NULL;
2966           d->fails[oldalloc] = NULL;
2967           if (d->localeinfo.multibyte)
2968             d->mb_trans[oldalloc] = NULL;
2969         }
2970     }
2971 }
2972
2973 /*
2974    Calculate the transition table for a new state derived from state s
2975    for a compiled dfa d after input character uc, and return the new
2976    state number.
2977
2978    Do not worry about all possible input characters; calculate just the group
2979    of positions that match uc.  Label it with the set of characters that
2980    every position in the group matches (taking into account, if necessary,
2981    preceding context information of s).  Then find the union
2982    of these positions' follows, i.e., the set of positions of the
2983    new state.  For each character in the group's label, set the transition
2984    on this character to be to a state corresponding to the set's positions,
2985    and its associated backward context information, if necessary.
2986
2987    When building a searching matcher, include the positions of state
2988    0 in every state.
2989
2990    The group is constructed by building an equivalence-class
2991    partition of the positions of s.
2992
2993    For each position, find the set of characters C that it matches.  Eliminate
2994    any characters from C that fail on grounds of backward context.
2995
2996    Check whether the group's label L has nonempty
2997    intersection with C.  If L - C is nonempty, create a new group labeled
2998    L - C and having the same positions as the current group, and set L to
2999    the intersection of L and C.  Insert the position in the group, set
3000    C = C - L, and resume scanning.
3001
3002    If after comparing with every group there are characters remaining in C,
3003    create a new group labeled with the characters of C and insert this
3004    position in that group.  */
3005
3006 static state_num
3007 build_state (state_num s, struct dfa *d, unsigned char uc)
3008 {
3009   position_set follows;         /* Union of the follows for each
3010                                    position of the current state.  */
3011   position_set group;           /* Positions that match the input char.  */
3012   position_set tmp;             /* Temporary space for merging sets.  */
3013   state_num state;              /* New state.  */
3014   state_num state_newline;      /* New state on a newline transition.  */
3015   state_num state_letter;       /* New state on a letter transition.  */
3016
3017 #ifdef DEBUG
3018   fprintf (stderr, "build state %td\n", s);
3019 #endif
3020
3021   /* A pointer to the new transition table, and the table itself.  */
3022   state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
3023   state_num *trans = *ptrans;
3024
3025   if (!trans)
3026     {
3027       /* MAX_TRCOUNT is an arbitrary upper limit on the number of
3028          transition tables that can exist at once, other than for
3029          initial states.  Often-used transition tables are quickly
3030          rebuilt, whereas rarely-used ones are cleared away.  */
3031       if (MAX_TRCOUNT <= d->trcount)
3032         {
3033           for (state_num i = d->min_trcount; i < d->tralloc; i++)
3034             {
3035               free (d->trans[i]);
3036               free (d->fails[i]);
3037               d->trans[i] = d->fails[i] = NULL;
3038             }
3039           d->trcount = 0;
3040         }
3041
3042       d->trcount++;
3043       *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
3044
3045       /* Fill transition table with a default value which means that the
3046          transited state has not been calculated yet.  */
3047       for (int i = 0; i < NOTCHAR; i++)
3048         trans[i] = -2;
3049     }
3050
3051   /* Set up the success bits for this state.  */
3052   d->success[s] = 0;
3053   if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
3054     d->success[s] |= CTX_NEWLINE;
3055   if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
3056     d->success[s] |= CTX_LETTER;
3057   if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
3058     d->success[s] |= CTX_NONE;
3059
3060   alloc_position_set (&follows, d->nleaves);
3061
3062   /* Find the union of the follows of the positions of the group.
3063      This is a hideously inefficient loop.  Fix it someday.  */
3064   for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3065     for (idx_t k = 0;
3066          k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3067       insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3068               &follows);
3069
3070   /* Positions that match the input char.  */
3071   alloc_position_set (&group, d->nleaves);
3072
3073   /* The group's label.  */
3074   charclass label;
3075   fillset (&label);
3076
3077   for (idx_t i = 0; i < follows.nelem; i++)
3078     {
3079       charclass matches;            /* Set of matching characters.  */
3080       position pos = follows.elems[i];
3081       bool matched = false;
3082       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3083         {
3084           zeroset (&matches);
3085           setbit (d->tokens[pos.index], &matches);
3086           if (d->tokens[pos.index] == uc)
3087             matched = true;
3088         }
3089       else if (d->tokens[pos.index] >= CSET)
3090         {
3091           matches = d->charclasses[d->tokens[pos.index] - CSET];
3092           if (tstbit (uc, &matches))
3093             matched = true;
3094         }
3095       else if (d->tokens[pos.index] == ANYCHAR)
3096         {
3097           matches = d->charclasses[d->canychar];
3098           if (tstbit (uc, &matches))
3099             matched = true;
3100
3101           /* ANYCHAR must match with a single character, so we must put
3102              it to D->states[s].mbps which contains the positions which
3103              can match with a single character not a byte.  If all
3104              positions which has ANYCHAR does not depend on context of
3105              next character, we put the follows instead of it to
3106              D->states[s].mbps to optimize.  */
3107           if (succeeds_in_context (pos.constraint, d->states[s].context,
3108                                    CTX_NONE))
3109             {
3110               if (d->states[s].mbps.nelem == 0)
3111                 alloc_position_set (&d->states[s].mbps, 1);
3112               insert (pos, &d->states[s].mbps);
3113             }
3114         }
3115       else
3116         continue;
3117
3118       /* Some characters may need to be eliminated from matches because
3119          they fail in the current context.  */
3120       if (pos.constraint != NO_CONSTRAINT)
3121         {
3122           if (!succeeds_in_context (pos.constraint,
3123                                     d->states[s].context, CTX_NEWLINE))
3124             for (int j = 0; j < CHARCLASS_WORDS; j++)
3125               matches.w[j] &= ~d->syntax.newline.w[j];
3126           if (!succeeds_in_context (pos.constraint,
3127                                     d->states[s].context, CTX_LETTER))
3128             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3129               matches.w[j] &= ~d->syntax.letters.w[j];
3130           if (!succeeds_in_context (pos.constraint,
3131                                     d->states[s].context, CTX_NONE))
3132             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3133               matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3134
3135           /* If there are no characters left, there's no point in going on.  */
3136           if (emptyset (&matches))
3137             continue;
3138
3139           /* If we have reset the bit that made us declare "matched", reset
3140              that indicator, too.  This is required to avoid an infinite loop
3141              with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]'  */
3142           if (!tstbit (uc, &matches))
3143             matched = false;
3144         }
3145
3146 #ifdef DEBUG
3147       fprintf (stderr, " nextpos %td:", pos.index);
3148       prtok (d->tokens[pos.index]);
3149       fprintf (stderr, " of");
3150       for (unsigned j = 0; j < NOTCHAR; j++)
3151         if (tstbit (j, &matches))
3152           fprintf (stderr, " 0x%02x", j);
3153       fprintf (stderr, "\n");
3154 #endif
3155
3156       if (matched)
3157         {
3158           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3159             label.w[k] &= matches.w[k];
3160           append (pos, &group);
3161         }
3162       else
3163         {
3164           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3165             label.w[k] &= ~matches.w[k];
3166         }
3167     }
3168
3169   alloc_position_set (&tmp, d->nleaves);
3170
3171   if (group.nelem > 0)
3172     {
3173       /* If we are building a searching matcher, throw in the positions
3174          of state 0 as well, if possible.  */
3175       if (d->searchflag)
3176         {
3177           /* If a token in follows.elems is not 1st byte of a multibyte
3178              character, or the states of follows must accept the bytes
3179              which are not 1st byte of the multibyte character.
3180              Then, if a state of follows encounters a byte, it must not be
3181              a 1st byte of a multibyte character nor a single byte character.
3182              In this case, do not add state[0].follows to next state, because
3183              state[0] must accept 1st-byte.
3184
3185              For example, suppose <sb a> is a certain single byte character,
3186              <mb A> is a certain multibyte character, and the codepoint of
3187              <sb a> equals the 2nd byte of the codepoint of <mb A>.  When
3188              state[0] accepts <sb a>, state[i] transits to state[i+1] by
3189              accepting the 1st byte of <mb A>, and state[i+1] accepts the
3190              2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3191              <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3192              not add state[0].  */
3193
3194           bool mergeit = !d->localeinfo.multibyte;
3195           if (!mergeit)
3196             {
3197               mergeit = true;
3198               for (idx_t j = 0; mergeit && j < group.nelem; j++)
3199                 mergeit &= d->multibyte_prop[group.elems[j].index];
3200             }
3201           if (mergeit)
3202             merge2 (&group, &d->states[0].elems, &tmp);
3203         }
3204
3205       /* Find out if the new state will want any context information,
3206          by calculating possible contexts that the group can match,
3207          and separate contexts that the new state wants to know.  */
3208       int possible_contexts = charclass_context (d, &label);
3209       int separate_contexts = state_separate_contexts (d, &group);
3210
3211       /* Find the state(s) corresponding to the union of the follows.  */
3212       if (possible_contexts & ~separate_contexts)
3213         state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3214       else
3215         state = -1;
3216       if (separate_contexts & possible_contexts & CTX_NEWLINE)
3217         state_newline = state_index (d, &group, CTX_NEWLINE);
3218       else
3219         state_newline = state;
3220       if (separate_contexts & possible_contexts & CTX_LETTER)
3221         state_letter = state_index (d, &group, CTX_LETTER);
3222       else
3223         state_letter = state;
3224
3225       /* Reallocate now, to reallocate any newline transition properly.  */
3226       realloc_trans_if_necessary (d);
3227     }
3228
3229   /* If we are a searching matcher, the default transition is to a state
3230      containing the positions of state 0, otherwise the default transition
3231      is to fail miserably.  */
3232   else if (d->searchflag)
3233     {
3234       state_newline = 0;
3235       state_letter = d->min_trcount - 1;
3236       state = d->initstate_notbol;
3237     }
3238   else
3239     {
3240       state_newline = -1;
3241       state_letter = -1;
3242       state = -1;
3243     }
3244
3245   /* Set the transitions for each character in the label.  */
3246   for (int i = 0; i < NOTCHAR; i++)
3247     if (tstbit (i, &label))
3248       switch (d->syntax.sbit[i])
3249         {
3250         case CTX_NEWLINE:
3251           trans[i] = state_newline;
3252           break;
3253         case CTX_LETTER:
3254           trans[i] = state_letter;
3255           break;
3256         default:
3257           trans[i] = state;
3258           break;
3259         }
3260
3261 #ifdef DEBUG
3262   fprintf (stderr, "trans table %td", s);
3263   for (int i = 0; i < NOTCHAR; ++i)
3264     {
3265       if (!(i & 0xf))
3266         fprintf (stderr, "\n");
3267       fprintf (stderr, " %2td", trans[i]);
3268     }
3269   fprintf (stderr, "\n");
3270 #endif
3271
3272   free (group.elems);
3273   free (follows.elems);
3274   free (tmp.elems);
3275
3276   /* Keep the newline transition in a special place so we can use it as
3277      a sentinel.  */
3278   if (tstbit (d->syntax.eolbyte, &label))
3279     {
3280       d->newlines[s] = trans[d->syntax.eolbyte];
3281       trans[d->syntax.eolbyte] = -1;
3282     }
3283
3284   return trans[uc];
3285 }
3286
3287 /* Multibyte character handling sub-routines for dfaexec.  */
3288
3289 /* Consume a single byte and transit state from 's' to '*next_state'.
3290    This is almost the same as the state transition routine in dfaexec.
3291    But the state transition is done just once; otherwise, matching succeeds or
3292    we reach the end of the buffer.  */
3293 static state_num
3294 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3295 {
3296   state_num *t;
3297
3298   if (d->trans[s])
3299     t = d->trans[s];
3300   else if (d->fails[s])
3301     t = d->fails[s];
3302   else
3303     {
3304       build_state (s, d, **pp);
3305       if (d->trans[s])
3306         t = d->trans[s];
3307       else
3308         {
3309           t = d->fails[s];
3310           assert (t);
3311         }
3312     }
3313
3314   if (t[**pp] == -2)
3315     build_state (s, d, **pp);
3316
3317   return t[*(*pp)++];
3318 }
3319
3320 /* Transit state from s, then return new state and update the pointer of
3321    the buffer.  This function is for a period operator which can match a
3322    multi-byte character.  */
3323 static state_num
3324 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3325                unsigned char const *end)
3326 {
3327   wint_t wc;
3328
3329   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3330
3331   /* This state has some operators which can match a multibyte character.  */
3332   d->mb_follows.nelem = 0;
3333
3334   /* Calculate the state which can be reached from the state 's' by
3335      consuming 'mbclen' single bytes from the buffer.  */
3336   state_num s1 = s;
3337   int mbci;
3338   for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3339     s = transit_state_singlebyte (d, s, pp);
3340   *pp += mbclen - mbci;
3341
3342   if (wc == WEOF)
3343     {
3344       /* It is an invalid character, so ANYCHAR is not accepted.  */
3345       return s;
3346     }
3347
3348   /* If all positions which have ANYCHAR do not depend on the context
3349      of the next character, calculate the next state with
3350      pre-calculated follows and cache the result.  */
3351   if (d->states[s1].mb_trindex < 0)
3352     {
3353       if (MAX_TRCOUNT <= d->mb_trcount)
3354         {
3355           state_num s3;
3356           for (s3 = -1; s3 < d->tralloc; s3++)
3357             {
3358               free (d->mb_trans[s3]);
3359               d->mb_trans[s3] = NULL;
3360             }
3361
3362           for (state_num i = 0; i < d->sindex; i++)
3363             d->states[i].mb_trindex = -1;
3364           d->mb_trcount = 0;
3365         }
3366       d->states[s1].mb_trindex = d->mb_trcount++;
3367     }
3368
3369   if (! d->mb_trans[s])
3370     {
3371       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3372       enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3373       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3374       for (int i = 0; i < MAX_TRCOUNT; i++)
3375         d->mb_trans[s][i] = -1;
3376     }
3377   else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3378     return d->mb_trans[s][d->states[s1].mb_trindex];
3379
3380   if (s == -1)
3381     copy (&d->states[s1].mbps, &d->mb_follows);
3382   else
3383     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3384
3385   int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3386   state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3387   realloc_trans_if_necessary (d);
3388
3389   d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3390
3391   return s2;
3392 }
3393
3394 /* The initial state may encounter a byte which is not a single byte character
3395    nor the first byte of a multibyte character.  But it is incorrect for the
3396    initial state to accept such a byte.  For example, in Shift JIS the regular
3397    expression "\\" accepts the codepoint 0x5c, but should not accept the second
3398    byte of the codepoint 0x815c.  Then the initial state must skip the bytes
3399    that are not a single byte character nor the first byte of a multibyte
3400    character.
3401
3402    Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3403    or exceeds P, and return the advanced MBP.  If WCP is non-NULL and
3404    the result is greater than P, set *WCP to the final wide character
3405    processed, or to WEOF if no wide character is processed.  Otherwise,
3406    if WCP is non-NULL, *WCP may or may not be updated.
3407
3408    Both P and MBP must be no larger than END.  */
3409 static unsigned char const *
3410 skip_remains_mb (struct dfa *d, unsigned char const *p,
3411                  unsigned char const *mbp, char const *end)
3412 {
3413   if (d->syntax.never_trail[*p])
3414     return p;
3415   while (mbp < p)
3416     {
3417       wint_t wc;
3418       mbp += mbs_to_wchar (&wc, (char const *) mbp,
3419                            end - (char const *) mbp, d);
3420     }
3421   return mbp;
3422 }
3423
3424 /* Search through a buffer looking for a match to the struct dfa *D.
3425    Find the first occurrence of a string matching the regexp in the
3426    buffer, and the shortest possible version thereof.  Return a pointer to
3427    the first character after the match, or NULL if none is found.  BEGIN
3428    points to the beginning of the buffer, and END points to the first byte
3429    after its end.  Note however that we store a sentinel byte (usually
3430    newline) in *END, so the actual buffer must be one byte longer.
3431    When ALLOW_NL, newlines may appear in the matching string.
3432    If COUNT is non-NULL, increment *COUNT once for each newline processed.
3433    If MULTIBYTE, the input consists of multibyte characters and/or
3434    encoding-error bytes.  Otherwise, it consists of single-byte characters.
3435    Here is the list of features that make this DFA matcher punt:
3436     - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3437     - [^...] in non-simple locale
3438     - [[=foo=]] or [[.foo.]]
3439     - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3440     - back-reference: (.)\1
3441     - word-delimiter in multibyte locale: \<, \>, \b, \B
3442    See struct localeinfo.simple for the definition of "simple locale".  */
3443
3444 static inline char *
3445 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3446               idx_t *count, bool multibyte)
3447 {
3448   if (MAX_TRCOUNT <= d->sindex)
3449     {
3450       for (state_num s = d->min_trcount; s < d->sindex; s++)
3451         {
3452           free (d->states[s].elems.elems);
3453           free (d->states[s].mbps.elems);
3454         }
3455       d->sindex = d->min_trcount;
3456
3457       if (d->trans)
3458         {
3459           for (state_num s = 0; s < d->tralloc; s++)
3460             {
3461               free (d->trans[s]);
3462               free (d->fails[s]);
3463               d->trans[s] = d->fails[s] = NULL;
3464             }
3465           d->trcount = 0;
3466         }
3467
3468       if (d->localeinfo.multibyte && d->mb_trans)
3469         {
3470           for (state_num s = -1; s < d->tralloc; s++)
3471             {
3472               free (d->mb_trans[s]);
3473               d->mb_trans[s] = NULL;
3474             }
3475           for (state_num s = 0; s < d->min_trcount; s++)
3476             d->states[s].mb_trindex = -1;
3477           d->mb_trcount = 0;
3478         }
3479     }
3480
3481   if (!d->tralloc)
3482     realloc_trans_if_necessary (d);
3483
3484   /* Current state.  */
3485   state_num s = 0, s1 = 0;
3486
3487   /* Current input character.  */
3488   unsigned char const *p = (unsigned char const *) begin;
3489   unsigned char const *mbp = p;
3490
3491   /* Copy of d->trans so it can be optimized into a register.  */
3492   state_num **trans = d->trans;
3493   unsigned char eol = d->syntax.eolbyte;  /* Likewise for eolbyte.  */
3494   unsigned char saved_end = *(unsigned char *) end;
3495   *end = eol;
3496
3497   if (multibyte)
3498     {
3499       mbszero (&d->mbs);
3500       if (d->mb_follows.alloc == 0)
3501         alloc_position_set (&d->mb_follows, d->nleaves);
3502     }
3503
3504   idx_t nlcount = 0;
3505   for (;;)
3506     {
3507       state_num *t;
3508       while ((t = trans[s]) != NULL)
3509         {
3510           if (s < d->min_trcount)
3511             {
3512               if (!multibyte || d->states[s].mbps.nelem == 0)
3513                 {
3514                   while (t[*p] == s)
3515                     p++;
3516                 }
3517               if (multibyte)
3518                 p = mbp = skip_remains_mb (d, p, mbp, end);
3519             }
3520
3521           if (multibyte)
3522             {
3523               s1 = s;
3524
3525               if (d->states[s].mbps.nelem == 0
3526                   || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3527                 {
3528                   /* If an input character does not match ANYCHAR, do it
3529                      like a single-byte character.  */
3530                   s = t[*p++];
3531                 }
3532               else
3533                 {
3534                   s = transit_state (d, s, &p, (unsigned char *) end);
3535                   mbp = p;
3536                   trans = d->trans;
3537                 }
3538             }
3539           else
3540             {
3541               s1 = t[*p++];
3542               t = trans[s1];
3543               if (! t)
3544                 {
3545                   state_num tmp = s;
3546                   s = s1;
3547                   s1 = tmp;     /* swap */
3548                   break;
3549                 }
3550               if (s < d->min_trcount)
3551                 {
3552                   while (t[*p] == s1)
3553                     p++;
3554                 }
3555               s = t[*p++];
3556             }
3557         }
3558
3559       if (s < 0)
3560         {
3561           if (s == -2)
3562             {
3563               s = build_state (s1, d, p[-1]);
3564               trans = d->trans;
3565             }
3566           else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3567             {
3568               /* The previous character was a newline.  Count it, and skip
3569                  checking of multibyte character boundary until here.  */
3570               nlcount++;
3571               mbp = p;
3572
3573               s = (allow_nl ? d->newlines[s1]
3574                    : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3575                    : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3576                    : d->initstate_notbol);
3577             }
3578           else
3579             {
3580               p = NULL;
3581               goto done;
3582             }
3583         }
3584       else if (d->fails[s])
3585         {
3586           if ((d->success[s] & d->syntax.sbit[*p])
3587               || ((char *) p == end
3588                   && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3589                                          d)))
3590             goto done;
3591
3592           if (multibyte && s < d->min_trcount)
3593             p = mbp = skip_remains_mb (d, p, mbp, end);
3594
3595           s1 = s;
3596           if (!multibyte || d->states[s].mbps.nelem == 0
3597               || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3598             {
3599               /* If a input character does not match ANYCHAR, do it
3600                  like a single-byte character.  */
3601               s = d->fails[s][*p++];
3602             }
3603           else
3604             {
3605               s = transit_state (d, s, &p, (unsigned char *) end);
3606               mbp = p;
3607               trans = d->trans;
3608             }
3609         }
3610       else
3611         {
3612           build_state (s, d, p[0]);
3613           trans = d->trans;
3614         }
3615     }
3616
3617  done:
3618   if (count)
3619     *count += nlcount;
3620   *end = saved_end;
3621   return (char *) p;
3622 }
3623
3624 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3625    This is for performance, as dfaexec_main is an inline function.  */
3626
3627 static char *
3628 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3629             bool allow_nl, idx_t *count, bool *backref)
3630 {
3631   return dfaexec_main (d, begin, end, allow_nl, count, true);
3632 }
3633
3634 static char *
3635 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3636             bool allow_nl, idx_t *count, bool *backref)
3637 {
3638   return dfaexec_main (d, begin, end, allow_nl, count, false);
3639 }
3640
3641 /* Always set *BACKREF and return BEGIN.  Use this wrapper for
3642    any regexp that uses a construct not supported by this code.  */
3643 static char *
3644 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3645               bool allow_nl, idx_t *count, bool *backref)
3646 {
3647   *backref = true;
3648   return (char *) begin;
3649 }
3650
3651 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3652    but faster and set *BACKREF if the DFA code does not support this
3653    regexp usage.  */
3654
3655 char *
3656 dfaexec (struct dfa *d, char const *begin, char *end,
3657          bool allow_nl, idx_t *count, bool *backref)
3658 {
3659   return d->dfaexec (d, begin, end, allow_nl, count, backref);
3660 }
3661
3662 struct dfa *
3663 dfasuperset (struct dfa const *d)
3664 {
3665   return d->superset;
3666 }
3667
3668 bool
3669 dfaisfast (struct dfa const *d)
3670 {
3671   return d->fast;
3672 }
3673
3674 static void
3675 free_mbdata (struct dfa *d)
3676 {
3677   free (d->multibyte_prop);
3678   free (d->lex.brack.chars);
3679   free (d->mb_follows.elems);
3680
3681   if (d->mb_trans)
3682     {
3683       state_num s;
3684       for (s = -1; s < d->tralloc; s++)
3685         free (d->mb_trans[s]);
3686       free (d->mb_trans - 2);
3687     }
3688 }
3689
3690 /* Return true if every construct in D is supported by this DFA matcher.  */
3691 bool
3692 dfasupported (struct dfa const *d)
3693 {
3694   for (idx_t i = 0; i < d->tindex; i++)
3695     {
3696       switch (d->tokens[i])
3697         {
3698         case BEGWORD:
3699         case ENDWORD:
3700         case LIMWORD:
3701         case NOTLIMWORD:
3702           if (!d->localeinfo.multibyte)
3703             continue;
3704           FALLTHROUGH;
3705         case BACKREF:
3706         case MBCSET:
3707           return false;
3708         }
3709     }
3710   return true;
3711 }
3712
3713 /* Disable use of the superset DFA if it is not likely to help
3714    performance.  */
3715 static void
3716 maybe_disable_superset_dfa (struct dfa *d)
3717 {
3718   if (!d->localeinfo.using_utf8)
3719     return;
3720
3721   bool have_backref = false;
3722   for (idx_t i = 0; i < d->tindex; i++)
3723     {
3724       switch (d->tokens[i])
3725         {
3726         case ANYCHAR:
3727           /* Lowered.  */
3728           assume (false);
3729         case BACKREF:
3730           have_backref = true;
3731           break;
3732         case MBCSET:
3733           /* Requires multi-byte algorithm.  */
3734           return;
3735         default:
3736           break;
3737         }
3738     }
3739
3740   if (!have_backref && d->superset)
3741     {
3742       /* The superset DFA is not likely to be much faster, so remove it.  */
3743       dfafree (d->superset);
3744       free (d->superset);
3745       d->superset = NULL;
3746     }
3747
3748   free_mbdata (d);
3749   d->localeinfo.multibyte = false;
3750   d->dfaexec = dfaexec_sb;
3751   d->fast = true;
3752 }
3753
3754 static void
3755 dfassbuild (struct dfa *d)
3756 {
3757   struct dfa *sup = dfaalloc ();
3758
3759   *sup = *d;
3760   sup->localeinfo.multibyte = false;
3761   sup->dfaexec = dfaexec_sb;
3762   sup->multibyte_prop = NULL;
3763   sup->superset = NULL;
3764   sup->states = NULL;
3765   sup->sindex = 0;
3766   sup->constraints = NULL;
3767   sup->separates = NULL;
3768   sup->follows = NULL;
3769   sup->tralloc = 0;
3770   sup->trans = NULL;
3771   sup->fails = NULL;
3772   sup->success = NULL;
3773   sup->newlines = NULL;
3774
3775   sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3776   if (d->cindex)
3777     {
3778       memcpy (sup->charclasses, d->charclasses,
3779               d->cindex * sizeof *sup->charclasses);
3780     }
3781
3782   sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3783   sup->talloc = d->tindex * 2;
3784
3785   bool have_achar = false;
3786   bool have_nchar = false;
3787   idx_t j;
3788   for (idx_t i = j = 0; i < d->tindex; i++)
3789     {
3790       switch (d->tokens[i])
3791         {
3792         case ANYCHAR:
3793         case MBCSET:
3794         case BACKREF:
3795           {
3796             charclass ccl;
3797             fillset (&ccl);
3798             sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3799             sup->tokens[j++] = STAR;
3800             if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3801                 || d->tokens[i + 1] == PLUS)
3802               i++;
3803             have_achar = true;
3804           }
3805           break;
3806         case BEGWORD:
3807         case ENDWORD:
3808         case LIMWORD:
3809         case NOTLIMWORD:
3810           if (d->localeinfo.multibyte)
3811             {
3812               /* These constraints aren't supported in a multibyte locale.
3813                  Ignore them in the superset DFA.  */
3814               sup->tokens[j++] = EMPTY;
3815               break;
3816             }
3817           FALLTHROUGH;
3818         default:
3819           sup->tokens[j++] = d->tokens[i];
3820           if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3821               || d->tokens[i] >= CSET)
3822             have_nchar = true;
3823           break;
3824         }
3825     }
3826   sup->tindex = j;
3827
3828   if (have_nchar && (have_achar || d->localeinfo.multibyte))
3829     d->superset = sup;
3830   else
3831     {
3832       dfafree (sup);
3833       free (sup);
3834     }
3835 }
3836
3837 /* Parse a string S of length LEN into D (but skip this step if S is null).
3838    Then analyze D and build a matcher for it.
3839    SEARCHFLAG says whether to build a searching or an exact matcher.  */
3840 void
3841 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3842 {
3843   if (s != NULL)
3844     dfaparse (s, len, d);
3845
3846   dfassbuild (d);
3847
3848   if (dfasupported (d))
3849     {
3850       maybe_disable_superset_dfa (d);
3851       dfaanalyze (d, searchflag);
3852     }
3853   else
3854     {
3855       d->dfaexec = dfaexec_noop;
3856     }
3857
3858   if (d->superset)
3859     {
3860       d->fast = true;
3861       dfaanalyze (d->superset, searchflag);
3862     }
3863 }
3864
3865 /* Free the storage held by the components of a dfa.  */
3866 void
3867 dfafree (struct dfa *d)
3868 {
3869   free (d->charclasses);
3870   free (d->tokens);
3871
3872   if (d->localeinfo.multibyte)
3873     free_mbdata (d);
3874
3875   free (d->constraints);
3876   free (d->separates);
3877
3878   for (idx_t i = 0; i < d->sindex; i++)
3879     {
3880       free (d->states[i].elems.elems);
3881       free (d->states[i].mbps.elems);
3882     }
3883   free (d->states);
3884
3885   if (d->follows)
3886     {
3887       for (idx_t i = 0; i < d->tindex; i++)
3888         free (d->follows[i].elems);
3889       free (d->follows);
3890     }
3891
3892   if (d->trans)
3893     {
3894       for (idx_t i = 0; i < d->tralloc; i++)
3895         {
3896           free (d->trans[i]);
3897           free (d->fails[i]);
3898         }
3899
3900       free (d->trans - 2);
3901       free (d->fails);
3902       free (d->newlines);
3903       free (d->success);
3904     }
3905
3906   if (d->superset)
3907     {
3908       dfafree (d->superset);
3909       free (d->superset);
3910     }
3911 }
3912
3913 /* Having found the postfix representation of the regular expression,
3914    try to find a long sequence of characters that must appear in any line
3915    containing the r.e.
3916    Finding a "longest" sequence is beyond the scope here;
3917    we take an easy way out and hope for the best.
3918    (Take "(ab|a)b"--please.)
3919
3920    We do a bottom-up calculation of sequences of characters that must appear
3921    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3922    representation:
3923         sequences that must appear at the left of the match ("left")
3924         sequences that must appear at the right of the match ("right")
3925         lists of sequences that must appear somewhere in the match ("in")
3926         sequences that must constitute the match ("is")
3927
3928    When we get to the root of the tree, we use one of the longest of its
3929    calculated "in" sequences as our answer.
3930
3931    The sequences calculated for the various types of node (in pseudo ANSI c)
3932    are shown below.  "p" is the operand of unary operators (and the left-hand
3933    operand of binary operators); "q" is the right-hand operand of binary
3934    operators.
3935
3936    "ZERO" means "a zero-length sequence" below.
3937
3938         Type    left            right           is              in
3939         ----    ----            -----           --              --
3940         char c  # c             # c             # c             # c
3941
3942         ANYCHAR ZERO            ZERO            ZERO            ZERO
3943
3944         MBCSET  ZERO            ZERO            ZERO            ZERO
3945
3946         CSET    ZERO            ZERO            ZERO            ZERO
3947
3948         STAR    ZERO            ZERO            ZERO            ZERO
3949
3950         QMARK   ZERO            ZERO            ZERO            ZERO
3951
3952         PLUS    p->left         p->right        ZERO            p->in
3953
3954         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3955                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3956                 p->is##q->left  p->right##q->is p->is##q->is : p->right##q->left
3957                                                 ZERO
3958
3959         OR      longest common  longest common  (do p->is and substrings common
3960                 leading         trailing        to q->is have same p->in and
3961                 (sub)sequence   (sub)sequence   q->in length and content) ?
3962                 of p->left      of p->right
3963                 and q->left     and q->right    p->is : NULL
3964
3965    If there's anything else we recognize in the tree, all four sequences get set
3966    to zero-length sequences.  If there's something we don't recognize in the
3967    tree, we just return a zero-length sequence.
3968
3969    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3970    'aaa')?
3971
3972    And ... is it here or someplace that we might ponder "optimizations" such as
3973         egrep 'psi|epsilon'     ->      egrep 'psi'
3974         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3975                                         (Yes, we now find "epsi" as a "string
3976                                         that must occur", but we might also
3977                                         simplify the *entire* r.e. being sought)
3978         grep '[c]'              ->      grep 'c'
3979         grep '(ab|a)b'          ->      grep 'ab'
3980         grep 'ab*'              ->      grep 'a'
3981         grep 'a*b'              ->      grep 'b'
3982
3983    There are several issues:
3984
3985    Is optimization easy (enough)?
3986
3987    Does optimization actually accomplish anything,
3988    or is the automaton you get from "psi|epsilon" (for example)
3989    the same as the one you get from "psi" (for example)?
3990
3991    Are optimizable r.e.'s likely to be used in real-life situations
3992    (something like 'ab*' is probably unlikely; something like is
3993    'psi|epsilon' is likelier)?  */
3994
3995 static char *
3996 icatalloc (char *old, char const *new)
3997 {
3998   idx_t newsize = strlen (new);
3999   if (newsize == 0)
4000     return old;
4001   idx_t oldsize = strlen (old);
4002   char *result = xirealloc (old, oldsize + newsize + 1);
4003   memcpy (result + oldsize, new, newsize + 1);
4004   return result;
4005 }
4006
4007 static void
4008 freelist (char **cpp)
4009 {
4010   while (*cpp)
4011     free (*cpp++);
4012 }
4013
4014 static char **
4015 enlistnew (char **cpp, char *new)
4016 {
4017   /* Is there already something in the list that's new (or longer)?  */
4018   idx_t i;
4019   for (i = 0; cpp[i] != NULL; i++)
4020     if (strstr (cpp[i], new) != NULL)
4021       {
4022         free (new);
4023         return cpp;
4024       }
4025   /* Eliminate any obsoleted strings.  */
4026   for (idx_t j = 0; cpp[j] != NULL; )
4027     if (strstr (new, cpp[j]) == NULL)
4028       ++j;
4029     else
4030       {
4031         free (cpp[j]);
4032         if (--i == j)
4033           break;
4034         cpp[j] = cpp[i];
4035         cpp[i] = NULL;
4036       }
4037   /* Add the new string.  */
4038   cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
4039   cpp[i] = new;
4040   cpp[i + 1] = NULL;
4041   return cpp;
4042 }
4043
4044 static char **
4045 enlist (char **cpp, char const *str, idx_t len)
4046 {
4047   return enlistnew (cpp, ximemdup0 (str, len));
4048 }
4049
4050 /* Given pointers to two strings, return a pointer to an allocated
4051    list of their distinct common substrings.  */
4052 static char **
4053 comsubs (char *left, char const *right)
4054 {
4055   char **cpp = xzalloc (sizeof *cpp);
4056
4057   for (char *lcp = left; *lcp != '\0'; lcp++)
4058     {
4059       idx_t len = 0;
4060       char *rcp = strchr (right, *lcp);
4061       while (rcp != NULL)
4062         {
4063           idx_t i;
4064           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4065             continue;
4066           if (i > len)
4067             len = i;
4068           rcp = strchr (rcp + 1, *lcp);
4069         }
4070       if (len != 0)
4071         cpp = enlist (cpp, lcp, len);
4072     }
4073   return cpp;
4074 }
4075
4076 static char **
4077 addlists (char **old, char **new)
4078 {
4079   for (; *new; new++)
4080     old = enlistnew (old, xstrdup (*new));
4081   return old;
4082 }
4083
4084 /* Given two lists of substrings, return a new list giving substrings
4085    common to both.  */
4086 static char **
4087 inboth (char **left, char **right)
4088 {
4089   char **both = xzalloc (sizeof *both);
4090
4091   for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4092     {
4093       for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4094         {
4095           char **temp = comsubs (left[lnum], right[rnum]);
4096           both = addlists (both, temp);
4097           freelist (temp);
4098           free (temp);
4099         }
4100     }
4101   return both;
4102 }
4103
4104 typedef struct must must;
4105
4106 struct must
4107 {
4108   char **in;
4109   char *left;
4110   char *right;
4111   char *is;
4112   bool begline;
4113   bool endline;
4114   must *prev;
4115 };
4116
4117 static must *
4118 allocmust (must *mp, idx_t size)
4119 {
4120   must *new_mp = xmalloc (sizeof *new_mp);
4121   new_mp->in = xzalloc (sizeof *new_mp->in);
4122   new_mp->left = xizalloc (size);
4123   new_mp->right = xizalloc (size);
4124   new_mp->is = xizalloc (size);
4125   new_mp->begline = false;
4126   new_mp->endline = false;
4127   new_mp->prev = mp;
4128   return new_mp;
4129 }
4130
4131 static void
4132 resetmust (must *mp)
4133 {
4134   freelist (mp->in);
4135   mp->in[0] = NULL;
4136   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4137   mp->begline = false;
4138   mp->endline = false;
4139 }
4140
4141 static void
4142 freemust (must *mp)
4143 {
4144   freelist (mp->in);
4145   free (mp->in);
4146   free (mp->left);
4147   free (mp->right);
4148   free (mp->is);
4149   free (mp);
4150 }
4151
4152 struct dfamust *
4153 dfamust (struct dfa const *d)
4154 {
4155   must *mp = NULL;
4156   char const *result = "";
4157   bool exact = false;
4158   bool begline = false;
4159   bool endline = false;
4160   bool need_begline = false;
4161   bool need_endline = false;
4162   bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4163
4164   for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4165     {
4166       token t = d->tokens[ri];
4167       switch (t)
4168         {
4169         case BEGLINE:
4170           mp = allocmust (mp, 2);
4171           mp->begline = true;
4172           need_begline = true;
4173           break;
4174         case ENDLINE:
4175           mp = allocmust (mp, 2);
4176           mp->endline = true;
4177           need_endline = true;
4178           break;
4179         case LPAREN:
4180         case RPAREN:
4181           assert (!"neither LPAREN nor RPAREN may appear here");
4182
4183         case EMPTY:
4184         case BEGWORD:
4185         case ENDWORD:
4186         case LIMWORD:
4187         case NOTLIMWORD:
4188         case BACKREF:
4189         case ANYCHAR:
4190         case MBCSET:
4191           mp = allocmust (mp, 2);
4192           break;
4193
4194         case STAR:
4195         case QMARK:
4196           assume_nonnull (mp);
4197           resetmust (mp);
4198           break;
4199
4200         case OR:
4201           {
4202             char **new;
4203             must *rmp = mp;
4204             assume_nonnull (rmp);
4205             must *lmp = mp = mp->prev;
4206             assume_nonnull (lmp);
4207             idx_t j, ln, rn, n;
4208
4209             /* Guaranteed to be.  Unlikely, but ...  */
4210             if (str_eq (lmp->is, rmp->is))
4211               {
4212                 lmp->begline &= rmp->begline;
4213                 lmp->endline &= rmp->endline;
4214               }
4215             else
4216               {
4217                 lmp->is[0] = '\0';
4218                 lmp->begline = false;
4219                 lmp->endline = false;
4220               }
4221             /* Left side--easy */
4222             idx_t i = 0;
4223             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4224               ++i;
4225             lmp->left[i] = '\0';
4226             /* Right side */
4227             ln = strlen (lmp->right);
4228             rn = strlen (rmp->right);
4229             n = ln;
4230             if (n > rn)
4231               n = rn;
4232             for (i = 0; i < n; ++i)
4233               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4234                 break;
4235             for (j = 0; j < i; ++j)
4236               lmp->right[j] = lmp->right[(ln - i) + j];
4237             lmp->right[j] = '\0';
4238             new = inboth (lmp->in, rmp->in);
4239             freelist (lmp->in);
4240             free (lmp->in);
4241             lmp->in = new;
4242             freemust (rmp);
4243           }
4244           break;
4245
4246         case PLUS:
4247           assume_nonnull (mp);
4248           mp->is[0] = '\0';
4249           break;
4250
4251         case END:
4252           assume_nonnull (mp);
4253           assert (!mp->prev);
4254           for (idx_t i = 0; mp->in[i] != NULL; i++)
4255             if (strlen (mp->in[i]) > strlen (result))
4256               result = mp->in[i];
4257           if (str_eq (result, mp->is))
4258             {
4259               if ((!need_begline || mp->begline) && (!need_endline
4260                                                      || mp->endline))
4261                 exact = true;
4262               begline = mp->begline;
4263               endline = mp->endline;
4264             }
4265           goto done;
4266
4267         case CAT:
4268           {
4269             must *rmp = mp;
4270             assume_nonnull (rmp);
4271             must *lmp = mp = mp->prev;
4272             assume_nonnull (lmp);
4273
4274             /* In.  Everything in left, plus everything in
4275                right, plus concatenation of
4276                left's right and right's left.  */
4277             lmp->in = addlists (lmp->in, rmp->in);
4278             if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4279               {
4280                 idx_t lrlen = strlen (lmp->right);
4281                 idx_t rllen = strlen (rmp->left);
4282                 char *tp = ximalloc (lrlen + rllen + 1);
4283                 memcpy (tp + lrlen, rmp->left, rllen + 1);
4284                 memcpy (tp, lmp->right, lrlen);
4285                 lmp->in = enlistnew (lmp->in, tp);
4286               }
4287             /* Left-hand */
4288             if (lmp->is[0] != '\0')
4289               lmp->left = icatalloc (lmp->left, rmp->left);
4290             /* Right-hand */
4291             if (rmp->is[0] == '\0')
4292               lmp->right[0] = '\0';
4293             lmp->right = icatalloc (lmp->right, rmp->right);
4294             /* Guaranteed to be */
4295             if ((lmp->is[0] != '\0' || lmp->begline)
4296                 && (rmp->is[0] != '\0' || rmp->endline))
4297               {
4298                 lmp->is = icatalloc (lmp->is, rmp->is);
4299                 lmp->endline = rmp->endline;
4300               }
4301             else
4302               {
4303                 lmp->is[0] = '\0';
4304                 lmp->begline = false;
4305                 lmp->endline = false;
4306               }
4307             freemust (rmp);
4308           }
4309           break;
4310
4311         case '\0':
4312           /* Not on *my* shift.  */
4313           goto done;
4314
4315         default:
4316           if (CSET <= t)
4317             {
4318               /* If T is a singleton, or if case-folding in a unibyte
4319                  locale and T's members all case-fold to the same char,
4320                  convert T to one of its members.  Otherwise, do
4321                  nothing further with T.  */
4322               charclass *ccl = &d->charclasses[t - CSET];
4323               int j;
4324               for (j = 0; j < NOTCHAR; j++)
4325                 if (tstbit (j, ccl))
4326                   break;
4327               if (! (j < NOTCHAR))
4328                 {
4329                   mp = allocmust (mp, 2);
4330                   break;
4331                 }
4332               t = j;
4333               while (++j < NOTCHAR)
4334                 if (tstbit (j, ccl)
4335                     && ! (case_fold_unibyte
4336                           && toupper (j) == toupper (t)))
4337                   break;
4338               if (j < NOTCHAR)
4339                 {
4340                   mp = allocmust (mp, 2);
4341                   break;
4342                 }
4343             }
4344
4345           idx_t rj = ri + 2;
4346           if (d->tokens[ri + 1] == CAT)
4347             {
4348               for (; rj < d->tindex - 1; rj += 2)
4349                 {
4350                   if ((rj != ri && (d->tokens[rj] <= 0
4351                                     || NOTCHAR <= d->tokens[rj]))
4352                       || d->tokens[rj + 1] != CAT)
4353                     break;
4354                 }
4355             }
4356           mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4357           mp->is[0] = mp->left[0] = mp->right[0]
4358             = case_fold_unibyte ? toupper (t) : t;
4359
4360           idx_t i;
4361           for (i = 1; ri + 2 < rj; i++)
4362             {
4363               ri += 2;
4364               t = d->tokens[ri];
4365               mp->is[i] = mp->left[i] = mp->right[i]
4366                 = case_fold_unibyte ? toupper (t) : t;
4367             }
4368           mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4369           mp->in = enlist (mp->in, mp->is, i);
4370           break;
4371         }
4372     }
4373  done:;
4374
4375   struct dfamust *dm = NULL;
4376   if (*result)
4377     {
4378       dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4379       dm->exact = exact;
4380       dm->begline = begline;
4381       dm->endline = endline;
4382       strcpy (dm->must, result);
4383     }
4384
4385   while (mp)
4386     {
4387       must *prev = mp->prev;
4388       freemust (mp);
4389       mp = prev;
4390     }
4391
4392   return dm;
4393 }
4394
4395 void
4396 dfamustfree (struct dfamust *dm)
4397 {
4398   free (dm);
4399 }
4400
4401 struct dfa *
4402 dfaalloc (void)
4403 {
4404   return xmalloc (sizeof (struct dfa));
4405 }
4406
4407 /* Initialize DFA.  */
4408 void
4409 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4410            reg_syntax_t bits, int dfaopts)
4411 {
4412   memset (dfa, 0, offsetof (struct dfa, dfaexec));
4413   dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4414   dfa->localeinfo = *linfo;
4415
4416   dfa->fast = !dfa->localeinfo.multibyte;
4417
4418   dfa->canychar = -1;
4419   dfa->syntax.syntax_bits_set = true;
4420   dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4421   dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4422   dfa->syntax.syntax_bits = bits;
4423   dfa->syntax.dfaopts = dfaopts;
4424
4425   for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4426     {
4427       unsigned char uc = i;
4428
4429       dfa->syntax.sbit[uc] = char_context (dfa, uc);
4430       switch (dfa->syntax.sbit[uc])
4431         {
4432         case CTX_LETTER:
4433           setbit (uc, &dfa->syntax.letters);
4434           break;
4435         case CTX_NEWLINE:
4436           setbit (uc, &dfa->syntax.newline);
4437           break;
4438         }
4439
4440       /* POSIX requires that the five bytes in "\n\r./" (including the
4441          terminating NUL) cannot occur inside a multibyte character.  */
4442       dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4443                                      ? (uc & 0xc0) != 0x80
4444                                      : strchr ("\n\r./", uc) != NULL);
4445     }
4446 }
4447
4448 /* Initialize TO by copying FROM's syntax settings.  */
4449 void
4450 dfacopysyntax (struct dfa *to, struct dfa const *from)
4451 {
4452   memset (to, 0, offsetof (struct dfa, syntax));
4453   to->canychar = -1;
4454   to->fast = from->fast;
4455   to->syntax = from->syntax;
4456   to->dfaexec = from->dfaexec;
4457   to->localeinfo = from->localeinfo;
4458 }
4459
4460 /* vim:set shiftwidth=2: */