/* fsaparse -- Build a structure naming relationships (sequences,
               alternatives, backreferences, options and precedence)
               of tokens

   Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2014 Free Software
   Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc.,
   51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */

/* Written June, 1988 by Mike Haertel
   Modified July, 1988 by Arthur David Olson to assist BMG speedups  */

/* 2014: Repackaged by "untangle" script, written by behoffski.  */

/* This function receives a stream of tokens from fsalex, and processes
   them to impose precedence rules and to describe complex pattern
   elements that are beyond the capability of the simple lexer.  In
   addition to the cases explicit in the syntax (e.g."(ab|c)",
   variable-length multibyte encodings (UTF-8; codesets including
   modifiers and/or shift items) also require these enhanced
   facilities.  */

/* Always import environment-specific configuration items first.  */
#include <config.h>

#include <assert.h>
#include "charclass.h"
#include "fsaparse.h"
#include "fsalex.h"
#include "fsatoken.h"
#include "mbcsets.h"
#include "proto-lexparse.h"
#include <stdbool.h>
#include <stdlib.h>
#include "xalloc.h"

/* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
#include "gettext.h"
#define _(str) gettext (str)

#include <wchar.h>
#include <wctype.h>

#if HAVE_LANGINFO_CODESET
# include <langinfo.h>
#endif

/* fsaparse_ctxt: Gather all the context to do with the parser into a
   single struct.  We do this mainly because it make it easier to
   contemplate having multiple instances of this module running in
   parallel, but also because it makes translating from "dfa->" easier.
   This definition fleshes out the opaque type given in the module
   header.  */
struct fsaparse_ctxt_struct
{
  /* Singly-linked list of all parser instances, so destroy_module can
     release all resources by traversing the list.  */
  fsaparse_ctxt_t *next_instance;

  /* Warning and abort functions provided by client.  */
  fsalex_warn_callback_fn *warn_client;
  fsalex_error_callback_fn *abandon_with_error;

  /* Plug-in functions and context to deal with lexer at arm's
     length.  */
  proto_lexparse_lex_fn_t *lexer;
  proto_lexparse_exchange_fn_t *lex_exchange;
  void *lex_context;

  /* Information about locale (needs to sync with lexer...?) */
  bool multibyte_locale;
  bool unibyte_locale;

  /* Variable to store a dynamically-allocated list of wide characters.
     The parser needs to fetch this list whenever a WCHAR token is
     received.  */
  int wide_chars_max;
  wchar_t *wide_char_list;

  fsatoken_token_t lookahead_token;

  size_t current_depth;      /* Current depth of a hypothetical stack
                             holding deferred productions.  This is
                             used to determine the depth that will be
                             required of the real stack later on in
                             dfaanalyze.  */
  /* Fields filled by the parser.  */
  fsatoken_token_t *tokens;                /* Postfix parse array.  */
  size_t tindex;                /* Index for adding new tokens.  */
  size_t talloc;                /* Number of tokens currently allocated.  */
  size_t depth;                 /* Depth required of an evaluation stack
                                   used for depth-first traversal of the
                                   parse tree.  */
  size_t nleaves;               /* Number of leaves on the parse tree.  */
  size_t nregexps;              /* Count of parallel regexps being built
                                   with dfaparse.  */
  bool fast;			/* The DFA is fast.  */
  bool multibyte;		/* MB_CUR_MAX > 1.  */
  fsatoken_token_t utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales.  */
  mbstate_t mbs;		/* Multibyte conversion state.  */

  /* The following are valid only if MB_CUR_MAX > 1.  */

  /* The value of multibyte_prop[i] is defined by following rule.
     if tokens[i] < NOTCHAR
     bit 0 : tokens[i] is the first byte of a character, including
     single-byte characters.
     bit 1 : tokens[i] is the last byte of a character, including
     single-byte characters.

     if tokens[i] = MBCSET
     ("the index of mbcsets corresponding to this operator" << 2) + 3

     e.g.
     tokens
     = 'single_byte_a', 'multi_byte_A', single_byte_b'
     = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
     multibyte_prop
     = 3     , 1               ,  0              ,  2              , 3
   */
  int *multibyte_prop;

  mbcsets_set_t **mbcsets;
  size_t nmbcsets;
  size_t mbcsets_alloc;
};

/* Linked list of all instances created by this module.  */
static fsaparse_ctxt_t *fsaparse_instance_list_head = NULL;

/* Ensure that the array addressed by PTR holds at least NITEMS +
   (PTR || !NITEMS) items.  Either return PTR, or reallocate the array
   and return its new address.  Although PTR may be null, the returned
   value is never null.

   The array holds *NALLOC items; *NALLOC is updated on reallocation.
   ITEMSIZE is the size of one item.  Avoid O(N**2) behavior on arrays
   growing linearly.  */
static void *
maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
{
  if (nitems < *nalloc)
    return ptr;
  *nalloc = nitems;
  return x2nrealloc (ptr, nalloc, itemsize);
}

/* UTF-8 encoding allows some optimizations that we can't otherwise
   assume in a multibyte encoding.  */
static int
using_utf8 (void)
{
  static int utf8 = -1;
  if (utf8 < 0)
    {
      wchar_t wc;
      mbstate_t mbs = { 0 };
      utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
    }
  return utf8;
}

/* Recursive descent parser for regular expressions.  */

static void
addtok_mb (fsaparse_ctxt_t *parser, fsatoken_token_t t, int mbprop)
{
  if (parser->talloc == parser->tindex)
    {
      parser->tokens = x2nrealloc (parser->tokens, &parser->talloc,
                                sizeof *parser->tokens);
      if (parser->multibyte)
        parser->multibyte_prop = xnrealloc (parser->multibyte_prop, parser->talloc,
                                         sizeof *parser->multibyte_prop);
    }
  if (parser->multibyte)
    parser->multibyte_prop[parser->tindex] = mbprop;
  parser->tokens[parser->tindex++] = t;

  switch (t)
    {
    case FSATOKEN_TK_QMARK:
    case FSATOKEN_TK_STAR:
    case FSATOKEN_TK_PLUS:
      break;

    case FSATOKEN_TK_CAT:
    case FSATOKEN_TK_OR:
      --parser->current_depth;
      break;

    case FSATOKEN_TK_BACKREF:
      parser->fast = false;
      /* fallthrough */
    default:
      ++parser->nleaves;
      /* fallthrough */
    case FSATOKEN_TK_EMPTY:
      ++parser->current_depth;
      break;
    }
  if (parser->depth < parser->current_depth)
    parser->depth = parser->current_depth;
}

static void addtok_wc (fsaparse_ctxt_t *parser, wint_t wc);

/* Sigh.  Mbcsets is so disruptive (intimate details hidden, by
   design), that we are writing addtok here from scratch.  */
static void
addtok (fsaparse_ctxt_t *parser, fsatoken_token_t t)
{
  bool need_or = false;
  mbcsets_set_t *work_mbc;
  bool invert;
  charclass_t *charclass;
  size_t nchars;
  size_t nch_classes;
  size_t nranges;
  size_t nequivs;
  size_t ncoll_elems;

  if (t != FSATOKEN_TK_MBCSET || parser->unibyte_locale)
    {
      addtok_mb (parser, t, 3);
      return;
    }

  work_mbc = parser->mbcsets[parser->nmbcsets - 1];
  mbcsets_get_characteristics (work_mbc,
                               &invert,
                               &charclass,
                               &nchars,
                               &nch_classes,
                               &nranges,
                               &nequivs,
                               &ncoll_elems);

  if (nchars && !invert)
    {
      size_t i;
      wchar_t *char_list;

      char_list = xnmalloc (nchars, sizeof *char_list);
      mbcsets_get_chars (work_mbc, char_list);
      addtok_wc (parser, char_list[0]);
      for (i = 1; i < nchars; i++)
        {
          addtok_wc (parser, char_list[i]);
          addtok (parser, FSATOKEN_TK_OR);
        }
      free (char_list);
      need_or = true;
    }

  /* If the set contains non-trivial components, it's too hard to deal
     with here, so hand it on to higher-up machinery.  */
  if (invert || nch_classes || nranges || ncoll_elems)
    {
      addtok_mb (parser, FSATOKEN_TK_MBCSET,
                 ((parser->nmbcsets - 1) << 2) + 3);
      if (need_or)
        addtok (parser, FSATOKEN_TK_OR);
    }

  /* Individual characters have been handled above, so the only case
     remaining is where codes have been simplified into charclass
     members.  */
  else if (charclass)
    {
      addtok (parser, FSATOKEN_TK_CSET + charclass_get_index (charclass));
      if (need_or)
        addtok (parser, FSATOKEN_TK_OR);
    }
}

/* We treat a multibyte character as a single atom, so that DFA
   can treat a multibyte character as a single expression.

   e.g., we construct the following tree from "<mb1><mb2>".
   <mb1(1st-byte)><mb1(2nd-byte)><FSATOKEN_TK_CAT><mb1(3rd-byte)><FSATOKEN_TK_CAT>
   <mb2(1st-byte)><mb2(2nd-byte)><FSATOKEN_TK_CAT><mb2(3rd-byte)><FSATOKEN_TK_CAT><FSATOKEN_TK_CAT> */
static void
addtok_wc (fsaparse_ctxt_t *parser, wint_t wc)
{
  unsigned char buf[MB_LEN_MAX];
  mbstate_t s = { 0 };
  int i;
  int cur_mb_len;
  size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);

  if (stored_bytes != (size_t) -1)
    cur_mb_len = stored_bytes;
  else
    {
      /* This is merely stop-gap.  buf[0] is undefined, yet skipping
         the addtok_mb call altogether can corrupt the heap.  */
      cur_mb_len = 1;
      buf[0] = 0;
    }

  addtok_mb (parser, buf[0], cur_mb_len == 1 ? 3 : 1);
  for (i = 1; i < cur_mb_len; i++)
    {
      addtok_mb (parser, buf[i], i == cur_mb_len - 1 ? 2 : 0);
      addtok (parser, FSATOKEN_TK_CAT);
    }
}
static void
add_utf8_anychar (fsaparse_ctxt_t *parser)
{
  unsigned int i;

  /* Have we set up the classes for the 1-byte to 4-byte sequence types?  */
  if (parser->utf8_anychar_classes[0] == 0)
    {
      /* No, first time we've been called, so set them up now.  */
      charclass_t *ccl;
      const charclass_t *dotclass;

      /* Index 0: 80-bf -- Non-leading bytes.  */
      ccl = charclass_alloc ();
      charclass_setbit_range (0x80, 0xbf, ccl);
      parser->utf8_anychar_classes[0] = charclass_completed (ccl);

      /* Index 1: 00-7f -- 1-byte leading seq, minus dotclass exceptions.  */
      ccl = charclass_alloc ();
      charclass_setbit_range (0x00, 0x7f, ccl);
      fsalex_exchange (parser->lex_context, PROTO_LEXPARSE_OP_GET_DOTCLASS,
                       &dotclass);
      charclass_intersectset (dotclass, ccl);
      parser->utf8_anychar_classes[1] = charclass_completed (ccl);

      /* Index 2: c2-df -- 2-byte sequence.  */
      ccl = charclass_alloc ();
      charclass_setbit_range (0xc2, 0xdf, ccl);
      parser->utf8_anychar_classes[2] = charclass_completed (ccl);

      /* Index 2: e0-ef -- 3-byte sequence.  */
      ccl = charclass_alloc ();
      charclass_setbit_range (0xe0, 0xef, ccl);
      parser->utf8_anychar_classes[3] = charclass_completed (ccl);

      /* Index 2: f0-f7 -- 4-byte sequence.  */
      ccl = charclass_alloc ();
      charclass_setbit_range (0xf0, 0xf7, ccl);
      parser->utf8_anychar_classes[4] = charclass_completed (ccl);
    }

  /* A valid UTF-8 character is

     ([0x00-0x7f]
     |[0xe0-0xef][0x80-0xbf][0x80-0xbf]xc2-0xdf][0x80-0xbf]
     |[0xe0-0xef][0x80-0xbf][0x80-0xbf]xe0-0xef[0x80-0xbf][0x80-0xbf]
     |[0xe0-0xef][0x80-0xbf][0x80-0xbf]xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])

     which I'll write more concisely "B|CA|DAA|EAAA".  Factor the [0x00-0x7f]
     and you get "B|(C|(D|EA)A)A".  And since the token buffer is in reverse
     Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR".  */
  /* Write out leaf tokens for each of the four possible starting bytes.  */
  for (i = 1; i < 5; i++)
    addtok (parser, FSATOKEN_TK_CSET + parser->utf8_anychar_classes[i]);
  /* Add follow-on classes, plus tokens to build a postfix tree
     covering all four alternatives of valid UTF-8 sequences.  */
  for (i = 1; i <= 3; i++)
    {
      addtok (parser, FSATOKEN_TK_CSET + parser->utf8_anychar_classes[0]);
      addtok (parser, FSATOKEN_TK_CAT);
      addtok (parser, FSATOKEN_TK_OR);
    }
}

/* The grammar understood by the parser is as follows.

   regexp:
     regexp FSATOKEN_TK_OR branch
     branch

   branch:
     branch closure
     closure

   closure:
     closure FSATOKEN_TK_QMARK
     closure FSATOKEN_TK_STAR
     closure FSATOKEN_TK_PLUS
     closure FSATOKEN_TK_REPMN
     atom

   atom:
     <normal character>
     <multibyte character>
     FSATOKEN_TK_ANYCHAR
     FSATOKEN_TK_MBCSET
     FSATOKEN_TK_CSET
     FSATOKEN_TK_BACKREF
     FSATOKEN_TK_BEGLINE
     FSATOKEN_TK_ENDLINE
     FSATOKEN_TK_BEGWORD
     FSATOKEN_TK_ENDWORD
     FSATOKEN_TK_LIMWORD
     FSATOKEN_TK_NOTLIMWORD
     FSATOKEN_TK_LPAREN regexp FSATOKEN_TK_RPAREN
     <empty>

   The parser builds a parse tree in postfix form in an array of tokens.  */

/* Provide a forward declaration for regexp, as it is at the top of
   the parse tree, but is referenced by atom, at the bottom of the
   tree.  */
static void regexp (fsaparse_ctxt_t *parser);

/* 3 June 2014: I must be a sucker for punishment, and/or going crazy;
    whatever.  I've looked at the partially-edited code for "atom",
    looking at it in the (incomplete) light of introducing the mbcsets
    change... and have decided, just as I previously did with
    closure () below, to rewrite it from scratch.  So, here goes...  */
static void
atom (fsaparse_ctxt_t *parser)
{
  fsatoken_token_t tok = parser->lookahead_token;

  /* For a unibyte character, it is its own token.  */
  if (tok >= 0 && tok < FSATOKEN_NOTCHAR)
    {
      addtok (parser, tok);
      parser->lookahead_token = parser->lexer (parser->lex_context);
      return;
    }

  /* For a unibyte character set (CSET + index), it is its own token.  */
  if (tok >= FSATOKEN_TK_CSET)
    {
      addtok (parser, tok);
      parser->lookahead_token = parser->lexer (parser->lex_context);
      return;
    }

  /* For ANYCHAR in an UTF-8 setting, use a hierarchy of CSETs.  */
  if (tok == FSATOKEN_TK_ANYCHAR && using_utf8 ())
    {
      /* For UTF-8, expand the period to a series of CSETs that define
         a valid UTF-8 character.  This avoids using the slow multibyte
         path.  I'm pretty sure it would be both profitable and correct
         to do it for any encoding; however, the optimization must be
         done manually as it is done above in add_utf8_anychar.  So,
         let's start with UTF-8: it is the most used, and the structure
         of the encoding makes the correctness more obvious.  */
      add_utf8_anychar (parser);
      parser->lookahead_token = parser->lexer (parser->lex_context);
      return;
    }

  /* Use switch to carve up the remaining cases into groups.  */
  switch (tok)
    {
    case FSATOKEN_TK_WCHAR:
      {
        /* Wide character, possibly including equivalent characters
           (probably due to selection of case folding).  */
        int nr_wide_chars;

        nr_wide_chars = fsalex_exchange (parser->lex_context,
                                         PROTO_LEXPARSE_OP_GET_WIDE_CHARS,
                                         parser->wide_char_list);
        assert(nr_wide_chars >= 1);
        if (parser->wide_char_list[0] == WEOF)
          addtok (parser, FSATOKEN_TK_BACKREF);
        else
          {
            int i;

            addtok_wc (parser, parser->wide_char_list[0]);
            for (i = 1; i < nr_wide_chars; i++)
              {
                addtok_wc (parser, parser->wide_char_list[i]);
                addtok (parser, FSATOKEN_TK_OR);
              }
          }
        parser->lookahead_token = parser->lexer (parser->lex_context);
        return;
      }

    case FSATOKEN_TK_MBCSET:
      {
        mbcsets_set_t *work_mbc;

        /* Acquire space to store set pointer.  Sadly (?), fsaparse
           maintains a duplicate list to fsalex at present.  */
        parser->mbcsets = maybe_realloc (parser->mbcsets,
                                         parser->nmbcsets,
                                         &parser->mbcsets_alloc,
                                         sizeof work_mbc);

        /* Get set pointer from fsalex, and add it to the list.  */
        fsalex_exchange (parser->lex_context,
                         PROTO_LEXPARSE_OP_GET_MBCSET,
                         &work_mbc);
        parser->mbcsets[parser->nmbcsets++] = work_mbc;

        /* We could fall-through case labels to the addtok code below,
           but am keeping code sections separate due to an overabundance
           of caution.  */
        addtok (parser, tok);
        parser->lookahead_token = parser->lexer (parser->lex_context);
        return;
      }

    case FSATOKEN_TK_ANYCHAR:
    case FSATOKEN_TK_BACKREF:
    case FSATOKEN_TK_BEGLINE:
    case FSATOKEN_TK_ENDLINE:
    case FSATOKEN_TK_BEGWORD:
    case FSATOKEN_TK_ENDWORD:
    case FSATOKEN_TK_LIMWORD:
    case FSATOKEN_TK_NOTLIMWORD:
      addtok (parser, tok);
      parser->lookahead_token = parser->lexer (parser->lex_context);
      return;

    case FSATOKEN_TK_LPAREN:
      parser->lookahead_token = parser->lexer (parser->lex_context);
      regexp (parser);
      if (parser->lookahead_token != FSATOKEN_TK_RPAREN)
        parser->abandon_with_error (_("unbalanced ("));
      parser->lookahead_token = parser->lexer (parser->lex_context);
      return;

    default:
      addtok (parser, FSATOKEN_TK_EMPTY);
      return;
  }

  /* NOTREACHED */
}

/* Return the number of tokens in the given subexpression.  */
static size_t _GL_ATTRIBUTE_PURE
nsubtoks (fsaparse_ctxt_t *parser, size_t tindex)
{
  size_t ntoks1;

  switch (parser->tokens[tindex - 1])
    {
    default:
      return 1;
    case FSATOKEN_TK_QMARK:
    case FSATOKEN_TK_STAR:
    case FSATOKEN_TK_PLUS:
      return 1 + nsubtoks (parser, tindex - 1);
    case FSATOKEN_TK_CAT:
    case FSATOKEN_TK_OR:
      ntoks1 = nsubtoks (parser, tindex - 1);
      return 1 + ntoks1 + nsubtoks (parser, tindex - 1 - ntoks1);
    }
}

/* Copy the given subexpression to the top of the tree.  */
static void
copytoks (fsaparse_ctxt_t *parser, size_t tindex, size_t ntokens)
{
  size_t i;

  if (parser->multibyte)
    for (i = 0; i < ntokens; ++i)
      addtok_mb (parser, parser->tokens[tindex + i], parser->multibyte_prop[tindex + i]);
  else
    for (i = 0; i < ntokens; ++i)
      addtok_mb (parser, parser->tokens[tindex + i], 3);
}

/* Rewriting fsaparse:closure () from scratch; original is clever but a
   little tricky to follow, so I'm trying to break up a while + compound-if
   loop into a simpler construct (more like a finite-state machine).  Also,
   edits such as replacing "dfa->" with "parser->" are done here, adding
   "parser" as a parameter in lots of places, as well as the long-winded
   FSATOKEN_TK_" prefix.

   I'm not sure if this version is an improvement over the original; the
   need to use "parser->lookahead_token" instead of "tok" influenced my
   decision to try this... but the jury is still out.  */
static void
closure (fsaparse_ctxt_t *parser)
{
restart_closure:
  atom (parser);
  for (;;)
    {
      switch (parser->lookahead_token)
        {
          case FSATOKEN_TK_QMARK:
          case FSATOKEN_TK_STAR:
          case FSATOKEN_TK_PLUS:
            addtok (parser, parser->lookahead_token);
            parser->lookahead_token = parser->lexer (parser->lex_context);
            continue;

          case FSATOKEN_TK_REPMN:
            /* REPMN needs extra work; move outside the switch statement.  */
            break;

          default:
            /* Merely let the intial atom call stand as our return result.  */
            return;
        }

      /* Deal with REPMN{min, max} cases in a separate block.  */
      {
        int i;
        size_t prev_sub_index, ntokens;
        int minrep, maxrep;

        /* Get the {min, max} pair decoded by the lexer.  */
        minrep = parser->lex_exchange (parser->lex_context,
                                       PROTO_LEXPARSE_OP_GET_REPMN_MIN,
                                       NULL);
        maxrep = parser->lex_exchange (parser->lex_context,
                                       PROTO_LEXPARSE_OP_GET_REPMN_MAX,
                                       NULL);

        /* Find out how many tokens are in the peceding token list that are
           covered by this REPMN directive.  This involves carefully working
           backwards through the linear, postfix token ordering.  */
        ntokens = nsubtoks (parser, parser->tindex);

        /* If min and max are both zero, merely remove preceding
           subexpression, get a new token, and restart the atom/closure
           processing from the top of the function.  Not sure if people will
           like this goto statement, but we'll give it a whirl.   */
        if (minrep == 0 && maxrep == 0)
          {
            parser->tindex -= ntokens;
            parser->lookahead_token = parser->lexer (parser->lex_context);
            goto restart_closure;
          }

        /* Non-zero min or max, defined as follows:
             {n}   The preceding item is matched exactly n times.
             {n,}  The preceding item is matched n or more times.
             {,m}  The preceding item is matched at most m times (GNU ext.)
             {n,m} The preceding item is matched at least n, but not more
                   than m times.
           For {n,} and {,m} cases, the omitted parameter is reported here
           as a negative value.  */
        prev_sub_index = parser->tindex - ntokens;
        if (maxrep < 0)
          addtok (parser, FSATOKEN_TK_PLUS);
        if (minrep == 0)
          addtok (parser, FSATOKEN_TK_QMARK);
        for (i = 1; i < minrep; ++i)
          {
            copytoks (parser, prev_sub_index, ntokens);
            addtok (parser, FSATOKEN_TK_CAT);
          }
        for (; i < maxrep; ++i)
          {
            copytoks (parser, prev_sub_index, ntokens);
            addtok (parser, FSATOKEN_TK_QMARK);
            addtok (parser, FSATOKEN_TK_CAT);
          }
        /* Prime the parser with the next token after REPMN and loop.  */
        parser->lookahead_token = parser->lexer (parser->lex_context);
      }
    }
}

static void
branch (fsaparse_ctxt_t *parser)
{
  fsatoken_token_t tok;

  closure (parser);
  tok = parser->lookahead_token;
  while (tok != FSATOKEN_TK_RPAREN && tok != FSATOKEN_TK_OR && tok >= 0)
    {
      closure (parser);
      tok = parser->lookahead_token;
      addtok (parser, FSATOKEN_TK_CAT);
    }
}

static void
regexp (fsaparse_ctxt_t *parser)
{
  branch (parser);
  while (parser->lookahead_token == FSATOKEN_TK_OR)
    {
      parser->lookahead_token = parser->lexer (parser->lex_context);
      branch (parser);
      addtok (parser, FSATOKEN_TK_OR);
    }
}

/* Main entry point for the parser.  Parser is a pointer to a parser
   context struct created by fsaparse_new.  Before calling this function,
   the parser instance must be supplied with a lexer (fsaparse_lexer), and
   also with callback functions to receive warning and error reports
   (fsaparse_esception_fns).  */
void
fsaparse_parse (fsaparse_ctxt_t *parser)
{
  /* Obtain an initial token for lookahead, and keep tracking tree depth.  */
  parser->lookahead_token = parser->lexer (parser->lex_context);
  parser->current_depth = parser->depth;

  /* Run regexp to manage the next level of parsing.  */
  regexp (parser);
  if (parser->lookahead_token != FSATOKEN_TK_END)
    parser->abandon_with_error (_("unbalanced )"));

  /* If multiple expressions are parsed, second and subsequent patters are
     presented as alternatives to preceding patterns.  */
  addtok (parser, FSATOKEN_TK_END - parser->nregexps);
  addtok (parser, FSATOKEN_TK_CAT);
  if (parser->nregexps)
    addtok (parser, FSATOKEN_TK_OR);

  ++parser->nregexps;
}

/* Receive functions to deal with exceptions detected by the parser:
   Warnings and errors.  Internally, we add the _Noreturn attribute
   to the error callback, to help the compiler with code flow
   analysis.  */
extern void
fsaparse_exception_fns (fsaparse_ctxt_t *parser,
                      fsaparse_warn_callback_fn *warningfn,
                      fsaparse_error_callback_fn *errorfn)
{
  /* Exception handling is done by explicit callbacks.  */
  parser->warn_client = warningfn;
  parser->abandon_with_error = errorfn;
}

/* Add "not provided!" stub function that gets called if the client
   fails to provide proper resources.  This is a hack, merely to get the
   module started; better treatment needs to be added later.  */
static void
no_function_provided (void *unused)
{
  assert (!"fsaparse: Plug-in function required, but not provided.");
}

/* Receiver a lexer function, plus lexer instance context pointer, for use by
   the parser.  Although not needed initially, this plug-in architecture may
   be useful in the future, and it breaks up some of the intricate
   connections that made the original dfa.c code so daunting.  */
void
fsaparse_lexer (fsaparse_ctxt_t *parser,
                void *lexer_context,
                proto_lexparse_lex_fn_t *lex_fn,
                proto_lexparse_exchange_fn_t *lex_exchange_fn)
{
  bool is_multibyte;
  int wchars_max;

  /* Record supplied lexer function and context for use later.  */
  parser->lex_context  = lexer_context;
  parser->lexer        = lex_fn;
  parser->lex_exchange = lex_exchange_fn;

  /* Query lexer to get multibyte nature of this locale.  */
  is_multibyte
      = lex_exchange_fn (lexer_context,
                         PROTO_LEXPARSE_OP_GET_IS_MULTIBYTE_LOCALE,
                         NULL);
  parser->multibyte_locale = is_multibyte;
  parser->unibyte_locale = ! is_multibyte;

  /* Set up WCHAR token infrastructure:  Wide-char list/array/whatever.  We
     free the wide_char_list first, in case someone calls this function
     multiple times, so that earlier allocations are correctly freed up, and
     we use the value advised by the most recent lexer presented to us.  */
  wchars_max = lex_exchange_fn (lexer_context,
                                PROTO_LEXPARSE_OP_GET_WIDE_CHAR_LIST_MAX,
                                NULL);
  parser->wide_chars_max = wchars_max;
  free (parser->wide_char_list);
  parser->wide_char_list = NULL;
  if (wchars_max)
    parser->wide_char_list = xnmalloc (wchars_max, sizeof (wchar_t));
}

/* Generate a new instance of an FSA parser.  */
fsaparse_ctxt_t *
fsaparse_new (void)
{
  fsaparse_ctxt_t *new_context;

  /* Acquire zeroed memory for new parser context.  */
  new_context = XZALLOC (fsaparse_ctxt_t);

  /* ?? Point warning, error and lexer functions to a "you need to tell me
     these first!" function? */
  new_context->warn_client        = (fsaparse_warn_callback_fn *)
                                    no_function_provided;
  new_context->abandon_with_error = (fsaparse_error_callback_fn *)
                                    no_function_provided;
  new_context->lexer              = (fsaparse_lexer_fn_t  *)
                                    no_function_provided;

  /* Default to unibyte locale... but we synchronise with the lexer later. */
  new_context->multibyte_locale = false;
  new_context->unibyte_locale = true;

  /* Add this instance at the head of the module's list.  */
  new_context->next_instance = fsaparse_instance_list_head;
  fsaparse_instance_list_head = new_context;

  return new_context;
}

/* After parsing, report a list of tokens describing the pattern.  Complex
   structures such as alternation, backreferences, and locale-induced
   complexity such as variable-length utf8 sequences are described here by
   appending operators that apply to the preceding item(s) (postfix
   notation).  */
void
fsaparse_get_token_list (fsaparse_ctxt_t *parser,
                         size_t *nr_tokens,
                         fsatoken_token_t **token_list)
{
  *nr_tokens  = parser->tindex;
  *token_list = parser->tokens;
}

/* Internal function to free all resources directly or indirectly used by an
   instance.  The pointer is no longer valid after this call.  */
static void
free_instance (fsaparse_ctxt_t *parser)
{
  free (parser->wide_char_list);
  free (parser->tokens);
  free (parser->multibyte_prop);
  free (parser);
}

/* Destroy all parser instances, plus any associated resources owned by the
   module.  */
void
fsaparse_destroy_module (void)
{
  fsaparse_ctxt_t *p_list;
  fsaparse_ctxt_t *p_next;

  /* Move the global list head into a local variable, and immediately clear
     the global.  This is a half-hearted attempt to avoid race conditions;
     to do things properly, a system-wide atomic operation (locked,
     including multi-CPU cache coherency) operation should be used.  */
  p_list = fsaparse_instance_list_head;
  fsaparse_instance_list_head = NULL;

  /* Traverse the list of instances, releasing all resources associated
     with each one.  */
  while (p_list)
    {
      p_next = p_list->next_instance;
      free_instance (p_list);
      p_list = p_next;
    }
}

/* Prepare module for operation.  */
void
fsaparse_initialise (void)
{
  /* Initialise the linked list of instances created by this module.  */
  fsaparse_instance_list_head = NULL;

  /* Clean up resources upon exit.  */
  atexit (fsaparse_destroy_module);
}

/* vim:set shiftwidth=2: */