/* fsaparse -- Build a structure naming relationships (sequences, alternatives, backreferences, options and precedence) of tokens Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2014 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */ /* Written June, 1988 by Mike Haertel Modified July, 1988 by Arthur David Olson to assist BMG speedups */ /* 2014: Repackaged by "untangle" script, written by behoffski. */ /* This function receives a stream of tokens from fsalex, and processes them to impose precedence rules and to describe complex pattern elements that are beyond the capability of the simple lexer. In addition to the cases explicit in the syntax (e.g."(ab|c)", variable-length multibyte encodings (UTF-8; codesets including modifiers and/or shift items) also require these enhanced facilities. */ /* Always import environment-specific configuration items first. */ #include #include #include "charclass.h" #include "fsaparse.h" #include "fsalex.h" #include "fsatoken.h" #include "mbcsets.h" #include "proto-lexparse.h" #include #include #include "xalloc.h" /* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */ #include "gettext.h" #define _(str) gettext (str) #include #include #if HAVE_LANGINFO_CODESET # include #endif /* fsaparse_ctxt: Gather all the context to do with the parser into a single struct. We do this mainly because it make it easier to contemplate having multiple instances of this module running in parallel, but also because it makes translating from "dfa->" easier. This definition fleshes out the opaque type given in the module header. */ struct fsaparse_ctxt_struct { /* Singly-linked list of all parser instances, so destroy_module can release all resources by traversing the list. */ fsaparse_ctxt_t *next_instance; /* Warning and abort functions provided by client. */ fsalex_warn_callback_fn *warn_client; fsalex_error_callback_fn *abandon_with_error; /* Plug-in functions and context to deal with lexer at arm's length. */ proto_lexparse_lex_fn_t *lexer; proto_lexparse_exchange_fn_t *lex_exchange; void *lex_context; /* Information about locale (needs to sync with lexer...?) */ bool multibyte_locale; bool unibyte_locale; /* Variable to store a dynamically-allocated list of wide characters. The parser needs to fetch this list whenever a WCHAR token is received. */ int wide_chars_max; wchar_t *wide_char_list; fsatoken_token_t lookahead_token; size_t current_depth; /* Current depth of a hypothetical stack holding deferred productions. This is used to determine the depth that will be required of the real stack later on in dfaanalyze. */ /* Fields filled by the parser. */ fsatoken_token_t *tokens; /* Postfix parse array. */ size_t tindex; /* Index for adding new tokens. */ size_t talloc; /* Number of tokens currently allocated. */ size_t depth; /* Depth required of an evaluation stack used for depth-first traversal of the parse tree. */ size_t nleaves; /* Number of leaves on the parse tree. */ size_t nregexps; /* Count of parallel regexps being built with dfaparse. */ bool fast; /* The DFA is fast. */ bool multibyte; /* MB_CUR_MAX > 1. */ fsatoken_token_t utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ mbstate_t mbs; /* Multibyte conversion state. */ /* The following are valid only if MB_CUR_MAX > 1. */ /* The value of multibyte_prop[i] is defined by following rule. if tokens[i] < NOTCHAR bit 0 : tokens[i] is the first byte of a character, including single-byte characters. bit 1 : tokens[i] is the last byte of a character, including single-byte characters. if tokens[i] = MBCSET ("the index of mbcsets corresponding to this operator" << 2) + 3 e.g. tokens = 'single_byte_a', 'multi_byte_A', single_byte_b' = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b' multibyte_prop = 3 , 1 , 0 , 2 , 3 */ int *multibyte_prop; mbcsets_set_t **mbcsets; size_t nmbcsets; size_t mbcsets_alloc; }; /* Linked list of all instances created by this module. */ static fsaparse_ctxt_t *fsaparse_instance_list_head = NULL; /* Ensure that the array addressed by PTR holds at least NITEMS + (PTR || !NITEMS) items. Either return PTR, or reallocate the array and return its new address. Although PTR may be null, the returned value is never null. The array holds *NALLOC items; *NALLOC is updated on reallocation. ITEMSIZE is the size of one item. Avoid O(N**2) behavior on arrays growing linearly. */ static void * maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize) { if (nitems < *nalloc) return ptr; *nalloc = nitems; return x2nrealloc (ptr, nalloc, itemsize); } /* UTF-8 encoding allows some optimizations that we can't otherwise assume in a multibyte encoding. */ static int using_utf8 (void) { static int utf8 = -1; if (utf8 < 0) { wchar_t wc; mbstate_t mbs = { 0 }; utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; } return utf8; } /* Recursive descent parser for regular expressions. */ static void addtok_mb (fsaparse_ctxt_t *parser, fsatoken_token_t t, int mbprop) { if (parser->talloc == parser->tindex) { parser->tokens = x2nrealloc (parser->tokens, &parser->talloc, sizeof *parser->tokens); if (parser->multibyte) parser->multibyte_prop = xnrealloc (parser->multibyte_prop, parser->talloc, sizeof *parser->multibyte_prop); } if (parser->multibyte) parser->multibyte_prop[parser->tindex] = mbprop; parser->tokens[parser->tindex++] = t; switch (t) { case FSATOKEN_TK_QMARK: case FSATOKEN_TK_STAR: case FSATOKEN_TK_PLUS: break; case FSATOKEN_TK_CAT: case FSATOKEN_TK_OR: --parser->current_depth; break; case FSATOKEN_TK_BACKREF: parser->fast = false; /* fallthrough */ default: ++parser->nleaves; /* fallthrough */ case FSATOKEN_TK_EMPTY: ++parser->current_depth; break; } if (parser->depth < parser->current_depth) parser->depth = parser->current_depth; } static void addtok_wc (fsaparse_ctxt_t *parser, wint_t wc); /* Sigh. Mbcsets is so disruptive (intimate details hidden, by design), that we are writing addtok here from scratch. */ static void addtok (fsaparse_ctxt_t *parser, fsatoken_token_t t) { bool need_or = false; mbcsets_set_t *work_mbc; bool invert; charclass_t *charclass; size_t nchars; size_t nch_classes; size_t nranges; size_t nequivs; size_t ncoll_elems; if (t != FSATOKEN_TK_MBCSET || parser->unibyte_locale) { addtok_mb (parser, t, 3); return; } work_mbc = parser->mbcsets[parser->nmbcsets - 1]; mbcsets_get_characteristics (work_mbc, &invert, &charclass, &nchars, &nch_classes, &nranges, &nequivs, &ncoll_elems); if (nchars && !invert) { size_t i; wchar_t *char_list; char_list = xnmalloc (nchars, sizeof *char_list); mbcsets_get_chars (work_mbc, char_list); addtok_wc (parser, char_list[0]); for (i = 1; i < nchars; i++) { addtok_wc (parser, char_list[i]); addtok (parser, FSATOKEN_TK_OR); } free (char_list); need_or = true; } /* If the set contains non-trivial components, it's too hard to deal with here, so hand it on to higher-up machinery. */ if (invert || nch_classes || nranges || ncoll_elems) { addtok_mb (parser, FSATOKEN_TK_MBCSET, ((parser->nmbcsets - 1) << 2) + 3); if (need_or) addtok (parser, FSATOKEN_TK_OR); } /* Individual characters have been handled above, so the only case remaining is where codes have been simplified into charclass members. */ else if (charclass) { addtok (parser, FSATOKEN_TK_CSET + charclass_get_index (charclass)); if (need_or) addtok (parser, FSATOKEN_TK_OR); } } /* We treat a multibyte character as a single atom, so that DFA can treat a multibyte character as a single expression. e.g., we construct the following tree from "". */ static void addtok_wc (fsaparse_ctxt_t *parser, wint_t wc) { unsigned char buf[MB_LEN_MAX]; mbstate_t s = { 0 }; int i; int cur_mb_len; size_t stored_bytes = wcrtomb ((char *) buf, wc, &s); if (stored_bytes != (size_t) -1) cur_mb_len = stored_bytes; else { /* This is merely stop-gap. buf[0] is undefined, yet skipping the addtok_mb call altogether can corrupt the heap. */ cur_mb_len = 1; buf[0] = 0; } addtok_mb (parser, buf[0], cur_mb_len == 1 ? 3 : 1); for (i = 1; i < cur_mb_len; i++) { addtok_mb (parser, buf[i], i == cur_mb_len - 1 ? 2 : 0); addtok (parser, FSATOKEN_TK_CAT); } } static void add_utf8_anychar (fsaparse_ctxt_t *parser) { unsigned int i; /* Have we set up the classes for the 1-byte to 4-byte sequence types? */ if (parser->utf8_anychar_classes[0] == 0) { /* No, first time we've been called, so set them up now. */ charclass_t *ccl; const charclass_t *dotclass; /* Index 0: 80-bf -- Non-leading bytes. */ ccl = charclass_alloc (); charclass_setbit_range (0x80, 0xbf, ccl); parser->utf8_anychar_classes[0] = charclass_completed (ccl); /* Index 1: 00-7f -- 1-byte leading seq, minus dotclass exceptions. */ ccl = charclass_alloc (); charclass_setbit_range (0x00, 0x7f, ccl); fsalex_exchange (parser->lex_context, PROTO_LEXPARSE_OP_GET_DOTCLASS, &dotclass); charclass_intersectset (dotclass, ccl); parser->utf8_anychar_classes[1] = charclass_completed (ccl); /* Index 2: c2-df -- 2-byte sequence. */ ccl = charclass_alloc (); charclass_setbit_range (0xc2, 0xdf, ccl); parser->utf8_anychar_classes[2] = charclass_completed (ccl); /* Index 2: e0-ef -- 3-byte sequence. */ ccl = charclass_alloc (); charclass_setbit_range (0xe0, 0xef, ccl); parser->utf8_anychar_classes[3] = charclass_completed (ccl); /* Index 2: f0-f7 -- 4-byte sequence. */ ccl = charclass_alloc (); charclass_setbit_range (0xf0, 0xf7, ccl); parser->utf8_anychar_classes[4] = charclass_completed (ccl); } /* A valid UTF-8 character is ([0x00-0x7f] |[0xe0-0xef][0x80-0xbf][0x80-0xbf]xc2-0xdf][0x80-0xbf] |[0xe0-0xef][0x80-0xbf][0x80-0xbf]xe0-0xef[0x80-0xbf][0x80-0xbf] |[0xe0-0xef][0x80-0xbf][0x80-0xbf]xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf]) which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f] and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */ /* Write out leaf tokens for each of the four possible starting bytes. */ for (i = 1; i < 5; i++) addtok (parser, FSATOKEN_TK_CSET + parser->utf8_anychar_classes[i]); /* Add follow-on classes, plus tokens to build a postfix tree covering all four alternatives of valid UTF-8 sequences. */ for (i = 1; i <= 3; i++) { addtok (parser, FSATOKEN_TK_CSET + parser->utf8_anychar_classes[0]); addtok (parser, FSATOKEN_TK_CAT); addtok (parser, FSATOKEN_TK_OR); } } /* The grammar understood by the parser is as follows. regexp: regexp FSATOKEN_TK_OR branch branch branch: branch closure closure closure: closure FSATOKEN_TK_QMARK closure FSATOKEN_TK_STAR closure FSATOKEN_TK_PLUS closure FSATOKEN_TK_REPMN atom atom: FSATOKEN_TK_ANYCHAR FSATOKEN_TK_MBCSET FSATOKEN_TK_CSET FSATOKEN_TK_BACKREF FSATOKEN_TK_BEGLINE FSATOKEN_TK_ENDLINE FSATOKEN_TK_BEGWORD FSATOKEN_TK_ENDWORD FSATOKEN_TK_LIMWORD FSATOKEN_TK_NOTLIMWORD FSATOKEN_TK_LPAREN regexp FSATOKEN_TK_RPAREN The parser builds a parse tree in postfix form in an array of tokens. */ /* Provide a forward declaration for regexp, as it is at the top of the parse tree, but is referenced by atom, at the bottom of the tree. */ static void regexp (fsaparse_ctxt_t *parser); /* 3 June 2014: I must be a sucker for punishment, and/or going crazy; whatever. I've looked at the partially-edited code for "atom", looking at it in the (incomplete) light of introducing the mbcsets change... and have decided, just as I previously did with closure () below, to rewrite it from scratch. So, here goes... */ static void atom (fsaparse_ctxt_t *parser) { fsatoken_token_t tok = parser->lookahead_token; /* For a unibyte character, it is its own token. */ if (tok >= 0 && tok < FSATOKEN_NOTCHAR) { addtok (parser, tok); parser->lookahead_token = parser->lexer (parser->lex_context); return; } /* For a unibyte character set (CSET + index), it is its own token. */ if (tok >= FSATOKEN_TK_CSET) { addtok (parser, tok); parser->lookahead_token = parser->lexer (parser->lex_context); return; } /* For ANYCHAR in an UTF-8 setting, use a hierarchy of CSETs. */ if (tok == FSATOKEN_TK_ANYCHAR && using_utf8 ()) { /* For UTF-8, expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. I'm pretty sure it would be both profitable and correct to do it for any encoding; however, the optimization must be done manually as it is done above in add_utf8_anychar. So, let's start with UTF-8: it is the most used, and the structure of the encoding makes the correctness more obvious. */ add_utf8_anychar (parser); parser->lookahead_token = parser->lexer (parser->lex_context); return; } /* Use switch to carve up the remaining cases into groups. */ switch (tok) { case FSATOKEN_TK_WCHAR: { /* Wide character, possibly including equivalent characters (probably due to selection of case folding). */ int nr_wide_chars; nr_wide_chars = fsalex_exchange (parser->lex_context, PROTO_LEXPARSE_OP_GET_WIDE_CHARS, parser->wide_char_list); assert(nr_wide_chars >= 1); if (parser->wide_char_list[0] == WEOF) addtok (parser, FSATOKEN_TK_BACKREF); else { int i; addtok_wc (parser, parser->wide_char_list[0]); for (i = 1; i < nr_wide_chars; i++) { addtok_wc (parser, parser->wide_char_list[i]); addtok (parser, FSATOKEN_TK_OR); } } parser->lookahead_token = parser->lexer (parser->lex_context); return; } case FSATOKEN_TK_MBCSET: { mbcsets_set_t *work_mbc; /* Acquire space to store set pointer. Sadly (?), fsaparse maintains a duplicate list to fsalex at present. */ parser->mbcsets = maybe_realloc (parser->mbcsets, parser->nmbcsets, &parser->mbcsets_alloc, sizeof work_mbc); /* Get set pointer from fsalex, and add it to the list. */ fsalex_exchange (parser->lex_context, PROTO_LEXPARSE_OP_GET_MBCSET, &work_mbc); parser->mbcsets[parser->nmbcsets++] = work_mbc; /* We could fall-through case labels to the addtok code below, but am keeping code sections separate due to an overabundance of caution. */ addtok (parser, tok); parser->lookahead_token = parser->lexer (parser->lex_context); return; } case FSATOKEN_TK_ANYCHAR: case FSATOKEN_TK_BACKREF: case FSATOKEN_TK_BEGLINE: case FSATOKEN_TK_ENDLINE: case FSATOKEN_TK_BEGWORD: case FSATOKEN_TK_ENDWORD: case FSATOKEN_TK_LIMWORD: case FSATOKEN_TK_NOTLIMWORD: addtok (parser, tok); parser->lookahead_token = parser->lexer (parser->lex_context); return; case FSATOKEN_TK_LPAREN: parser->lookahead_token = parser->lexer (parser->lex_context); regexp (parser); if (parser->lookahead_token != FSATOKEN_TK_RPAREN) parser->abandon_with_error (_("unbalanced (")); parser->lookahead_token = parser->lexer (parser->lex_context); return; default: addtok (parser, FSATOKEN_TK_EMPTY); return; } /* NOTREACHED */ } /* Return the number of tokens in the given subexpression. */ static size_t _GL_ATTRIBUTE_PURE nsubtoks (fsaparse_ctxt_t *parser, size_t tindex) { size_t ntoks1; switch (parser->tokens[tindex - 1]) { default: return 1; case FSATOKEN_TK_QMARK: case FSATOKEN_TK_STAR: case FSATOKEN_TK_PLUS: return 1 + nsubtoks (parser, tindex - 1); case FSATOKEN_TK_CAT: case FSATOKEN_TK_OR: ntoks1 = nsubtoks (parser, tindex - 1); return 1 + ntoks1 + nsubtoks (parser, tindex - 1 - ntoks1); } } /* Copy the given subexpression to the top of the tree. */ static void copytoks (fsaparse_ctxt_t *parser, size_t tindex, size_t ntokens) { size_t i; if (parser->multibyte) for (i = 0; i < ntokens; ++i) addtok_mb (parser, parser->tokens[tindex + i], parser->multibyte_prop[tindex + i]); else for (i = 0; i < ntokens; ++i) addtok_mb (parser, parser->tokens[tindex + i], 3); } /* Rewriting fsaparse:closure () from scratch; original is clever but a little tricky to follow, so I'm trying to break up a while + compound-if loop into a simpler construct (more like a finite-state machine). Also, edits such as replacing "dfa->" with "parser->" are done here, adding "parser" as a parameter in lots of places, as well as the long-winded FSATOKEN_TK_" prefix. I'm not sure if this version is an improvement over the original; the need to use "parser->lookahead_token" instead of "tok" influenced my decision to try this... but the jury is still out. */ static void closure (fsaparse_ctxt_t *parser) { restart_closure: atom (parser); for (;;) { switch (parser->lookahead_token) { case FSATOKEN_TK_QMARK: case FSATOKEN_TK_STAR: case FSATOKEN_TK_PLUS: addtok (parser, parser->lookahead_token); parser->lookahead_token = parser->lexer (parser->lex_context); continue; case FSATOKEN_TK_REPMN: /* REPMN needs extra work; move outside the switch statement. */ break; default: /* Merely let the intial atom call stand as our return result. */ return; } /* Deal with REPMN{min, max} cases in a separate block. */ { int i; size_t prev_sub_index, ntokens; int minrep, maxrep; /* Get the {min, max} pair decoded by the lexer. */ minrep = parser->lex_exchange (parser->lex_context, PROTO_LEXPARSE_OP_GET_REPMN_MIN, NULL); maxrep = parser->lex_exchange (parser->lex_context, PROTO_LEXPARSE_OP_GET_REPMN_MAX, NULL); /* Find out how many tokens are in the peceding token list that are covered by this REPMN directive. This involves carefully working backwards through the linear, postfix token ordering. */ ntokens = nsubtoks (parser, parser->tindex); /* If min and max are both zero, merely remove preceding subexpression, get a new token, and restart the atom/closure processing from the top of the function. Not sure if people will like this goto statement, but we'll give it a whirl. */ if (minrep == 0 && maxrep == 0) { parser->tindex -= ntokens; parser->lookahead_token = parser->lexer (parser->lex_context); goto restart_closure; } /* Non-zero min or max, defined as follows: {n} The preceding item is matched exactly n times. {n,} The preceding item is matched n or more times. {,m} The preceding item is matched at most m times (GNU ext.) {n,m} The preceding item is matched at least n, but not more than m times. For {n,} and {,m} cases, the omitted parameter is reported here as a negative value. */ prev_sub_index = parser->tindex - ntokens; if (maxrep < 0) addtok (parser, FSATOKEN_TK_PLUS); if (minrep == 0) addtok (parser, FSATOKEN_TK_QMARK); for (i = 1; i < minrep; ++i) { copytoks (parser, prev_sub_index, ntokens); addtok (parser, FSATOKEN_TK_CAT); } for (; i < maxrep; ++i) { copytoks (parser, prev_sub_index, ntokens); addtok (parser, FSATOKEN_TK_QMARK); addtok (parser, FSATOKEN_TK_CAT); } /* Prime the parser with the next token after REPMN and loop. */ parser->lookahead_token = parser->lexer (parser->lex_context); } } } static void branch (fsaparse_ctxt_t *parser) { fsatoken_token_t tok; closure (parser); tok = parser->lookahead_token; while (tok != FSATOKEN_TK_RPAREN && tok != FSATOKEN_TK_OR && tok >= 0) { closure (parser); tok = parser->lookahead_token; addtok (parser, FSATOKEN_TK_CAT); } } static void regexp (fsaparse_ctxt_t *parser) { branch (parser); while (parser->lookahead_token == FSATOKEN_TK_OR) { parser->lookahead_token = parser->lexer (parser->lex_context); branch (parser); addtok (parser, FSATOKEN_TK_OR); } } /* Main entry point for the parser. Parser is a pointer to a parser context struct created by fsaparse_new. Before calling this function, the parser instance must be supplied with a lexer (fsaparse_lexer), and also with callback functions to receive warning and error reports (fsaparse_esception_fns). */ void fsaparse_parse (fsaparse_ctxt_t *parser) { /* Obtain an initial token for lookahead, and keep tracking tree depth. */ parser->lookahead_token = parser->lexer (parser->lex_context); parser->current_depth = parser->depth; /* Run regexp to manage the next level of parsing. */ regexp (parser); if (parser->lookahead_token != FSATOKEN_TK_END) parser->abandon_with_error (_("unbalanced )")); /* If multiple expressions are parsed, second and subsequent patters are presented as alternatives to preceding patterns. */ addtok (parser, FSATOKEN_TK_END - parser->nregexps); addtok (parser, FSATOKEN_TK_CAT); if (parser->nregexps) addtok (parser, FSATOKEN_TK_OR); ++parser->nregexps; } /* Receive functions to deal with exceptions detected by the parser: Warnings and errors. Internally, we add the _Noreturn attribute to the error callback, to help the compiler with code flow analysis. */ extern void fsaparse_exception_fns (fsaparse_ctxt_t *parser, fsaparse_warn_callback_fn *warningfn, fsaparse_error_callback_fn *errorfn) { /* Exception handling is done by explicit callbacks. */ parser->warn_client = warningfn; parser->abandon_with_error = errorfn; } /* Add "not provided!" stub function that gets called if the client fails to provide proper resources. This is a hack, merely to get the module started; better treatment needs to be added later. */ static void no_function_provided (void *unused) { assert (!"fsaparse: Plug-in function required, but not provided."); } /* Receiver a lexer function, plus lexer instance context pointer, for use by the parser. Although not needed initially, this plug-in architecture may be useful in the future, and it breaks up some of the intricate connections that made the original dfa.c code so daunting. */ void fsaparse_lexer (fsaparse_ctxt_t *parser, void *lexer_context, proto_lexparse_lex_fn_t *lex_fn, proto_lexparse_exchange_fn_t *lex_exchange_fn) { bool is_multibyte; int wchars_max; /* Record supplied lexer function and context for use later. */ parser->lex_context = lexer_context; parser->lexer = lex_fn; parser->lex_exchange = lex_exchange_fn; /* Query lexer to get multibyte nature of this locale. */ is_multibyte = lex_exchange_fn (lexer_context, PROTO_LEXPARSE_OP_GET_IS_MULTIBYTE_LOCALE, NULL); parser->multibyte_locale = is_multibyte; parser->unibyte_locale = ! is_multibyte; /* Set up WCHAR token infrastructure: Wide-char list/array/whatever. We free the wide_char_list first, in case someone calls this function multiple times, so that earlier allocations are correctly freed up, and we use the value advised by the most recent lexer presented to us. */ wchars_max = lex_exchange_fn (lexer_context, PROTO_LEXPARSE_OP_GET_WIDE_CHAR_LIST_MAX, NULL); parser->wide_chars_max = wchars_max; free (parser->wide_char_list); parser->wide_char_list = NULL; if (wchars_max) parser->wide_char_list = xnmalloc (wchars_max, sizeof (wchar_t)); } /* Generate a new instance of an FSA parser. */ fsaparse_ctxt_t * fsaparse_new (void) { fsaparse_ctxt_t *new_context; /* Acquire zeroed memory for new parser context. */ new_context = XZALLOC (fsaparse_ctxt_t); /* ?? Point warning, error and lexer functions to a "you need to tell me these first!" function? */ new_context->warn_client = (fsaparse_warn_callback_fn *) no_function_provided; new_context->abandon_with_error = (fsaparse_error_callback_fn *) no_function_provided; new_context->lexer = (fsaparse_lexer_fn_t *) no_function_provided; /* Default to unibyte locale... but we synchronise with the lexer later. */ new_context->multibyte_locale = false; new_context->unibyte_locale = true; /* Add this instance at the head of the module's list. */ new_context->next_instance = fsaparse_instance_list_head; fsaparse_instance_list_head = new_context; return new_context; } /* After parsing, report a list of tokens describing the pattern. Complex structures such as alternation, backreferences, and locale-induced complexity such as variable-length utf8 sequences are described here by appending operators that apply to the preceding item(s) (postfix notation). */ void fsaparse_get_token_list (fsaparse_ctxt_t *parser, size_t *nr_tokens, fsatoken_token_t **token_list) { *nr_tokens = parser->tindex; *token_list = parser->tokens; } /* Internal function to free all resources directly or indirectly used by an instance. The pointer is no longer valid after this call. */ static void free_instance (fsaparse_ctxt_t *parser) { free (parser->wide_char_list); free (parser->tokens); free (parser->multibyte_prop); free (parser); } /* Destroy all parser instances, plus any associated resources owned by the module. */ void fsaparse_destroy_module (void) { fsaparse_ctxt_t *p_list; fsaparse_ctxt_t *p_next; /* Move the global list head into a local variable, and immediately clear the global. This is a half-hearted attempt to avoid race conditions; to do things properly, a system-wide atomic operation (locked, including multi-CPU cache coherency) operation should be used. */ p_list = fsaparse_instance_list_head; fsaparse_instance_list_head = NULL; /* Traverse the list of instances, releasing all resources associated with each one. */ while (p_list) { p_next = p_list->next_instance; free_instance (p_list); p_list = p_next; } } /* Prepare module for operation. */ void fsaparse_initialise (void) { /* Initialise the linked list of instances created by this module. */ fsaparse_instance_list_head = NULL; /* Clean up resources upon exit. */ atexit (fsaparse_destroy_module); } /* vim:set shiftwidth=2: */