[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[RFC PATCH v2] pcre: migrate to pcre2
From: |
Carlo Marcelo Arenas Belón |
Subject: |
[RFC PATCH v2] pcre: migrate to pcre2 |
Date: |
Wed, 13 Oct 2021 20:12:23 -0700 |
This assumes a very recent version of the pcre2 library (PCRE2_MATCH_INVALID_UTF
was introduced in 10.34), but a test for it is still missing.
I removed some optimizations that might need to be brought back once tested.
I also had to add some casts to get rid of some warnings, which I didn't really
like, and some error checking may be missing.
Signed-off-by: Carlo Marcelo Arenas Belón <carenas@gmail.com>
---
configure.ac | 2 +-
m4/{pcre.m4 => pcre2.m4} | 23 +++--
src/pcresearch.c | 210 ++++++++++++++-------------------------
tests/filename-lineno.pl | 4 +-
4 files changed, 90 insertions(+), 149 deletions(-)
rename m4/{pcre.m4 => pcre2.m4} (67%)
diff --git a/configure.ac b/configure.ac
index c49ec4a..9291cee 100644
--- a/configure.ac
+++ b/configure.ac
@@ -197,7 +197,7 @@ if test "$ac_use_included_regex" = no; then
AC_MSG_WARN([Included lib/regex.c not used])
fi
-gl_FUNC_PCRE
+gl_FUNC_PCRE2
AM_CONDITIONAL([USE_PCRE], [test $use_pcre = yes])
case $host_os in
diff --git a/m4/pcre.m4 b/m4/pcre2.m4
similarity index 67%
rename from m4/pcre.m4
rename to m4/pcre2.m4
index 78b7fda..7970c4e 100644
--- a/m4/pcre.m4
+++ b/m4/pcre2.m4
@@ -1,15 +1,15 @@
-# pcre.m4 - check for libpcre support
+# pcre2.m4 - check for libpcre2 support
# Copyright (C) 2010-2021 Free Software Foundation, Inc.
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
-AC_DEFUN([gl_FUNC_PCRE],
+AC_DEFUN([gl_FUNC_PCRE2],
[
AC_ARG_ENABLE([perl-regexp],
AS_HELP_STRING([--disable-perl-regexp],
- [disable perl-regexp (pcre) support]),
+ [disable perl-regexp (pcre2) support]),
[case $enableval in
yes|no) test_pcre=$enableval;;
*) AC_MSG_ERROR([invalid value $enableval for --disable-perl-regexp]);;
@@ -21,24 +21,25 @@ AC_DEFUN([gl_FUNC_PCRE],
use_pcre=no
if test $test_pcre != no; then
- PKG_CHECK_MODULES([PCRE], [libpcre], [], [: ${PCRE_LIBS=-lpcre}])
+ PKG_CHECK_MODULES([PCRE], [libpcre2-8], [], [: ${PCRE_LIBS=-lpcre2-8}])
- AC_CACHE_CHECK([for pcre_compile], [pcre_cv_have_pcre_compile],
+ AC_CACHE_CHECK([for pcre2_compile], [pcre_cv_have_pcre2_compile],
[pcre_saved_CFLAGS=$CFLAGS
pcre_saved_LIBS=$LIBS
CFLAGS="$CFLAGS $PCRE_CFLAGS"
LIBS="$PCRE_LIBS $LIBS"
AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <pcre.h>
+ [AC_LANG_PROGRAM([[#define PCRE2_CODE_UNIT_WIDTH 8
+ #include <pcre2.h>
]],
- [[pcre *p = pcre_compile (0, 0, 0, 0, 0);
+ [[pcre2_code *p = pcre2_compile (0, 0, 0, 0, 0, 0);
return !p;]])],
- [pcre_cv_have_pcre_compile=yes],
- [pcre_cv_have_pcre_compile=no])
+ [pcre_cv_have_pcre2_compile=yes],
+ [pcre_cv_have_pcre2_compile=no])
CFLAGS=$pcre_saved_CFLAGS
LIBS=$pcre_saved_LIBS])
- if test "$pcre_cv_have_pcre_compile" = yes; then
+ if test "$pcre_cv_have_pcre2_compile" = yes; then
use_pcre=yes
elif test $test_pcre = maybe; then
AC_MSG_WARN([AC_PACKAGE_NAME will be built without pcre support.])
@@ -50,7 +51,7 @@ AC_DEFUN([gl_FUNC_PCRE],
if test $use_pcre = yes; then
AC_DEFINE([HAVE_LIBPCRE], [1],
[Define to 1 if you have the Perl Compatible Regular Expressions
- library (-lpcre).])
+ library (-lpcre2).])
else
PCRE_CFLAGS=
PCRE_LIBS=
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 3bdaee9..a6a4bb0 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -17,93 +17,48 @@
02110-1301, USA. */
/* Written August 1992 by Mike Haertel. */
+/* Updated for PCRE2 by Carlo Arenas. */
#include <config.h>
#include "search.h"
#include "die.h"
-#include <pcre.h>
-
-/* This must be at least 2; everything after that is for performance
- in pcre_exec. */
-enum { NSUB = 300 };
-
-#ifndef PCRE_EXTRA_MATCH_LIMIT_RECURSION
-# define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
-#endif
-#ifndef PCRE_STUDY_JIT_COMPILE
-# define PCRE_STUDY_JIT_COMPILE 0
-#endif
-#ifndef PCRE_STUDY_EXTRA_NEEDED
-# define PCRE_STUDY_EXTRA_NEEDED 0
-#endif
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
struct pcre_comp
{
/* Compiled internal form of a Perl regular expression. */
- pcre *cre;
-
- /* Additional information about the pattern. */
- pcre_extra *extra;
-
-#if PCRE_STUDY_JIT_COMPILE
- /* The JIT stack and its maximum size. */
- pcre_jit_stack *jit_stack;
- int jit_stack_size;
-#endif
-
- /* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
- string matches when that flag is used. */
- int empty_match[2];
+ pcre2_code *cre;
+ pcre2_match_context *mcontext;
+ pcre2_match_data *data;
};
-
/* Match the already-compiled PCRE pattern against the data in SUBJECT,
of size SEARCH_BYTES and starting with offset SEARCH_OFFSET, with
- options OPTIONS, and storing resulting matches into SUB. Return
- the (nonnegative) match location or a (negative) error number. */
+ options OPTIONS.
+ Return the (nonnegative) match location or a (negative) error number. */
static int
jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes,
- int search_offset, int options, int *sub)
+ int search_offset, uint32_t options)
{
while (true)
{
- int e = pcre_exec (pc->cre, pc->extra, subject, search_bytes,
- search_offset, options, sub, NSUB);
+ int e = pcre2_match (pc->cre, (PCRE2_SPTR8)subject, search_bytes,
+ search_offset, options, pc->data, pc->mcontext);
-#if PCRE_STUDY_JIT_COMPILE
- if (e == PCRE_ERROR_JIT_STACKLIMIT
- && 0 < pc->jit_stack_size && pc->jit_stack_size <= INT_MAX / 2)
+ if (e == PCRE2_ERROR_JIT_STACKLIMIT && !pc->mcontext)
{
- int old_size = pc->jit_stack_size;
- int new_size = pc->jit_stack_size = old_size * 2;
- if (pc->jit_stack)
- pcre_jit_stack_free (pc->jit_stack);
- pc->jit_stack = pcre_jit_stack_alloc (old_size, new_size);
- if (!pc->jit_stack)
+ /* The PCRE documentation says that a 32 KiB stack is the default.
*/
+ pcre2_jit_stack *s = pcre2_jit_stack_create (64 << 10, INT_MAX / 2,
+ NULL);
+ pc->mcontext = pcre2_match_context_create (NULL);
+ if (!pc->mcontext || !s)
die (EXIT_TROUBLE, 0,
_("failed to allocate memory for the PCRE JIT stack"));
- pcre_assign_jit_stack (pc->extra, NULL, pc->jit_stack);
+ pcre2_jit_stack_assign (pc->mcontext, NULL, s);
continue;
}
-#endif
-
-#if PCRE_EXTRA_MATCH_LIMIT_RECURSION
- if (e == PCRE_ERROR_RECURSIONLIMIT
- && (PCRE_STUDY_EXTRA_NEEDED || pc->extra))
- {
- unsigned long lim
- = (pc->extra->flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION
- ? pc->extra->match_limit_recursion
- : 0);
- if (lim <= ULONG_MAX / 2)
- {
- pc->extra->match_limit_recursion = lim ? 2 * lim : (1 << 24) - 1;
- pc->extra->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
- continue;
- }
- }
-#endif
return e;
}
@@ -115,27 +70,26 @@ jit_exec (struct pcre_comp *pc, char const *subject, int
search_bytes,
void *
Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
{
- int e;
- char const *ep;
+ PCRE2_SIZE e;
+ int ec;
+ PCRE2_UCHAR8 ep[128];
static char const wprefix[] = "(?<!\\w)(?:";
static char const wsuffix[] = ")(?!\\w)";
- static char const xprefix[] = "^(?:";
- static char const xsuffix[] = ")$";
- int fix_len_max = MAX (sizeof wprefix - 1 + sizeof wsuffix - 1,
- sizeof xprefix - 1 + sizeof xsuffix - 1);
- char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4);
- int flags = PCRE_DOLLAR_ENDONLY | (match_icase ? PCRE_CASELESS : 0);
+ size_t fix_len_max = sizeof wprefix - 1 + sizeof wsuffix - 1;
+ unsigned char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4);
+ uint32_t flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0);
char *patlim = pattern + size;
- char *n = re;
+ char *n = (char *)re;
char const *p;
char const *pnul;
struct pcre_comp *pc = xcalloc (1, sizeof (*pc));
+ pcre2_compile_context *ccontext = NULL;
if (localeinfo.multibyte)
{
if (! localeinfo.using_utf8)
die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
- flags |= PCRE_UTF8;
+ flags |= PCRE2_UTF | PCRE2_NEVER_BACKSLASH_C | PCRE2_MATCH_INVALID_UTF;
}
/* FIXME: Remove this restriction. */
@@ -145,8 +99,6 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored,
bool exact)
*n = '\0';
if (match_words)
strcpy (n, wprefix);
- if (match_lines)
- strcpy (n, xprefix);
n += strlen (n);
/* The PCRE interface doesn't allow NUL bytes in the pattern, so
@@ -169,36 +121,32 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t
ignored, bool exact)
*patlim = '\n';
if (match_words)
- strcpy (n, wsuffix);
+ {
+ strcpy (n, wsuffix);
+ size += fix_len_max;
+ }
if (match_lines)
- strcpy (n, xsuffix);
+ {
+ ccontext = pcre2_compile_context_create(NULL);
+ uint32_t extra_options = PCRE2_EXTRA_MATCH_LINE;
- pc->cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
- if (!pc->cre)
- die (EXIT_TROUBLE, 0, "%s", ep);
+ pcre2_set_compile_extra_options(ccontext, extra_options);
+ }
- int pcre_study_flags = PCRE_STUDY_EXTRA_NEEDED | PCRE_STUDY_JIT_COMPILE;
- pc->extra = pcre_study (pc->cre, pcre_study_flags, &ep);
- if (ep)
+ pc->cre = pcre2_compile (re, size, flags, &ec, &e, ccontext);
+ if (!pc->cre) {
+ pcre2_get_error_message (ec, ep, 128);
die (EXIT_TROUBLE, 0, "%s", ep);
+ }
-#if PCRE_STUDY_JIT_COMPILE
- if (pcre_fullinfo (pc->cre, pc->extra, PCRE_INFO_JIT, &e))
- die (EXIT_TROUBLE, 0, _("internal error (should never happen)"));
+ pc->data = pcre2_match_data_create_from_pattern (pc->cre, NULL);
- /* The PCRE documentation says that a 32 KiB stack is the default. */
- if (e)
- pc->jit_stack_size = 32 << 10;
-#endif
+ ec = pcre2_jit_compile (pc->cre, PCRE2_JIT_COMPLETE);
+ if (ec && ec != PCRE2_ERROR_JIT_BADOPTION && ec != PCRE2_ERROR_NOMEMORY)
+ die (EXIT_TROUBLE, 0, _("JIT internal error: %d"), ec);
free (re);
- int sub[NSUB];
- pc->empty_match[false] = pcre_exec (pc->cre, pc->extra, "", 0, 0,
- PCRE_NOTBOL, sub, NSUB);
- pc->empty_match[true] = pcre_exec (pc->cre, pc->extra, "", 0, 0, 0, sub,
- NSUB);
-
return pc;
}
@@ -206,11 +154,11 @@ ptrdiff_t
Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
char const *start_ptr)
{
- int sub[NSUB];
+ PCRE2_SIZE *sub;
char const *p = start_ptr ? start_ptr : buf;
bool bol = p[-1] == eolbyte;
char const *line_start = buf;
- int e = PCRE_ERROR_NOMATCH;
+ int e = PCRE2_ERROR_NOMATCH;
char const *line_end;
struct pcre_comp *pc = vcp;
@@ -243,41 +191,25 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t
*match_size,
int search_offset = p - subject;
- /* Check for an empty match; this is faster than letting
- pcre_exec do it. */
- if (p == line_end)
- {
- sub[0] = sub[1] = search_offset;
- e = pc->empty_match[bol];
- break;
- }
-
int options = 0;
if (!bol)
- options |= PCRE_NOTBOL;
+ options |= PCRE2_NOTBOL;
e = jit_exec (pc, subject, line_end - subject, search_offset,
- options, sub);
- if (e != PCRE_ERROR_BADUTF8)
+ options);
+ if (e != PCRE2_ERROR_BADUTFOFFSET)
break;
- int valid_bytes = sub[0];
+
+ sub = pcre2_get_ovector_pointer(pc->data);
+ int valid_bytes = *sub;
if (search_offset <= valid_bytes)
{
/* Try to match the string before the encoding error. */
- if (valid_bytes == 0)
- {
- /* Handle the empty-match case specially, for speed.
- This optimization is valid if VALID_BYTES is zero,
- which means SEARCH_OFFSET is also zero. */
- sub[1] = 0;
- e = pc->empty_match[bol];
- }
- else
- e = jit_exec (pc, subject, valid_bytes, search_offset,
- options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub);
-
- if (e != PCRE_ERROR_NOMATCH)
+ e = jit_exec (pc, subject, valid_bytes, search_offset,
+ options | PCRE2_NO_UTF_CHECK | PCRE2_NOTEOL);
+
+ if (e != PCRE2_ERROR_NOMATCH)
break;
/* Treat the encoding error as data that cannot match. */
@@ -288,7 +220,7 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t
*match_size,
subject += valid_bytes + 1;
}
- if (e != PCRE_ERROR_NOMATCH)
+ if (e != PCRE2_ERROR_NOMATCH)
break;
bol = true;
p = subject = line_start = line_end + 1;
@@ -299,24 +231,31 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t
*match_size,
{
switch (e)
{
- case PCRE_ERROR_NOMATCH:
+ case PCRE2_ERROR_NOMATCH:
break;
- case PCRE_ERROR_NOMEMORY:
+ case PCRE2_ERROR_NOMEMORY:
die (EXIT_TROUBLE, 0, _("%s: memory exhausted"), input_filename ());
-#if PCRE_STUDY_JIT_COMPILE
- case PCRE_ERROR_JIT_STACKLIMIT:
+ case PCRE2_ERROR_JIT_STACKLIMIT:
die (EXIT_TROUBLE, 0, _("%s: exhausted PCRE JIT stack"),
input_filename ());
-#endif
- case PCRE_ERROR_MATCHLIMIT:
+ case PCRE2_ERROR_DEPTHLIMIT:
+ die (EXIT_TROUBLE, 0,
+ _("%s: exceeded PCRE's nested backtracking limit"),
+ input_filename ());
+
+ case PCRE2_ERROR_HEAPLIMIT:
+ die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's heap limit"),
+ input_filename ());
+
+ case PCRE2_ERROR_MATCHLIMIT:
die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's backtracking limit"),
input_filename ());
- case PCRE_ERROR_RECURSIONLIMIT:
- die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's recursion limit"),
+ case PCRE2_ERROR_RECURSELOOP:
+ die (EXIT_TROUBLE, 0, _("%s: PCRE detected recurse loop"),
input_filename ());
default:
@@ -332,8 +271,9 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t
*match_size,
}
else
{
- char const *matchbeg = subject + sub[0];
- char const *matchend = subject + sub[1];
+ sub = pcre2_get_ovector_pointer(pc->data);
+ char const *matchbeg = subject + *sub;
+ char const *matchend = subject + *(sub + 1);
char const *beg;
char const *end;
if (start_ptr)
diff --git a/tests/filename-lineno.pl b/tests/filename-lineno.pl
index 1e84b45..1ff3d6a 100755
--- a/tests/filename-lineno.pl
+++ b/tests/filename-lineno.pl
@@ -101,13 +101,13 @@ my @Tests =
],
['invalid-re-P-paren', '-P ")"', {EXIT=>2},
{ERR => $ENV{PCRE_WORKS} == 1
- ? "$prog: unmatched parentheses\n"
+ ? "$prog: unmatched closing parenthesis\n"
: $no_pcre
},
],
['invalid-re-P-star-paren', '-P "a.*)"', {EXIT=>2},
{ERR => $ENV{PCRE_WORKS} == 1
- ? "$prog: unmatched parentheses\n"
+ ? "$prog: unmatched closing parenthesis\n"
: $no_pcre
},
],
--
2.33.0.1155.gbdb71ac078
[RFC PATCH v2] pcre: migrate to pcre2,
Carlo Marcelo Arenas Belón <=