[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Clarifying parse_encode handling in make_regexp
From: |
Miguel Pineiro Jr. |
Subject: |
Clarifying parse_encode handling in make_regexp |
Date: |
Fri, 25 Aug 2023 22:41:04 -0400 |
User-agent: |
Cyrus-JMAP/3.9.0-alpha0-647-g545049cfe6-fm-20230814.001-g545049cf |
Hello everyone,
Here's a patch that clarifies the parse_encode handling in
make_regexp. It borrows heavily from make_str_node.
It compiles cleanly and make check passes all tests.
Take care,
Miguel
diff --git a/ChangeLog b/ChangeLog
index 88fedbe4..52f5f86d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,33 @@
+2023-08-25 Miguel Pineiro Jr <mpj@pineiro.cc>
+
+ * awk.h (enum escape_results): Fix ESCAPE_LINE_CONINUATION typo.
+ * node.c (make_str_node): Ditto.
+ * (parse_escape): Ditto.
+ * re.c (make_regexp): Ditto.
+
+ Unrelated: Use size_t for multibyte character lengths (aside
+ from it being the type which the interfaces return, using it
+ avoids implementation-defined behavior when -1 or -2 is cast
+ to size_t then stored in an int).
+
+ * node.c (make_str_node): Change mblen from int to size_t.
+
+ Unrelated: Change the length of a translated escape sequence
+ from int to size_t (to match the underlying interfaces).
+
+ * awk.h (parse_escape): Change prototype.
+ * node.c (make_str_node): Adjust to accomodate new type.
+ * (parse_escape): Ditto.
+ * re.c (make_regexp): Ditto.
+
+ Unrelated: Clarify make_regexp, inspired by make_str_node.
+
+ * re.c (make_regexp): Copy multibyte characters at once, move
+ the escape sequence post-processing into the switch statement
+ to explicitly connect a return value with its handling, add a
+ default case to catch unrecognized values, and move ok_to_escape
+ next to where it's used (and only set it once).
+
2023-08-21 Arnold D. Robbins <arnold@skeeve.com>
* node.c (make_str_node): Don't use N_() in cant_happen message.
diff --git a/awk.h b/awk.h
index 034f7a2e..0d1dc85b 100644
--- a/awk.h
+++ b/awk.h
@@ -298,7 +298,7 @@ enum escape_results {
ESCAPE_OK, // nbytes == 1 to MB_CUR_MAX: the length of the
translated escape sequence
ESCAPE_CONV_ERR, // wcrtomb conversion error
ESCAPE_TERM_BACKSLASH, // terminal backslash (to be preserved in
cmdline strings)
- ESCAPE_LINE_CONINUATION // line continuation (backslash-newline pair)
+ ESCAPE_LINE_CONTINUATION // line continuation
(backslash-newline pair)
};
/* string hash table */
@@ -1748,7 +1748,7 @@ extern NODE *make_str_node(const char *s, size_t len, int
flags);
extern NODE *make_bool_node(bool value);
extern NODE *make_typed_regex(const char *re, size_t len);
extern void *more_blocks(int id);
-extern enum escape_results parse_escape(const char **string_ptr, const char
**escseq, int *nbytes);
+extern enum escape_results parse_escape(const char **string_ptr, const char
**escseq, size_t *nbytes);
extern NODE *str2wstr(NODE *n, size_t **ptr);
extern NODE *wstr2str(NODE *n);
#define force_wstring(n) str2wstr(n, NULL)
diff --git a/node.c b/node.c
index 91c28396..5de4e082 100644
--- a/node.c
+++ b/node.c
@@ -441,10 +441,15 @@ make_str_node(const char *s, size_t len, int flags)
* character happens to be a backslash.
*/
if (gawk_mb_cur_max > 1) {
- int mblen = mbrlen(pf, end-pf, &cur_state);
+ size_t mblen = mbrlen(pf, end-pf, &cur_state);
- if (mblen > 1) {
- int i;
+ /*
+ * Incomplete (-2), invalid (-1), and
+ * null (0) characters are excluded here.
+ * They are read as a sequence of bytes.
+ */
+ if (mblen > 1 && mblen < (size_t) -2) {
+ size_t i;
for (i = 0; i < mblen; i++)
*ptm++ = *pf++;
@@ -455,7 +460,7 @@ make_str_node(const char *s, size_t len, int flags)
c = *pf++;
if (c == '\\') {
const char *result;
- int nbytes;
+ size_t nbytes;
enum escape_results ret;
ret = parse_escape(& pf, & result, & nbytes);
@@ -473,12 +478,12 @@ make_str_node(const char *s, size_t len, int flags)
lintwarn(_("backslash at end of
string"));
*ptm++ = '\\';
break;
- case ESCAPE_LINE_CONINUATION:
+ case ESCAPE_LINE_CONTINUATION:
if (do_lint)
lintwarn(_("backslash string
continuation is not portable"));
continue;
default:
- cant_happen("received bad result %d
from parse_escape(), nbytes = %d",
+ cant_happen("received bad result %d
from parse_escape(), nbytes = %zu",
(int) ret, nbytes);
break;
}
@@ -555,13 +560,13 @@ r_unref(NODE *tmp)
* ESCAPE_OK, // nbytes == 1 to MB_CUR_MAX: the length of the
translated escape sequence
* ESCAPE_CONV_ERR, // wcrtomb conversion error
* ESCAPE_TERM_BACKSLASH, // terminal backslash (to be preserved in
cmdline strings)
- * ESCAPE_LINE_CONINUATION // line continuation (backslash-newline pair)
+ * ESCAPE_LINE_CONTINUATION // line continuation
(backslash-newline pair)
*
* POSIX doesn't allow \x or \u.
*/
enum escape_results
-parse_escape(const char **string_ptr, const char **result, int *nbytes)
+parse_escape(const char **string_ptr, const char **result, size_t *nbytes)
{
static char buf[MB_LEN_MAX];
enum escape_results retval = ESCAPE_OK;
@@ -606,7 +611,7 @@ parse_escape(const char **string_ptr, const char **result,
int *nbytes)
buf[0] = '\v';
break;
case '\n':
- retval = ESCAPE_LINE_CONINUATION;
+ retval = ESCAPE_LINE_CONTINUATION;
break;
case 0:
(*string_ptr)--;
@@ -718,8 +723,7 @@ parse_escape(const char **string_ptr, const char **result,
int *nbytes)
retval = ESCAPE_CONV_ERR;
*nbytes = 0;
} else {
- /* MB_LEN_MAX is an int, so n fits */
- *nbytes = (int) n;
+ *nbytes = n;
}
break;
}
diff --git a/re.c b/re.c
index d22b66fa..616f7b5c 100644
--- a/re.c
+++ b/re.c
@@ -60,11 +60,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool
dfa, bool canfatal)
lintwarn(_("behavior of matching a regexp containing NUL
characters is not defined by POSIX"));
}
- /*
- * The number of bytes in the current multibyte character.
- * It is 0, when the current character is a singlebyte character.
- */
- size_t is_multibyte = 0;
mbstate_t mbs;
memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */
@@ -95,83 +90,70 @@ make_regexp(const char *s, size_t len, bool ignorecase,
bool dfa, bool canfatal)
dest = buf;
while (src < end) {
- if (gawk_mb_cur_max > 1 && ! is_multibyte) {
- /* The previous byte is a singlebyte character, or last
byte
- of a multibyte character. We check the next
character. */
- is_multibyte = mbrlen(src, end - src, &mbs);
- if ( is_multibyte == 1
- || is_multibyte == (size_t) -1
- || is_multibyte == (size_t) -2
- || is_multibyte == 0) {
- /* We treat it as a single-byte character. */
- is_multibyte = 0;
+ /*
+ * Keep multibyte characters together. This avoids
+ * problems if a subsequent byte of a multibyte
+ * character happens to be a backslash.
+ */
+ if (gawk_mb_cur_max > 1) {
+ size_t mblen = mbrlen(src, end - src, &mbs);
+
+ /*
+ * Incomplete (-2), invalid (-1), and
+ * null (0) characters are excluded here.
+ * They are read as a sequence of bytes.
+ */
+ if (mblen > 1 && mblen < (size_t) -2) {
+ size_t i;
+
+ for (i = 0; i < mblen; i++)
+ *dest++ = *src++;
+ continue;
}
}
- const char *ok_to_escape;
- if (do_posix)
- ok_to_escape = "{}()|*+?.^$\\[]/-";
- else if (do_traditional)
- ok_to_escape = "()|*+?.^$\\[]/-";
- else
- ok_to_escape = "<>`'BywWsS{}()|*+?.^$\\[]/-";
-
- /* We skip multibyte character, since it must not be a special
- character. */
- if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
- (*src == '\\')) {
- c = *++src;
- switch (c) {
- case '\0': /* \\ before \0, either dynamic data or
real end of string */
- if (src >= s + len)
- *dest++ = '\\'; // at end of string,
will fatal below
- else
- fatal(_("invalid NUL byte in dynamic
regexp"));
- break;
- case 'a':
- case 'b':
- case 'f':
- case 'n':
- case 'r':
- case 't':
- case 'v':
- case 'x':
- case 'u':
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- {
- const char *result;
- int nbytes;
- enum escape_results ret;
-
- ret = parse_escape(& src, & result, & nbytes);
- switch (ret) {
- case ESCAPE_OK:
- case ESCAPE_CONV_ERR:
- break;
- case ESCAPE_TERM_BACKSLASH:
- case ESCAPE_LINE_CONINUATION:
- cant_happen("received bad result %d
from parse_escape(), nbytes = %d",
- (int) ret, nbytes);
- break;
- }
- /*
- * Invalid code points produce '?' (0x3F).
- * These are quoted so that they're taken
- * literally. Unlike \u3F, a metachar.
- */
- if (nbytes == 0) {
- *dest++ = '\\';
- *dest++ = '?';
- break;
- }
+ /*
+ * From here *src is a single byte character.
+ */
+ if (*src != '\\') {
+ *dest++ = *src++;
+ continue;
+ }
+ /* Escape sequence */
+ c = *++src;
+ switch (c) {
+ case '\0': /* \\ before \0, either dynamic data or real
end of string */
+ if (src >= s + len)
+ *dest++ = '\\'; // at end of string, will fatal
below
+ else
+ fatal(_("invalid NUL byte in dynamic regexp"));
+ break;
+ case 'a':
+ case 'b':
+ case 'f':
+ case 'n':
+ case 'r':
+ case 't':
+ case 'v':
+ case 'x':
+ case 'u':
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ {
+ const char *result;
+ size_t nbytes;
+ enum escape_results ret;
+
+ ret = parse_escape(& src, & result, & nbytes);
+ switch (ret) {
+ case ESCAPE_OK:
/*
* Unix awk treats octal (and hex?) chars
* literally in re's, so escape regexp
@@ -184,7 +166,8 @@ make_regexp(const char *s, size_t len, bool ignorecase,
bool dfa, bool canfatal)
&& strchr("()|*+?.^$\\[]", *result) != NULL)
*dest++ = '\\';
- if (do_lint
+ if (nbytes == 1
+ && do_lint
&& ! nul_warned
&& *result == '\0') {
nul_warned = true;
@@ -195,49 +178,85 @@ make_regexp(const char *s, size_t len, bool ignorecase,
bool dfa, bool canfatal)
while (nbytes--)
*dest++ = *result++;
break;
+ case ESCAPE_CONV_ERR:
+ /*
+ * Invalid code points produce '?' (0x3F).
+ * These are quoted so that they're taken
+ * literally. Unlike \u3F, a metachar.
+ */
+ *dest++ = '\\';
+ *dest++ = '?';
+ break;
+ default:
+ /*
+ * The outer switch handles terminal
+ * backslashes and line continuations.
+ * parse_escape should never see them
+ * and therefore it should never return
+ * ESCAPE_TERM_BACKSLASH nor
+ * ESCAPE_LINE_CONTINUATION.
+ *
+ * This also catches unknown values.
+ */
+ cant_happen("received bad result %d from
parse_escape(), nbytes = %zu",
+ (int) ret, nbytes);
+ }
+ break;
+ }
+ case '8':
+ case '9': /* a\9b not valid */
+ *dest++ = c;
+ src++;
+ {
+ static bool warned[2];
+
+ if (! warned[c - '8']) {
+ warning(_("regexp escape sequence `\\%c'
treated as plain `%c'"), c, c);
+ warned[c - '8'] = true;
}
- case '8':
- case '9': /* a\9b not valid */
- *dest++ = c;
+ }
+ break;
+ case 'y': /* normally \b */
+ /* gnu regex op */
+ if (! do_traditional) {
+ *dest++ = '\\';
+ *dest++ = 'b';
src++;
- {
- static bool warned[2];
+ break;
+ }
+ /* else, fall through */
+ default:
+ {
+ static const char *ok_to_escape = NULL;
- if (! warned[c - '8']) {
- warning(_("regexp escape sequence
`\\%c' treated as plain `%c'"), c, c);
- warned[c - '8'] = true;
- }
+ /*
+ * The posix and traditional flags do not change
+ * once the awk program is running. Therefore,
+ * neither does ok_to_escape.
+ */
+ if (ok_to_escape == NULL) {
+ if (do_posix)
+ ok_to_escape = "{}()|*+?.^$\\[]/-";
+ else if (do_traditional)
+ ok_to_escape = "()|*+?.^$\\[]/-";
+ else
+ ok_to_escape =
"<>`'BywWsS{}()|*+?.^$\\[]/-";
}
- break;
- case 'y': /* normally \b */
- /* gnu regex op */
- if (! do_traditional) {
- *dest++ = '\\';
- *dest++ = 'b';
- src++;
- break;
- }
- /* else, fall through */
- default:
- if (strchr(ok_to_escape, c) == NULL) {
- static bool warned[256];
- if (! warned[c & 0xFF]) {
- warning(_("regexp escape
sequence `\\%c' is not a known regexp operator"), c);
- warned[c & 0xFF] = true;
- }
+ if (strchr(ok_to_escape, c) == NULL) {
+ static bool warned[256];
+
+ if (! warned[c & 0xFF]) {
+ warning(_("regexp escape sequence
`\\%c' is not a known regexp operator"), c);
+ warned[c & 0xFF] = true;
}
- *dest++ = '\\';
- *dest++ = (char) c;
- src++;
- break;
- } /* switch */
- } else {
- c = *src;
- *dest++ = *src++; /* not '\\' */
- }
- if (gawk_mb_cur_max > 1 && is_multibyte)
- is_multibyte--;
+ }
+ *dest++ = '\\';
+ *dest++ = (char) c;
+ src++;
+ break;
+ }
+ } /* switch */
} /* while */
*dest = '\0';
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- Clarifying parse_encode handling in make_regexp,
Miguel Pineiro Jr. <=