[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[SCM] gawk branch, master, updated. gawk-4.1.0-5330-g5e4e756f
From: |
Arnold Robbins |
Subject: |
[SCM] gawk branch, master, updated. gawk-4.1.0-5330-g5e4e756f |
Date: |
Sun, 27 Aug 2023 15:01:58 -0400 (EDT) |
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gawk".
The branch, master, has been updated
via 5e4e756ffb4754911fa70fea0768933f52324625 (commit)
via 573e862d71e0604d0854219f7317d2ea51e41235 (commit)
from 76ca3d8634f9b33cfd2aaea91fe40b3ca6fe6db7 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://git.sv.gnu.org/cgit/gawk.git/commit/?id=5e4e756ffb4754911fa70fea0768933f52324625
commit 5e4e756ffb4754911fa70fea0768933f52324625
Author: Arnold D. Robbins <arnold@skeeve.com>
Date: Sun Aug 27 12:01:37 2023 -0700
Small fix in make_regexp.
diff --git a/ChangeLog b/ChangeLog
index 52f5f86d..2fb55e19 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2023-08-27 Arnold D. Robbins <arnold@skeeve.com>
+
+ * re.c (make_regexp): When do_traditional and looking to see
+ if an escape created a metacharacter, use `metas' instead of
+ a separate string, since BWK awk now also handles interval
+ expressions. Found by inspection.
+
2023-08-25 Miguel Pineiro Jr <mpj@pineiro.cc>
* awk.h (enum escape_results): Fix ESCAPE_LINE_CONINUATION typo.
diff --git a/re.c b/re.c
index 616f7b5c..678d0e52 100644
--- a/re.c
+++ b/re.c
@@ -163,7 +163,7 @@ make_regexp(const char *s, size_t len, bool ignorecase,
bool dfa, bool canfatal)
&& do_traditional
&& ! do_posix
&& (isdigit(c) || c == 'x')
- && strchr("()|*+?.^$\\[]", *result) != NULL)
+ && strchr(metas, *result) != NULL)
*dest++ = '\\';
if (nbytes == 1
http://git.sv.gnu.org/cgit/gawk.git/commit/?id=573e862d71e0604d0854219f7317d2ea51e41235
commit 573e862d71e0604d0854219f7317d2ea51e41235
Author: Arnold D. Robbins <arnold@skeeve.com>
Date: Sun Aug 27 11:58:46 2023 -0700
Continue cleaning up handling of escape sequences.
diff --git a/ChangeLog b/ChangeLog
index 88fedbe4..52f5f86d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,33 @@
+2023-08-25 Miguel Pineiro Jr <mpj@pineiro.cc>
+
+ * awk.h (enum escape_results): Fix ESCAPE_LINE_CONINUATION typo.
+ * node.c (make_str_node): Ditto.
+ * (parse_escape): Ditto.
+ * re.c (make_regexp): Ditto.
+
+ Unrelated: Use size_t for multibyte character lengths (aside
+ from it being the type which the interfaces return, using it
+ avoids implementation-defined behavior when -1 or -2 is cast
+ to size_t then stored in an int).
+
+ * node.c (make_str_node): Change mblen from int to size_t.
+
+ Unrelated: Change the length of a translated escape sequence
+ from int to size_t (to match the underlying interfaces).
+
+ * awk.h (parse_escape): Change prototype.
+ * node.c (make_str_node): Adjust to accommodate new type.
+ * (parse_escape): Ditto.
+ * re.c (make_regexp): Ditto.
+
+ Unrelated: Clarify make_regexp, inspired by make_str_node.
+
+ * re.c (make_regexp): Copy multibyte characters at once, move
+ the escape sequence post-processing into the switch statement
+ to explicitly connect a return value with its handling, add a
+ default case to catch unrecognized values, and move ok_to_escape
+ next to where it's used (and only set it once).
+
2023-08-21 Arnold D. Robbins <arnold@skeeve.com>
* node.c (make_str_node): Don't use N_() in cant_happen message.
diff --git a/awk.h b/awk.h
index 034f7a2e..0d1dc85b 100644
--- a/awk.h
+++ b/awk.h
@@ -298,7 +298,7 @@ enum escape_results {
ESCAPE_OK, // nbytes == 1 to MB_CUR_MAX: the length of the
translated escape sequence
ESCAPE_CONV_ERR, // wcrtomb conversion error
ESCAPE_TERM_BACKSLASH, // terminal backslash (to be preserved in
cmdline strings)
- ESCAPE_LINE_CONINUATION // line continuation (backslash-newline pair)
+ ESCAPE_LINE_CONTINUATION // line continuation
(backslash-newline pair)
};
/* string hash table */
@@ -1748,7 +1748,7 @@ extern NODE *make_str_node(const char *s, size_t len, int
flags);
extern NODE *make_bool_node(bool value);
extern NODE *make_typed_regex(const char *re, size_t len);
extern void *more_blocks(int id);
-extern enum escape_results parse_escape(const char **string_ptr, const char
**escseq, int *nbytes);
+extern enum escape_results parse_escape(const char **string_ptr, const char
**escseq, size_t *nbytes);
extern NODE *str2wstr(NODE *n, size_t **ptr);
extern NODE *wstr2str(NODE *n);
#define force_wstring(n) str2wstr(n, NULL)
diff --git a/node.c b/node.c
index 91c28396..5de4e082 100644
--- a/node.c
+++ b/node.c
@@ -441,10 +441,15 @@ make_str_node(const char *s, size_t len, int flags)
* character happens to be a backslash.
*/
if (gawk_mb_cur_max > 1) {
- int mblen = mbrlen(pf, end-pf, &cur_state);
+ size_t mblen = mbrlen(pf, end-pf, &cur_state);
- if (mblen > 1) {
- int i;
+ /*
+ * Incomplete (-2), invalid (-1), and
+ * null (0) characters are excluded here.
+ * They are read as a sequence of bytes.
+ */
+ if (mblen > 1 && mblen < (size_t) -2) {
+ size_t i;
for (i = 0; i < mblen; i++)
*ptm++ = *pf++;
@@ -455,7 +460,7 @@ make_str_node(const char *s, size_t len, int flags)
c = *pf++;
if (c == '\\') {
const char *result;
- int nbytes;
+ size_t nbytes;
enum escape_results ret;
ret = parse_escape(& pf, & result, & nbytes);
@@ -473,12 +478,12 @@ make_str_node(const char *s, size_t len, int flags)
lintwarn(_("backslash at end of
string"));
*ptm++ = '\\';
break;
- case ESCAPE_LINE_CONINUATION:
+ case ESCAPE_LINE_CONTINUATION:
if (do_lint)
lintwarn(_("backslash string
continuation is not portable"));
continue;
default:
- cant_happen("received bad result %d
from parse_escape(), nbytes = %d",
+ cant_happen("received bad result %d
from parse_escape(), nbytes = %zu",
(int) ret, nbytes);
break;
}
@@ -555,13 +560,13 @@ r_unref(NODE *tmp)
* ESCAPE_OK, // nbytes == 1 to MB_CUR_MAX: the length of the
translated escape sequence
* ESCAPE_CONV_ERR, // wcrtomb conversion error
* ESCAPE_TERM_BACKSLASH, // terminal backslash (to be preserved in
cmdline strings)
- * ESCAPE_LINE_CONINUATION // line continuation (backslash-newline pair)
+ * ESCAPE_LINE_CONTINUATION // line continuation
(backslash-newline pair)
*
* POSIX doesn't allow \x or \u.
*/
enum escape_results
-parse_escape(const char **string_ptr, const char **result, int *nbytes)
+parse_escape(const char **string_ptr, const char **result, size_t *nbytes)
{
static char buf[MB_LEN_MAX];
enum escape_results retval = ESCAPE_OK;
@@ -606,7 +611,7 @@ parse_escape(const char **string_ptr, const char **result,
int *nbytes)
buf[0] = '\v';
break;
case '\n':
- retval = ESCAPE_LINE_CONINUATION;
+ retval = ESCAPE_LINE_CONTINUATION;
break;
case 0:
(*string_ptr)--;
@@ -718,8 +723,7 @@ parse_escape(const char **string_ptr, const char **result,
int *nbytes)
retval = ESCAPE_CONV_ERR;
*nbytes = 0;
} else {
- /* MB_LEN_MAX is an int, so n fits */
- *nbytes = (int) n;
+ *nbytes = n;
}
break;
}
diff --git a/re.c b/re.c
index d22b66fa..616f7b5c 100644
--- a/re.c
+++ b/re.c
@@ -60,11 +60,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool
dfa, bool canfatal)
lintwarn(_("behavior of matching a regexp containing NUL
characters is not defined by POSIX"));
}
- /*
- * The number of bytes in the current multibyte character.
- * It is 0, when the current character is a singlebyte character.
- */
- size_t is_multibyte = 0;
mbstate_t mbs;
memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */
@@ -95,83 +90,70 @@ make_regexp(const char *s, size_t len, bool ignorecase,
bool dfa, bool canfatal)
dest = buf;
while (src < end) {
- if (gawk_mb_cur_max > 1 && ! is_multibyte) {
- /* The previous byte is a singlebyte character, or last
byte
- of a multibyte character. We check the next
character. */
- is_multibyte = mbrlen(src, end - src, &mbs);
- if ( is_multibyte == 1
- || is_multibyte == (size_t) -1
- || is_multibyte == (size_t) -2
- || is_multibyte == 0) {
- /* We treat it as a single-byte character. */
- is_multibyte = 0;
+ /*
+ * Keep multibyte characters together. This avoids
+ * problems if a subsequent byte of a multibyte
+ * character happens to be a backslash.
+ */
+ if (gawk_mb_cur_max > 1) {
+ size_t mblen = mbrlen(src, end - src, &mbs);
+
+ /*
+ * Incomplete (-2), invalid (-1), and
+ * null (0) characters are excluded here.
+ * They are read as a sequence of bytes.
+ */
+ if (mblen > 1 && mblen < (size_t) -2) {
+ size_t i;
+
+ for (i = 0; i < mblen; i++)
+ *dest++ = *src++;
+ continue;
}
}
- const char *ok_to_escape;
- if (do_posix)
- ok_to_escape = "{}()|*+?.^$\\[]/-";
- else if (do_traditional)
- ok_to_escape = "()|*+?.^$\\[]/-";
- else
- ok_to_escape = "<>`'BywWsS{}()|*+?.^$\\[]/-";
-
- /* We skip multibyte character, since it must not be a special
- character. */
- if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
- (*src == '\\')) {
- c = *++src;
- switch (c) {
- case '\0': /* \\ before \0, either dynamic data or
real end of string */
- if (src >= s + len)
- *dest++ = '\\'; // at end of string,
will fatal below
- else
- fatal(_("invalid NUL byte in dynamic
regexp"));
- break;
- case 'a':
- case 'b':
- case 'f':
- case 'n':
- case 'r':
- case 't':
- case 'v':
- case 'x':
- case 'u':
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- {
- const char *result;
- int nbytes;
- enum escape_results ret;
-
- ret = parse_escape(& src, & result, & nbytes);
- switch (ret) {
- case ESCAPE_OK:
- case ESCAPE_CONV_ERR:
- break;
- case ESCAPE_TERM_BACKSLASH:
- case ESCAPE_LINE_CONINUATION:
- cant_happen("received bad result %d
from parse_escape(), nbytes = %d",
- (int) ret, nbytes);
- break;
- }
- /*
- * Invalid code points produce '?' (0x3F).
- * These are quoted so that they're taken
- * literally. Unlike \u3F, a metachar.
- */
- if (nbytes == 0) {
- *dest++ = '\\';
- *dest++ = '?';
- break;
- }
+ /*
+ * From here *src is a single byte character.
+ */
+ if (*src != '\\') {
+ *dest++ = *src++;
+ continue;
+ }
+ /* Escape sequence */
+ c = *++src;
+ switch (c) {
+ case '\0': /* \\ before \0, either dynamic data or real
end of string */
+ if (src >= s + len)
+ *dest++ = '\\'; // at end of string, will fatal
below
+ else
+ fatal(_("invalid NUL byte in dynamic regexp"));
+ break;
+ case 'a':
+ case 'b':
+ case 'f':
+ case 'n':
+ case 'r':
+ case 't':
+ case 'v':
+ case 'x':
+ case 'u':
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ {
+ const char *result;
+ size_t nbytes;
+ enum escape_results ret;
+
+ ret = parse_escape(& src, & result, & nbytes);
+ switch (ret) {
+ case ESCAPE_OK:
/*
* Unix awk treats octal (and hex?) chars
* literally in re's, so escape regexp
@@ -184,7 +166,8 @@ make_regexp(const char *s, size_t len, bool ignorecase,
bool dfa, bool canfatal)
&& strchr("()|*+?.^$\\[]", *result) != NULL)
*dest++ = '\\';
- if (do_lint
+ if (nbytes == 1
+ && do_lint
&& ! nul_warned
&& *result == '\0') {
nul_warned = true;
@@ -195,49 +178,85 @@ make_regexp(const char *s, size_t len, bool ignorecase,
bool dfa, bool canfatal)
while (nbytes--)
*dest++ = *result++;
break;
+ case ESCAPE_CONV_ERR:
+ /*
+ * Invalid code points produce '?' (0x3F).
+ * These are quoted so that they're taken
+ * literally. Unlike \u3F, a metachar.
+ */
+ *dest++ = '\\';
+ *dest++ = '?';
+ break;
+ default:
+ /*
+ * The outer switch handles terminal
+ * backslashes and line continuations.
+ * parse_escape should never see them
+ * and therefore it should never return
+ * ESCAPE_TERM_BACKSLASH nor
+ * ESCAPE_LINE_CONTINUATION.
+ *
+ * This also catches unknown values.
+ */
+ cant_happen("received bad result %d from
parse_escape(), nbytes = %zu",
+ (int) ret, nbytes);
+ }
+ break;
+ }
+ case '8':
+ case '9': /* a\9b not valid */
+ *dest++ = c;
+ src++;
+ {
+ static bool warned[2];
+
+ if (! warned[c - '8']) {
+ warning(_("regexp escape sequence `\\%c'
treated as plain `%c'"), c, c);
+ warned[c - '8'] = true;
}
- case '8':
- case '9': /* a\9b not valid */
- *dest++ = c;
+ }
+ break;
+ case 'y': /* normally \b */
+ /* gnu regex op */
+ if (! do_traditional) {
+ *dest++ = '\\';
+ *dest++ = 'b';
src++;
- {
- static bool warned[2];
+ break;
+ }
+ /* else, fall through */
+ default:
+ {
+ static const char *ok_to_escape = NULL;
- if (! warned[c - '8']) {
- warning(_("regexp escape sequence
`\\%c' treated as plain `%c'"), c, c);
- warned[c - '8'] = true;
- }
+ /*
+ * The posix and traditional flags do not change
+ * once the awk program is running. Therefore,
+ * neither does ok_to_escape.
+ */
+ if (ok_to_escape == NULL) {
+ if (do_posix)
+ ok_to_escape = "{}()|*+?.^$\\[]/-";
+ else if (do_traditional)
+ ok_to_escape = "()|*+?.^$\\[]/-";
+ else
+ ok_to_escape =
"<>`'BywWsS{}()|*+?.^$\\[]/-";
}
- break;
- case 'y': /* normally \b */
- /* gnu regex op */
- if (! do_traditional) {
- *dest++ = '\\';
- *dest++ = 'b';
- src++;
- break;
- }
- /* else, fall through */
- default:
- if (strchr(ok_to_escape, c) == NULL) {
- static bool warned[256];
- if (! warned[c & 0xFF]) {
- warning(_("regexp escape
sequence `\\%c' is not a known regexp operator"), c);
- warned[c & 0xFF] = true;
- }
+ if (strchr(ok_to_escape, c) == NULL) {
+ static bool warned[256];
+
+ if (! warned[c & 0xFF]) {
+ warning(_("regexp escape sequence
`\\%c' is not a known regexp operator"), c);
+ warned[c & 0xFF] = true;
}
- *dest++ = '\\';
- *dest++ = (char) c;
- src++;
- break;
- } /* switch */
- } else {
- c = *src;
- *dest++ = *src++; /* not '\\' */
- }
- if (gawk_mb_cur_max > 1 && is_multibyte)
- is_multibyte--;
+ }
+ *dest++ = '\\';
+ *dest++ = (char) c;
+ src++;
+ break;
+ }
+ } /* switch */
} /* while */
*dest = '\0';
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 37 +++++++++
awk.h | 4 +-
node.c | 26 ++++---
re.c | 253 +++++++++++++++++++++++++++++++++-----------------------------
4 files changed, 190 insertions(+), 130 deletions(-)
hooks/post-receive
--
gawk
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [SCM] gawk branch, master, updated. gawk-4.1.0-5330-g5e4e756f,
Arnold Robbins <=