Clarifying parse_encode handling in make_regexp

bug-gawk
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Clarifying parse_encode handling in make_regexp

From:	Miguel Pineiro Jr.
Subject:	Clarifying parse_encode handling in make_regexp
Date:	Fri, 25 Aug 2023 22:41:04 -0400
User-agent:	Cyrus-JMAP/3.9.0-alpha0-647-g545049cfe6-fm-20230814.001-g545049cf
Hello everyone,

Here's a patch that clarifies the parse_escape handling in
make_regexp. It borrows heavily from make_str_node.

It compiles cleanly and make check passes all tests.

Take care,
Miguel


diff --git a/ChangeLog b/ChangeLog
index 88fedbe4..52f5f86d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,33 @@
+2023-08-25         Miguel Pineiro Jr     <mpj@pineiro.cc>
+
+       * awk.h (enum escape_results): Fix ESCAPE_LINE_CONINUATION typo.
+       * node.c (make_str_node): Ditto.
+       * (parse_escape): Ditto.
+       * re.c (make_regexp): Ditto.
+
+       Unrelated: Use size_t for multibyte character lengths (aside
+       from it being the type which the interfaces return, using it
+       avoids implementation-defined behavior when -1 or -2 is cast
+       to size_t then stored in an int).
+
+       * node.c (make_str_node): Change mblen from int to size_t.
+
+       Unrelated: Change the length of a translated escape sequence
+       from int to size_t (to match the underlying interfaces).
+
+       * awk.h (parse_escape): Change prototype.
+       * node.c (make_str_node): Adjust to accommodate new type.
+       * (parse_escape): Ditto.
+       * re.c (make_regexp): Ditto.
+
+       Unrelated: Clarify make_regexp, inspired by make_str_node.
+
+       * re.c (make_regexp): Copy multibyte characters at once, move
+       the escape sequence post-processing into the switch statement
+       to explicitly connect a return value with its handling, add a
+       default case to catch unrecognized values, and move ok_to_escape
+       next to where it's used (and only set it once).
+
 2023-08-21         Arnold D. Robbins     <arnold@skeeve.com>
 
        * node.c (make_str_node): Don't use N_() in cant_happen message.
diff --git a/awk.h b/awk.h
index 034f7a2e..0d1dc85b 100644
--- a/awk.h
+++ b/awk.h
@@ -298,7 +298,7 @@ enum escape_results {
        ESCAPE_OK,              // nbytes == 1 to MB_CUR_MAX: the length of the translated escape sequence
        ESCAPE_CONV_ERR,        // wcrtomb conversion error
        ESCAPE_TERM_BACKSLASH,  // terminal backslash (to be preserved in cmdline strings)
-       ESCAPE_LINE_CONINUATION // line continuation  (backslash-newline pair)
+       ESCAPE_LINE_CONTINUATION        // line continuation  (backslash-newline pair)
 };
 
 /* string hash table */
@@ -1748,7 +1748,7 @@ extern NODE *make_str_node(const char *s, size_t len, int 
flags);
 extern NODE *make_bool_node(bool value);
 extern NODE *make_typed_regex(const char *re, size_t len);
 extern void *more_blocks(int id);
-extern enum escape_results parse_escape(const char **string_ptr, const char **escseq, int *nbytes);
+extern enum escape_results parse_escape(const char **string_ptr, const char **escseq, size_t *nbytes);
 extern NODE *str2wstr(NODE *n, size_t **ptr);
 extern NODE *wstr2str(NODE *n);
 #define force_wstring(n)       str2wstr(n, NULL)
diff --git a/node.c b/node.c
index 91c28396..5de4e082 100644
--- a/node.c
+++ b/node.c
@@ -441,10 +441,15 @@ make_str_node(const char *s, size_t len, int flags)
                         * character happens to be a backslash.
                         */
                        if (gawk_mb_cur_max > 1) {
-                               int mblen = mbrlen(pf, end-pf, &cur_state);
+                               size_t mblen = mbrlen(pf, end-pf, &cur_state);
 
-                               if (mblen > 1) {
-                                       int i;
+                               /*
+                                * Incomplete (-2), invalid (-1), and
+                                * null (0) characters are excluded here.
+                                * They are read as a sequence of bytes.
+                                */
+                               if (mblen > 1 && mblen < (size_t) -2) {
+                                       size_t i;
 
                                        for (i = 0; i < mblen; i++)
                                                *ptm++ = *pf++;
@@ -455,7 +460,7 @@ make_str_node(const char *s, size_t len, int flags)
                        c = *pf++;
                        if (c == '\\') {
                                const char *result;
-                               int nbytes;
+                               size_t nbytes;
                                enum escape_results ret;
 
                                ret = parse_escape(& pf, & result, & nbytes);
@@ -473,12 +478,12 @@ make_str_node(const char *s, size_t len, int flags)
                                                lintwarn(_("backslash at end of 
string"));
                                        *ptm++ = '\\';
                                        break;
-                               case ESCAPE_LINE_CONINUATION:
+                               case ESCAPE_LINE_CONTINUATION:
                                        if (do_lint)
                                                lintwarn(_("backslash string 
continuation is not portable"));
                                        continue;
                                default:
-                                       cant_happen("received bad result %d from parse_escape(), nbytes = %d",
+                                       cant_happen("received bad result %d from parse_escape(), nbytes = %zu",
                                                        (int) ret, nbytes);
                                        break;
                                }
@@ -555,13 +560,13 @@ r_unref(NODE *tmp)
  *     ESCAPE_OK,              // nbytes == 1 to MB_CUR_MAX: the length of the 
translated escape sequence
  *     ESCAPE_CONV_ERR,        // wcrtomb conversion error
  *     ESCAPE_TERM_BACKSLASH,  // terminal backslash (to be preserved in 
cmdline strings)
- *     ESCAPE_LINE_CONINUATION // line continuation  (backslash-newline pair)
+ *     ESCAPE_LINE_CONTINUATION        // line continuation  
(backslash-newline pair)
  *
  * POSIX doesn't allow \x or \u.
  */
 
 enum escape_results
-parse_escape(const char **string_ptr, const char **result, int *nbytes)
+parse_escape(const char **string_ptr, const char **result, size_t *nbytes)
 {
        static char buf[MB_LEN_MAX];
        enum escape_results retval = ESCAPE_OK;
@@ -606,7 +611,7 @@ parse_escape(const char **string_ptr, const char **result, 
int *nbytes)
                buf[0] = '\v';
                break;
        case '\n':
-               retval = ESCAPE_LINE_CONINUATION;
+               retval = ESCAPE_LINE_CONTINUATION;
                break;
        case 0:
                (*string_ptr)--;
@@ -718,8 +723,7 @@ parse_escape(const char **string_ptr, const char **result, 
int *nbytes)
                        retval = ESCAPE_CONV_ERR;
                        *nbytes = 0;
                } else {
-                       /* MB_LEN_MAX is an int, so n fits */
-                       *nbytes = (int) n;
+                       *nbytes = n;
                }
                break;
        }
diff --git a/re.c b/re.c
index d22b66fa..616f7b5c 100644
--- a/re.c
+++ b/re.c
@@ -60,11 +60,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool 
dfa, bool canfatal)
                lintwarn(_("behavior of matching a regexp containing NUL 
characters is not defined by POSIX"));
        }
 
-       /*
-        * The number of bytes in the current multibyte character.
-        * It is 0, when the current character is a singlebyte character.
-        */
-       size_t is_multibyte = 0;
        mbstate_t mbs;
 
        memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize.  */
@@ -95,83 +90,70 @@ make_regexp(const char *s, size_t len, bool ignorecase, 
bool dfa, bool canfatal)
        dest = buf;
 
        while (src < end) {
-               if (gawk_mb_cur_max > 1 && ! is_multibyte) {
-                       /* The previous byte is a singlebyte character, or last 
byte
-                          of a multibyte character.  We check the next 
character.  */
-                       is_multibyte = mbrlen(src, end - src, &mbs);
-                       if (   is_multibyte == 1
-                           || is_multibyte == (size_t) -1
-                           || is_multibyte == (size_t) -2
-                           || is_multibyte == 0) {
-                               /* We treat it as a single-byte character.  */
-                               is_multibyte = 0;
+               /*
+                * Keep multibyte characters together. This avoids
+                * problems if a subsequent byte of a multibyte
+                * character happens to be a backslash.
+                */
+               if (gawk_mb_cur_max > 1) {
+                       size_t mblen = mbrlen(src, end - src, &mbs);
+
+                       /*
+                        * Incomplete (-2), invalid (-1), and
+                        * null (0) characters are excluded here.
+                        * They are read as a sequence of bytes.
+                        */
+                       if (mblen > 1 && mblen < (size_t) -2) {
+                               size_t i;
+
+                               for (i = 0; i < mblen; i++)
+                                       *dest++ = *src++;
+                               continue;
                        }
                }
 
-               const char *ok_to_escape;
-               if (do_posix)
-                       ok_to_escape = "{}()|*+?.^$\\[]/-";
-               else if (do_traditional)
-                       ok_to_escape = "()|*+?.^$\\[]/-";
-               else
-                       ok_to_escape = "<>`'BywWsS{}()|*+?.^$\\[]/-";
-
-               /* We skip multibyte character, since it must not be a special
-                  character.  */
-               if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
-                   (*src == '\\')) {
-                       c = *++src;
-                       switch (c) {
-                       case '\0':      /* \\ before \0, either dynamic data or 
real end of string */
-                               if (src >= s + len)
-                                       *dest++ = '\\'; // at end of string, 
will fatal below
-                               else
-                                       fatal(_("invalid NUL byte in dynamic 
regexp"));
-                               break;
-                       case 'a':
-                       case 'b':
-                       case 'f':
-                       case 'n':
-                       case 'r':
-                       case 't':
-                       case 'v':
-                       case 'x':
-                       case 'u':
-                       case '0':
-                       case '1':
-                       case '2':
-                       case '3':
-                       case '4':
-                       case '5':
-                       case '6':
-                       case '7':
-                       {
-                               const char *result;
-                               int nbytes;
-                               enum escape_results ret;
-
-                               ret = parse_escape(& src, & result, & nbytes);
-                               switch (ret) {
-                               case ESCAPE_OK:
-                               case ESCAPE_CONV_ERR:
-                                       break;
-                               case ESCAPE_TERM_BACKSLASH:
-                               case ESCAPE_LINE_CONINUATION:
-                                       cant_happen("received bad result %d 
from parse_escape(), nbytes = %d",
-                                                       (int) ret, nbytes);
-                                       break;
-                               }
-                               /*
-                                * Invalid code points produce '?' (0x3F).
-                                * These are quoted so that they're taken
-                                * literally. Unlike \u3F, a metachar.
-                                */
-                               if (nbytes == 0) {
-                                       *dest++ = '\\';
-                                       *dest++ = '?';
-                                       break;
-                               }
+               /*
+                * From here *src is a single byte character.
+                */
+               if (*src != '\\') {
+                       *dest++ = *src++;
+                       continue;
+               }
 
+               /* Escape sequence */
+               c = *++src;
+               switch (c) {
+               case '\0':      /* \\ before \0, either dynamic data or real end of string */
+                       if (src >= s + len)
+                               *dest++ = '\\'; // at end of string, will fatal below
+                       else
+                               fatal(_("invalid NUL byte in dynamic regexp"));
+                       break;
+               case 'a':
+               case 'b':
+               case 'f':
+               case 'n':
+               case 'r':
+               case 't':
+               case 'v':
+               case 'x':
+               case 'u':
+               case '0':
+               case '1':
+               case '2':
+               case '3':
+               case '4':
+               case '5':
+               case '6':
+               case '7':
+               {
+                       const char *result;
+                       size_t nbytes;
+                       enum escape_results ret;
+
+                       ret = parse_escape(& src, & result, & nbytes);
+                       switch (ret) {
+                       case ESCAPE_OK:
                                /*
                                 * Unix awk treats octal (and hex?) chars
                                 * literally in re's, so escape regexp
@@ -184,7 +166,8 @@ make_regexp(const char *s, size_t len, bool ignorecase, 
bool dfa, bool canfatal)
                                    && strchr("()|*+?.^$\\[]", *result) != NULL)
                                        *dest++ = '\\';
 
-                               if (do_lint
+                               if (nbytes == 1
+                                   && do_lint
                                    && ! nul_warned
                                    && *result == '\0') {
                                        nul_warned = true;
@@ -195,49 +178,85 @@ make_regexp(const char *s, size_t len, bool ignorecase, 
bool dfa, bool canfatal)
                                while (nbytes--)
                                        *dest++ = *result++;
                                break;
+                       case ESCAPE_CONV_ERR:
+                               /*
+                                * Invalid code points produce '?' (0x3F).
+                                * These are quoted so that they're taken
+                                * literally. Unlike \u3F, a metachar.
+                                */
+                               *dest++ = '\\';
+                               *dest++ = '?';
+                               break;
+                       default:
+                               /*
+                                * The outer switch handles terminal
+                                * backslashes and line continuations.
+                                * parse_escape should never see them
+                                * and therefore it should never return
+                                * ESCAPE_TERM_BACKSLASH nor
+                                * ESCAPE_LINE_CONTINUATION.
+                                *
+                                * This also catches unknown values.
+                                */
+                               cant_happen("received bad result %d from parse_escape(), nbytes = %zu",
+                                               (int) ret, nbytes);
+                       }
+                       break;
+               }
+               case '8':
+               case '9':       /* a\9b not valid */
+                       *dest++ = c;
+                       src++;
+               {
+                       static bool warned[2];
+
+                       if (! warned[c - '8']) {
+                               warning(_("regexp escape sequence `\\%c' 
treated as plain `%c'"), c, c);
+                               warned[c - '8'] = true;
                        }
-                       case '8':
-                       case '9':       /* a\9b not valid */
-                               *dest++ = c;
+               }
+                       break;
+               case 'y':       /* normally \b */
+                       /* gnu regex op */
+                       if (! do_traditional) {
+                               *dest++ = '\\';
+                               *dest++ = 'b';
                                src++;
-                       {
-                               static bool warned[2];
+                               break;
+                       }
+                       /* else, fall through */
+               default:
+                 {
+                       static const char *ok_to_escape = NULL;
 
-                               if (! warned[c - '8']) {
-                                       warning(_("regexp escape sequence 
`\\%c' treated as plain `%c'"), c, c);
-                                       warned[c - '8'] = true;
-                               }
+                       /*
+                        * The posix and traditional flags do not change
+                        * once the awk program is running. Therefore,
+                        * neither does ok_to_escape.
+                        */
+                       if (ok_to_escape == NULL) {
+                               if (do_posix)
+                                       ok_to_escape = "{}()|*+?.^$\\[]/-";
+                               else if (do_traditional)
+                                       ok_to_escape = "()|*+?.^$\\[]/-";
+                               else
+                                       ok_to_escape = "<>`'BywWsS{}()|*+?.^$\\[]/-";
                        }
-                               break;
-                       case 'y':       /* normally \b */
-                               /* gnu regex op */
-                               if (! do_traditional) {
-                                       *dest++ = '\\';
-                                       *dest++ = 'b';
-                                       src++;
-                                       break;
-                               }
-                               /* else, fall through */
-                       default:
-                               if (strchr(ok_to_escape, c) == NULL) {
-                                       static bool warned[256];
 
-                                       if (! warned[c & 0xFF]) {
-                                               warning(_("regexp escape 
sequence `\\%c' is not a known regexp operator"), c);
-                                               warned[c & 0xFF] = true;
-                                       }
+                       if (strchr(ok_to_escape, c) == NULL) {
+                               static bool warned[256];
+
+                               if (! warned[c & 0xFF]) {
+                                       warning(_("regexp escape sequence 
`\\%c' is not a known regexp operator"), c);
+                                       warned[c & 0xFF] = true;
                                }
-                               *dest++ = '\\';
-                               *dest++ = (char) c;
-                               src++;
-                               break;
-                       } /* switch */
-               } else {
-                       c = *src;
-                       *dest++ = *src++;       /* not '\\' */
-               }
-               if (gawk_mb_cur_max > 1 && is_multibyte)
-                       is_multibyte--;
+                       }
+                       *dest++ = '\\';
+                       *dest++ = (char) c;
+                       src++;
+                       break;
+                 }
+               } /* switch */
        } /* while */
 
        *dest = '\0';
[Prev in Thread]
Current Thread
[Next in Thread]
Clarifying parse_encode handling in make_regexp, Miguel Pineiro Jr. <=
Prev by Date: Better shell_quote() function for included shellquote.awk
Next by Date: Fwd: Better shell_quote() function for included shellquote.awk
Previous by thread: Better shell_quote() function for included shellquote.awk
Next by thread: gawk core dumped on too many input values
Index(es):
- Date
- Thread