guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] 03/05: Simplify decoding error handling


From: Andy Wingo
Subject: [Guile-commits] 03/05: Simplify decoding error handling
Date: Tue, 10 May 2016 10:51:24 +0000 (UTC)

wingo pushed a commit to branch wip-port-refactor
in repository guile.

commit 08c67dbef87e343de19eb744b076d24b31f0508c
Author: Andy Wingo <address@hidden>
Date:   Tue May 10 12:09:30 2016 +0200

    Simplify decoding error handling
    
    * libguile/ports.c (peek_utf8_codepoint, peek_latin1_codepoint)
      (peek_iconv_codepoint, peek_codepoint): Refactor to push error
      handling to the leaves, where errors happen.  Just return
      the (possibly substituted) codepoint, without an error code; if
      there's really an error, we should raise it.
      (scm_getc, scm_peek_char): Adapt.
---
 libguile/ports.c |  201 ++++++++++++++++++++++++------------------------------
 1 file changed, 88 insertions(+), 113 deletions(-)

diff --git a/libguile/ports.c b/libguile/ports.c
index 6b9c4f5..bbe3867 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1598,27 +1598,27 @@ utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
   return codepoint;
 }
 
-/* Peek a UTF-8 sequence from PORT.  On success, return 0, set
-   *CODEPOINT to the codepoint that was read, and set *LEN to the length
-   in bytes.  Return `EILSEQ' on error, setting *LEN to the shortest
-   prefix that cannot begin a valid UTF-8 sequence.  */
-static int
-peek_utf8_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
+/* Peek a UTF-8 sequence from PORT.  On success, return the codepoint
+   that was read, and set *LEN to the length in bytes.  If there was a
+   decoding error and the port conversion strategy was `substitute',
+   then return #\? and set *LEN to the length of the shortest prefix
+   that cannot begin a valid UTF-8 sequence.  Otherwise signal an
+   error.  */
+static scm_t_wchar
+peek_utf8_codepoint (SCM port, size_t *len)
 {
+#define DECODING_ERROR(bytes) \
+  do { *len = bytes; goto decoding_error; } while (0)
+#define RETURN(bytes, codepoint) \
+  do { *len = bytes; return codepoint; } while (0)
+
   int first_byte;
 
   first_byte = peek_byte_or_eof (port);
   if (first_byte == EOF)
-    {
-      *codepoint = EOF;
-      return 0;
-    }
+    RETURN (0, EOF);
   else if (first_byte < 0x80)
-    {
-      *codepoint = first_byte;
-      *len = 1;
-      return 0;
-    }
+    RETURN (1, first_byte);
   else if (first_byte >= 0xc2 && first_byte <= 0xdf)
     {
       SCM read_buf = scm_fill_input (port, 2);
@@ -1626,14 +1626,9 @@ peek_utf8_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
       const scm_t_uint8 *ptr = scm_port_buffer_take_pointer (read_buf);
 
       if (can_take < 2 || (ptr[1] & 0xc0) != 0x80)
-        {
-          *len = 1;
-          return EILSEQ;
-        }
+        DECODING_ERROR (1);
 
-      *codepoint = (first_byte & 0x1f) << 6UL | (ptr[1] & 0x3f);
-      *len = 2;
-      return 0;
+      RETURN (2, (first_byte & 0x1f) << 6UL | (ptr[1] & 0x3f));
     }
   else if ((first_byte & 0xf0) == 0xe0)
     {
@@ -1644,22 +1639,15 @@ peek_utf8_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
       if (can_take < 2 || (ptr[1] & 0xc0) != 0x80
           || (ptr[0] == 0xe0 && ptr[1] < 0xa0)
           || (ptr[0] == 0xed && ptr[1] > 0x9f))
-        {
-          *len = 1;
-          return EILSEQ;
-        }
+        DECODING_ERROR (1);
 
       if (can_take < 3 || (ptr[2] & 0xc0) != 0x80)
-        {
-          *len = 2;
-          return EILSEQ;
-        }
+        DECODING_ERROR (2);
 
-      *codepoint = ((scm_t_wchar) ptr[0] & 0x0f) << 12UL
-       | ((scm_t_wchar) ptr[1] & 0x3f) << 6UL
-       | (ptr[2] & 0x3f);
-      *len = 3;
-      return 0;
+      RETURN (3,
+              ((scm_t_wchar) ptr[0] & 0x0f) << 12UL
+              | ((scm_t_wchar) ptr[1] & 0x3f) << 6UL
+              | (ptr[2] & 0x3f));
     }
   else if (first_byte >= 0xf0 && first_byte <= 0xf4)
     {
@@ -1670,56 +1658,55 @@ peek_utf8_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
       if (can_take < 2 || (ptr[1] & 0xc0) != 0x80
           || (ptr[0] == 0xf0 && ptr[1] < 0x90)
           || (ptr[0] == 0xf4 && ptr[1] > 0x8f))
-        {
-          *len = 1;
-          return EILSEQ;
-        }
+        DECODING_ERROR (1);
 
       if (can_take < 3 || (ptr[2] & 0xc0) != 0x80)
-        {
-          *len = 2;
-          return EILSEQ;
-        }
+        DECODING_ERROR (2);
 
       if (can_take < 4 || (ptr[3] & 0xc0) != 0x80)
-        {
-          *len = 3;
-          return EILSEQ;
-        }
+        DECODING_ERROR (3);
 
-      *codepoint = ((scm_t_wchar) ptr[0] & 0x07) << 18UL
-       | ((scm_t_wchar) ptr[1] & 0x3f) << 12UL
-       | ((scm_t_wchar) ptr[2] & 0x3f) << 6UL
-       | (ptr[3] & 0x3f);
-      *len = 4;
-      return 0;
+      RETURN (4,
+              ((scm_t_wchar) ptr[0] & 0x07) << 18UL
+              | ((scm_t_wchar) ptr[1] & 0x3f) << 12UL
+              | ((scm_t_wchar) ptr[2] & 0x3f) << 6UL
+              | (ptr[3] & 0x3f));
     }
   else
-    {
-      *len = 1;
-      return EILSEQ;
-    }
+    DECODING_ERROR (1);
+
+ decoding_error:
+  if (scm_is_eq (SCM_PTAB_ENTRY (port)->conversion_strategy, sym_substitute))
+    /* *len already set.  */
+    return '?';
+
+  scm_decoding_error ("peek-char", EILSEQ, "input decoding error", port);
+  /* Not reached.  */
+  return 0;
+#undef DECODING_ERROR
+#undef RETURN
 }
 
 /* Peek an ISO-8859-1 codepoint (a byte) from PORT.  On success, return
-   0, set *CODEPOINT to the codepoint that was peeked, and set *LEN to
-   the length in bytes.  No encoding error is possible.  */
-static int
-peek_latin1_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
+   the codepoint, and set *LEN to 1.  Otherwise on EOF set *LEN to 0.  */
+static scm_t_wchar
+peek_latin1_codepoint (SCM port, size_t *len)
 {
-  *codepoint = peek_byte_or_eof (port);
-  if (*codepoint == EOF)
-    *len = 0;
-  else
-    *len = 1;
-  return 0;
+  scm_t_wchar ret = peek_byte_or_eof (port);
+
+  *len = ret == EOF ? 0 : 1;
+
+  return ret;
 }
 
 /* Peek a codepoint from PORT, decoding it through iconv.  On success,
-   return 0, set *CODEPOINT to the codepoint that was peeked, and set
-   *LEN to the length in bytes.  Return `EILSEQ' on decoding error.  */
-static int
-peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
+   return the codepoint and set *LEN to the length in bytes.  If there
+   was a decoding error and the port conversion strategy was
+   `substitute', then return #\? and set *LEN to the number of input
+   bytes examined before the error was detected.  Otherwise
+   signal an error.  */
+static scm_t_wchar
+peek_iconv_codepoint (SCM port, size_t *len)
 {
   scm_t_iconv_descriptors *id;
   scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
@@ -1736,16 +1723,13 @@ peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
 
       if (scm_port_buffer_can_take (read_buf) <= input_size)
        {
+          *len = input_size;
           if (input_size == 0)
             /* Normal EOF.  */
-            {
-              *codepoint = (scm_t_wchar) EOF;
-              *len = 0;
-              return 0;
-            }
-          else
-            /* EOF found in the middle of a multibyte character. */
-            return EILSEQ;
+            return EOF;
+
+          /* EOF found in the middle of a multibyte character. */
+          goto decoding_error;
        }
 
       input_size++;
@@ -1764,8 +1748,9 @@ peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
             /* The input byte sequence did not form a complete
                character.  Read another byte and try again. */
             continue;
-          else
-            return err;
+
+          *len = input_size;
+          goto decoding_error;
        }
       else
         {
@@ -1779,36 +1764,35 @@ peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
 
           /* iconv generated output.  Convert the UTF8_BUF sequence
              to a Unicode code point.  */
-          *codepoint = utf8_to_codepoint (utf8_buf, output_size);
           *len = input_size;
-          return 0;
+          return utf8_to_codepoint (utf8_buf, output_size);
         }
     }
+
+ decoding_error:
+  if (scm_is_eq (SCM_PTAB_ENTRY (port)->conversion_strategy, sym_substitute))
+    return '?';
+
+  scm_decoding_error ("peek-char", EILSEQ, "input decoding error",
+                      port);
+  /* Not reached.  */
+  return 0;
 }
 
 /* Peek a codepoint from PORT and return it in *CODEPOINT.  Set *LEN to
    the length in bytes of that representation.  Return 0 on success and
    an errno value on error.  */
-static SCM_C_INLINE int
-peek_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
+static SCM_C_INLINE scm_t_wchar
+peek_codepoint (SCM port, size_t *len)
 {
-  int err;
-  scm_t_port *pt = SCM_PTAB_ENTRY (port);
+  SCM encoding = SCM_PTAB_ENTRY (port)->encoding;
 
-  if (scm_is_eq (pt->encoding, sym_UTF_8))
-    err = peek_utf8_codepoint (port, codepoint, len);
-  else if (scm_is_eq (pt->encoding, sym_ISO_8859_1))
-    err = peek_latin1_codepoint (port, codepoint, len);
+  if (scm_is_eq (encoding, sym_UTF_8))
+    return peek_utf8_codepoint (port, len);
+  else if (scm_is_eq (encoding, sym_ISO_8859_1))
+    return peek_latin1_codepoint (port, len);
   else
-    err = peek_iconv_codepoint (port, codepoint, len);
-
-  if (err != 0 && scm_is_eq (pt->conversion_strategy, sym_substitute))
-    {
-      *codepoint = '?';
-      err = 0;
-    }
-
-  return err;
+    return peek_iconv_codepoint (port, len);
 }
 
 /* Read a codepoint from PORT and return it.  */
@@ -1816,13 +1800,10 @@ scm_t_wchar
 scm_getc (SCM port)
 #define FUNC_NAME "scm_getc"
 {
-  int err;
   size_t len = 0;
-  scm_t_wchar codepoint = EOF;
+  scm_t_wchar codepoint;
 
-  err = peek_codepoint (port, &codepoint, &len);
-  if (SCM_UNLIKELY (err != 0))
-    scm_decoding_error (FUNC_NAME, err, "input decoding error", port);
+  codepoint = peek_codepoint (port, &len);
   scm_port_buffer_did_take (SCM_PTAB_ENTRY (port)->read_buf, len);
   if (codepoint == EOF)
     scm_i_clear_pending_eof (port);
@@ -2009,7 +1990,6 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
            "sequence when the error is raised.\n")
 #define FUNC_NAME s_scm_peek_char
 {
-  int err;
   scm_t_wchar c;
   size_t len = 0;
 
@@ -2017,14 +1997,9 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
     port = scm_current_input_port ();
   SCM_VALIDATE_OPINPORT (1, port);
 
-  err = peek_codepoint (port, &c, &len);
-
-  if (err == 0)
-    return c == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (c);
+  c = peek_codepoint (port, &len);
 
-  scm_decoding_error (FUNC_NAME, err, "input decoding error", port);
-  /* Not reached.  */
-  return SCM_BOOL_F;
+  return c == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (c);
 }
 #undef FUNC_NAME
 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]