guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.1-35-g7b292a9


From: Ludovic Courtès
Subject: [Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.1-35-g7b292a9
Date: Fri, 06 May 2011 15:54:38 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU Guile".

http://git.savannah.gnu.org/cgit/guile.git/commit/?id=7b292a9d349bd09be4a493a51812d66b7ecbc728

The branch, stable-2.0 has been updated
       via  7b292a9d349bd09be4a493a51812d66b7ecbc728 (commit)
       via  1f78c6691fbcfe059c74ac93b64a453eb2353ced (commit)
      from  a2a6c0e319b5c146c484cb1fe8ffc9b14b9a9876 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 7b292a9d349bd09be4a493a51812d66b7ecbc728
Author: Ludovic Courtès <address@hidden>
Date:   Fri May 6 17:54:09 2011 +0200

    Special-case UTF-8 ports to bypass `iconv' entirely.
    
    * libguile/ports.c (update_port_lf): Handle EOF.
      (get_utf8_codepoint, get_iconv_codepoint): New functions.
      (get_codepoint): Use them.
      (scm_i_set_port_encoding_x): Don't open conversion descriptors when
      ENCODING is "UTF-8".
    
    * libguile/print.c (display_string_as_utf8, display_string_using_iconv):
      New functions.
      (display_string): Use them.
    
    * test-suite/tests/ports.test ("string ports")[#xc2 #x41 #x42]: Add a
      note that this is not the right behavior per Unicode 6.0.0.

commit 1f78c6691fbcfe059c74ac93b64a453eb2353ced
Author: Ludovic Courtès <address@hidden>
Date:   Fri May 6 17:43:37 2011 +0200

    Fix `foreign.test' for big endian machines.
    
    * test-suite/tests/foreign.test ("pointer<->bytevector")["pointer from
      bits", "dereference-pointer"]: Fix iteration order for big endian
      machines.

-----------------------------------------------------------------------

Summary of changes:
 libguile/ports.c              |  252 +++++++++++++++++++++++++++++++++--------
 libguile/print.c              |   84 +++++++++++---
 test-suite/tests/foreign.test |   28 +++--
 test-suite/tests/ports.test   |    8 ++
 4 files changed, 302 insertions(+), 70 deletions(-)

diff --git a/libguile/ports.c b/libguile/ports.c
index b5ad95e..767e086 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1057,6 +1057,7 @@ update_port_lf (scm_t_wchar c, SCM port)
   switch (c)
     {
     case '\a':
+    case EOF:
       break;
     case '\b':
       SCM_DECCOL (port);
@@ -1115,23 +1116,162 @@ utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
   return codepoint;
 }
 
-/* Read a codepoint from PORT and return it in *CODEPOINT.  Fill BUF
-   with the byte representation of the codepoint in PORT's encoding, and
-   set *LEN to the length in bytes of that representation.  Return 0 on
-   success and an errno value on error.  */
+/* Read a UTF-8 sequence from PORT.  On success, return 0 and set
+   *CODEPOINT to the codepoint that was read, fill BUF with its UTF-8
+   representation, and set *LEN to the length in bytes.  Return
+   `EILSEQ' on error.  */
 static int
-get_codepoint (SCM port, scm_t_wchar *codepoint,
-              char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
+                   scm_t_uint8 buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
 {
+#define ASSERT_NOT_EOF(b)                      \
+  if (SCM_UNLIKELY ((b) == EOF))               \
+    goto invalid_seq
+
+  int byte;
+
+  *len = 0;
+
+  byte = scm_get_byte_or_eof (port);
+  if (byte == EOF)
+    {
+      *codepoint = EOF;
+      return 0;
+    }
+
+  buf[0] = (scm_t_uint8) byte;
+  *len = 1;
+
+  if (buf[0] <= 0x7f)
+    /* 1-byte form.  */
+    *codepoint = buf[0];
+  else if (buf[0] >= 0xc2 && buf[0] <= 0xdf)
+    {
+      /* 2-byte form.  */
+      byte = scm_get_byte_or_eof (port);
+      ASSERT_NOT_EOF (byte);
+
+      buf[1] = (scm_t_uint8) byte;
+      *len = 2;
+
+      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+       goto invalid_seq;
+
+      *codepoint = ((scm_t_wchar) buf[0] & 0x1f) << 6UL
+       | (buf[1] & 0x3f);
+    }
+  else if ((buf[0] & 0xf0) == 0xe0)
+    {
+      /* 3-byte form.  */
+      byte = scm_get_byte_or_eof (port);
+      if (SCM_UNLIKELY (byte == EOF))
+       goto invalid_seq;
+
+      buf[1] = (scm_t_uint8) byte;
+      *len = 2;
+
+      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80
+                       || (buf[0] == 0xe0 && byte < 0xa0)
+                       || (buf[0] == 0xed && byte > 0x9f)))
+       {
+         /* Swallow the 3rd byte.  */
+         byte = scm_get_byte_or_eof (port);
+         ASSERT_NOT_EOF (byte);
+         *len = 3, buf[2] = byte;
+         goto invalid_seq;
+       }
+
+
+      byte = scm_get_byte_or_eof (port);
+      ASSERT_NOT_EOF (byte);
+
+      buf[2] = (scm_t_uint8) byte;
+      *len = 3;
+
+      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+       goto invalid_seq;
+
+      *codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL
+       | ((scm_t_wchar) buf[1] & 0x3f) << 6UL
+       | (buf[2] & 0x3f);
+    }
+  else if (buf[0] >= 0xf0 && buf[0] <= 0xf4)
+    {
+      /* 4-byte form.  */
+      byte = scm_get_byte_or_eof (port);
+      ASSERT_NOT_EOF (byte);
+
+      buf[1] = (scm_t_uint8) byte;
+      *len = 2;
+
+      if (SCM_UNLIKELY (((byte & 0xc0) != 0x80)
+                       || (buf[0] == 0xf0 && byte < 0x90)
+                       || (buf[0] == 0xf4 && byte > 0x8f)))
+       {
+         /* Swallow the 3rd and 4th bytes.  */
+         byte = scm_get_byte_or_eof (port);
+         ASSERT_NOT_EOF (byte);
+         *len = 3, buf[2] = byte;
+
+         byte = scm_get_byte_or_eof (port);
+         ASSERT_NOT_EOF (byte);
+         *len = 4, buf[3] = byte;
+         goto invalid_seq;
+       }
+
+      byte = scm_get_byte_or_eof (port);
+      ASSERT_NOT_EOF (byte);
+
+      buf[2] = (scm_t_uint8) byte;
+      *len = 3;
+
+      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+       {
+         /* Swallow the 4th byte.  */
+         byte = scm_get_byte_or_eof (port);
+         ASSERT_NOT_EOF (byte);
+         *len = 4, buf[3] = byte;
+         goto invalid_seq;
+       }
+
+      byte = scm_get_byte_or_eof (port);
+      ASSERT_NOT_EOF (byte);
+
+      buf[3] = (scm_t_uint8) byte;
+      *len = 4;
+
+      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+       goto invalid_seq;
+
+      *codepoint = ((scm_t_wchar) buf[0] & 0x07) << 18UL
+       | ((scm_t_wchar) buf[1] & 0x3f) << 12UL
+       | ((scm_t_wchar) buf[2] & 0x3f) << 6UL
+       | (buf[3] & 0x3f);
+    }
+  else
+    goto invalid_seq;
+
+  return 0;
+
+ invalid_seq:
+  return EILSEQ;
+
+#undef ASSERT_NOT_EOF
+}
+
+/* Likewise, read a byte sequence from PORT, passing it through its
+   input conversion descriptor.  */
+static int
+get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
+                    char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+  scm_t_port *pt;
   int err, byte_read;
   size_t bytes_consumed, output_size;
   char *output;
   scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
-  scm_t_port *pt = SCM_PTAB_ENTRY (port);
 
-  if (SCM_UNLIKELY (pt->input_cd == (iconv_t) -1))
-    /* Initialize the conversion descriptors.  */
-    scm_i_set_port_encoding_x (port, pt->encoding);
+  pt = SCM_PTAB_ENTRY (port);
 
   for (output_size = 0, output = (char *) utf8_buf,
         bytes_consumed = 0, err = 0;
@@ -1177,31 +1317,46 @@ get_codepoint (SCM port, scm_t_wchar *codepoint,
   if (SCM_UNLIKELY (output_size == 0))
     /* An unterminated sequence.  */
     err = EILSEQ;
-
-  if (SCM_UNLIKELY (err != 0))
+  else if (SCM_LIKELY (err == 0))
     {
-      /* Reset the `iconv' state.  */
-      iconv (pt->input_cd, NULL, NULL, NULL, NULL);
+      /* Convert the UTF8_BUF sequence to a Unicode code point.  */
+      *codepoint = utf8_to_codepoint (utf8_buf, output_size);
+      *len = bytes_consumed;
+    }
 
-      if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
-       {
-         *codepoint = '?';
-         err = 0;
-       }
+  return err;
+}
 
-      /* Fail when the strategy is SCM_ICONVEH_ERROR or
-        SCM_ICONVEH_ESCAPE_SEQUENCE (the latter doesn't make sense for
-        input encoding errors.)  */
-    }
+/* Read a codepoint from PORT and return it in *CODEPOINT.  Fill BUF
+   with the byte representation of the codepoint in PORT's encoding, and
+   set *LEN to the length in bytes of that representation.  Return 0 on
+   success and an errno value on error.  */
+static int
+get_codepoint (SCM port, scm_t_wchar *codepoint,
+              char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+  int err;
+  scm_t_port *pt = SCM_PTAB_ENTRY (port);
+
+  if (pt->input_cd == (iconv_t) -1)
+    /* Initialize the conversion descriptors, if needed.  */
+    scm_i_set_port_encoding_x (port, pt->encoding);
+
+  /* FIXME: In 2.1, add a flag to determine whether a port is UTF-8.  */
+  if (pt->input_cd == (iconv_t) -1)
+    err = get_utf8_codepoint (port, codepoint, (scm_t_uint8 *) buf, len);
   else
+    err = get_iconv_codepoint (port, codepoint, buf, len);
+
+  if (SCM_LIKELY (err == 0))
+    update_port_lf (*codepoint, port);
+  else if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
     {
-      /* Convert the UTF8_BUF sequence to a Unicode code point.  */
-      *codepoint = utf8_to_codepoint (utf8_buf, output_size);
+      *codepoint = '?';
+      err = 0;
       update_port_lf (*codepoint, port);
     }
 
-  *len = bytes_consumed;
-
   return err;
 }
 
@@ -2031,28 +2186,35 @@ scm_i_set_port_encoding_x (SCM port, const char *encoding)
   if (encoding == NULL)
     encoding = "ISO-8859-1";
 
-  pt->encoding = scm_gc_strdup (encoding, "port");
+  if (pt->encoding != encoding)
+    pt->encoding = scm_gc_strdup (encoding, "port");
 
-  if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
+  /* If ENCODING is UTF-8, then no conversion descriptor is opened
+     because we do I/O ourselves.  This saves 100+ KiB for each
+     descriptor.  */
+  if (strcmp (encoding, "UTF-8"))
     {
-      /* Open an input iconv conversion descriptor, from ENCODING
-        to UTF-8.  We choose UTF-8, not UTF-32, because iconv
-        implementations can typically convert from anything to
-        UTF-8, but not to UTF-32 (see
-        <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>).  */
-      new_input_cd = iconv_open ("UTF-8", encoding);
-      if (new_input_cd == (iconv_t) -1)
-       goto invalid_encoding;
-    }
+      if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
+       {
+         /* Open an input iconv conversion descriptor, from ENCODING
+            to UTF-8.  We choose UTF-8, not UTF-32, because iconv
+            implementations can typically convert from anything to
+            UTF-8, but not to UTF-32 (see
+            <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>).  */
+         new_input_cd = iconv_open ("UTF-8", encoding);
+         if (new_input_cd == (iconv_t) -1)
+           goto invalid_encoding;
+       }
 
-  if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
-    {
-      new_output_cd = iconv_open (encoding, "UTF-8");
-      if (new_output_cd == (iconv_t) -1)
+      if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
        {
-         if (new_input_cd != (iconv_t) -1)
-           iconv_close (new_input_cd);
-         goto invalid_encoding;
+         new_output_cd = iconv_open (encoding, "UTF-8");
+         if (new_output_cd == (iconv_t) -1)
+           {
+             if (new_input_cd != (iconv_t) -1)
+               iconv_close (new_input_cd);
+             goto invalid_encoding;
+           }
        }
     }
 
diff --git a/libguile/print.c b/libguile/print.c
index 1399566..453c8a9 100644
--- a/libguile/print.c
+++ b/libguile/print.c
@@ -821,31 +821,57 @@ codepoint_to_utf8 (scm_t_wchar ch, scm_t_uint8 utf8[4])
   return len;
 }
 
-/* Display the LEN codepoints in STR to PORT according to STRATEGY;
-   return the number of codepoints successfully displayed.  If NARROW_P,
-   then STR is interpreted as a sequence of `char', denoting a Latin-1
-   string; otherwise it's interpreted as a sequence of
-   `scm_t_wchar'.  */
-static size_t
-display_string (const void *str, int narrow_p,
-               size_t len, SCM port,
-               scm_t_string_failed_conversion_handler strategy)
-
-{
 #define STR_REF(s, x)                          \
   (narrow_p                                    \
    ? (scm_t_wchar) ((unsigned char *) (s))[x]  \
    : ((scm_t_wchar *) (s))[x])
 
+/* Write STR to PORT as UTF-8.  STR is a LEN-codepoint string; it is
+   narrow if NARROW_P is true, wide otherwise.  Return LEN.  */
+static size_t
+display_string_as_utf8 (const void *str, int narrow_p, size_t len,
+                       SCM port)
+{
+  size_t printed = 0;
+
+  while (len > printed)
+    {
+      size_t utf8_len, i;
+      char *input, utf8_buf[256];
+
+      /* Convert STR to UTF-8.  */
+      for (i = printed, utf8_len = 0, input = utf8_buf;
+          i < len && utf8_len + 4 < sizeof (utf8_buf);
+          i++)
+       {
+         utf8_len += codepoint_to_utf8 (STR_REF (str, i),
+                                        (scm_t_uint8 *) input);
+         input = utf8_buf + utf8_len;
+       }
+
+      /* INPUT was successfully converted, entirely; print the
+        result.  */
+      scm_lfwrite (utf8_buf, utf8_len, port);
+      printed += i - printed;
+    }
+
+  assert (printed == len);
+
+  return len;
+}
+
+/* Convert STR through PORT's output conversion descriptor and write the
+   output to PORT.  Return the number of codepoints written.  */
+static size_t
+display_string_using_iconv (const void *str, int narrow_p, size_t len,
+                           SCM port,
+                           scm_t_string_failed_conversion_handler strategy)
+{
   size_t printed;
   scm_t_port *pt;
 
   pt = SCM_PTAB_ENTRY (port);
 
-  if (SCM_UNLIKELY (pt->output_cd == (iconv_t) -1))
-    /* Initialize the conversion descriptors.  */
-    scm_i_set_port_encoding_x (port, pt->encoding);
-
   printed = 0;
 
   while (len > printed)
@@ -928,7 +954,35 @@ display_string (const void *str, int narrow_p,
     }
 
   return printed;
+}
+
 #undef STR_REF
+
+/* Display the LEN codepoints in STR to PORT according to STRATEGY;
+   return the number of codepoints successfully displayed.  If NARROW_P,
+   then STR is interpreted as a sequence of `char', denoting a Latin-1
+   string; otherwise it's interpreted as a sequence of
+   `scm_t_wchar'.  */
+static size_t
+display_string (const void *str, int narrow_p,
+               size_t len, SCM port,
+               scm_t_string_failed_conversion_handler strategy)
+
+{
+  scm_t_port *pt;
+
+  pt = SCM_PTAB_ENTRY (port);
+
+  if (pt->output_cd == (iconv_t) -1)
+    /* Initialize the conversion descriptors, if needed.  */
+    scm_i_set_port_encoding_x (port, pt->encoding);
+
+  /* FIXME: In 2.1, add a flag to determine whether a port is UTF-8.  */
+  if (pt->output_cd == (iconv_t) -1)
+    return display_string_as_utf8 (str, narrow_p, len, port);
+  else
+    return display_string_using_iconv (str, narrow_p, len,
+                                      port, strategy);
 }
 
 /* Attempt to display CH to PORT according to STRATEGY.  Return non-zero
diff --git a/test-suite/tests/foreign.test b/test-suite/tests/foreign.test
index 60b466e..5ddd31c 100644
--- a/test-suite/tests/foreign.test
+++ b/test-suite/tests/foreign.test
@@ -124,24 +124,32 @@
 
   (pass-if "pointer from bits"
     (let* ((bytes (iota (sizeof '*)))
-           (bv    (u8-list->bytevector bytes)))
+           (bv    (u8-list->bytevector bytes))
+           (fold  (case (native-endianness)
+                    ((little) fold-right)
+                    ((big)    fold)
+                    (else     (error "unsupported endianness")))))
       (= (pointer-address
           (make-pointer (bytevector-uint-ref bv 0 (native-endianness)
                                              (sizeof '*))))
-         (fold-right (lambda (byte address)
-                       (+ byte (* 256 address)))
-                     0
-                     bytes))))
+         (fold (lambda (byte address)
+                 (+ byte (* 256 address)))
+               0
+               bytes))))
 
   (pass-if "dereference-pointer"
     (let* ((bytes (iota (sizeof '*)))
-           (bv    (u8-list->bytevector bytes)))
+           (bv    (u8-list->bytevector bytes))
+           (fold  (case (native-endianness)
+                    ((little) fold-right)
+                    ((big)    fold)
+                    (else     (error "unsupported endianness")))))
       (= (pointer-address
           (dereference-pointer (bytevector->pointer bv)))
-         (fold-right (lambda (byte address)
-                       (+ byte (* 256 address)))
-                     0
-                     bytes)))))
+         (fold (lambda (byte address)
+                 (+ byte (* 256 address)))
+               0
+               bytes)))))
 
 
 (with-test-prefix "pointer<->string"
diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test
index 9fb6a96..c1ee7d1 100644
--- a/test-suite/tests/ports.test
+++ b/test-suite/tests/ports.test
@@ -572,6 +572,14 @@
        eof))
 
     (test-decoding-error (#xc2 #x41 #x42) "UTF-8"
+      ;; FIXME: This is the behavior of glibc/libiconv but it does not
+      ;; conform to the Unicode 6.0.0 recommendation: according to it,
+      ;; the #\A should not be swallowed (Section 3.9 reads:
+      ;; "If the converter encounters an ill-formed UTF-8 code unit
+      ;; sequence which starts with a valid first byte, but which does
+      ;; not continue with valid successor bytes (see Table 3-7), it
+      ;; must not consume the successor bytes".)
+
       (error                ;; 41: should be in the 80..BF range
        #\B
        eof))


hooks/post-receive
-- 
GNU Guile



reply via email to

[Prev in Thread] Current Thread [Next in Thread]