[Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.7-274-g1ee237d

guile-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.7-274-g1ee237d

From:	Mark H Weaver
Subject:	[Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.7-274-g1ee237d
Date:	Tue, 02 Apr 2013 21:51:44 +0000
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU Guile".

http://git.savannah.gnu.org/cgit/guile.git/commit/?id=1ee237d9a159e0e9a995ecb9fea24e1d39a7c5e1

The branch, stable-2.0 has been updated
       via  1ee237d9a159e0e9a995ecb9fea24e1d39a7c5e1 (commit)
       via  8a2b596579185cd0f4d35da478f447e529d81a80 (commit)
       via  187fa0b9e7ff9b2d6204517a9daa9009245c7511 (commit)
      from  05d7f76296dc9fa21e0abd1ce6105a042905f48e (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 1ee237d9a159e0e9a995ecb9fea24e1d39a7c5e1
Author: Mark H Weaver <address@hidden>
Date:   Tue Apr 2 17:26:37 2013 -0400

    Rewrite get_iconv_codepoint to fix a bug involving byte-order marks.
    
    * libguile/ports.c (get_iconv_codepoint): Rewrite to fix a bug and
      improve efficiency and clarity.  Previously, it incorrectly assumed
      that iconv would never consume input without producing output, which
      led to a buffer overrun and subsequent assertion failure.  This
      happens when a byte-order mark is consumed by iconv at the beginning
      of the stream when using the UTF-16 or UTF-32 encodings.
    
    * test-suite/tests/ports.test (unicode byte-order marks (BOMs)):
      Add tests.

commit 8a2b596579185cd0f4d35da478f447e529d81a80
Author: Mark H Weaver <address@hidden>
Date:   Tue Apr 2 05:33:24 2013 -0400

    Move slow path out of 'scm_get_byte_or_eof' et al.
    
    Suggested by Andy Wingo.
    
    * libguile/inline.h (scm_get_byte_or_eof, scm_peek_byte_or_eof): Keep
      only the fast path here, with fallback to 'scm_i_get_byte_or_eof' and
      'scm_i_peek_byte_or_eof'.
    
    * libguile/ports.c (scm_i_get_byte_or_eof, scm_i_peek_byte_or_eof):
      New internal functions.
    
    * libguile/ports.h (scm_i_get_byte_or_eof, scm_i_peek_byte_or_eof): Add
      prototypes.

commit 187fa0b9e7ff9b2d6204517a9daa9009245c7511
Author: Mark H Weaver <address@hidden>
Date:   Tue Apr 2 13:33:14 2013 -0400

    Add a static version of 'scm_fill_input' to ports.c.
    
    * libguile/ports.c (scm_i_fill_input): New static function, containing
      the code that was previously in 'scm_fill_input'.
      (scm_fill_input): Simply call 'scm_i_fill_input'.
      (scm_c_read): Use 'scm_i_fill_input'.

-----------------------------------------------------------------------

Summary of changes:
 libguile/inline.h           |   44 +++----------
 libguile/ports.c            |  154 +++++++++++++++++++++++++++++--------------
 libguile/ports.h            |    2 +
 test-suite/tests/ports.test |   92 +++++++++++++++++++++++++
 4 files changed, 209 insertions(+), 83 deletions(-)

diff --git a/libguile/inline.h b/libguile/inline.h
index 88ba7f7..17d8a0c 100644
--- a/libguile/inline.h
+++ b/libguile/inline.h
@@ -96,50 +96,26 @@ scm_is_string (SCM x)
 SCM_INLINE_IMPLEMENTATION int
 scm_get_byte_or_eof (SCM port)
 {
-  int c;
   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 
-  if (pt->rw_active == SCM_PORT_WRITE)
-    /* may be marginally faster than calling scm_flush.  */
-    scm_ptobs[SCM_PTOBNUM (port)].flush (port);
-
-  if (pt->rw_random)
-    pt->rw_active = SCM_PORT_READ;
-
-  if (pt->read_pos >= pt->read_end)
-    {
-      if (SCM_UNLIKELY (scm_fill_input (port) == EOF))
-       return EOF;
-    }
-
-  c = *(pt->read_pos++);
-
-  return c;
+  if (SCM_LIKELY ((pt->rw_active == SCM_PORT_READ || !pt->rw_random)
+                  && pt->read_pos < pt->read_end))
+    return *pt->read_pos++;
+  else
+    return scm_i_get_byte_or_eof (port);
 }
 
 /* Like `scm_get_byte_or_eof' but does not change PORT's `read_pos'.  */
 SCM_INLINE_IMPLEMENTATION int
 scm_peek_byte_or_eof (SCM port)
 {
-  int c;
   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 
-  if (pt->rw_active == SCM_PORT_WRITE)
-    /* may be marginally faster than calling scm_flush.  */
-    scm_ptobs[SCM_PTOBNUM (port)].flush (port);
-
-  if (pt->rw_random)
-    pt->rw_active = SCM_PORT_READ;
-
-  if (pt->read_pos >= pt->read_end)
-    {
-      if (SCM_UNLIKELY (scm_fill_input (port) == EOF))
-       return EOF;
-    }
-
-  c = *pt->read_pos;
-
-  return c;
+  if (SCM_LIKELY ((pt->rw_active == SCM_PORT_READ || !pt->rw_random)
+                  && pt->read_pos < pt->read_end))
+    return *pt->read_pos;
+  else
+    return scm_i_peek_byte_or_eof (port);
 }
 
 SCM_INLINE_IMPLEMENTATION void
diff --git a/libguile/ports.c b/libguile/ports.c
index becdbed..2170d96 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1306,65 +1306,73 @@ static int
 get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
                     char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
 {
-  scm_t_iconv_descriptors *id;
-  int err, byte_read;
-  size_t bytes_consumed, output_size;
-  char *output;
+  scm_t_iconv_descriptors *id = scm_i_port_iconv_descriptors (port);
   scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
+  size_t input_size = 0;
 
-  id = scm_i_port_iconv_descriptors (port);
-
-  for (output_size = 0, output = (char *) utf8_buf,
-        bytes_consumed = 0, err = 0;
-       err == 0 && output_size == 0
-        && (bytes_consumed == 0 || byte_read != EOF);
-       bytes_consumed++)
+  for (;;)
     {
-      char *input;
+      int byte_read;
+      char *input, *output;
       size_t input_left, output_left, done;
 
       byte_read = scm_get_byte_or_eof (port);
-      if (byte_read == EOF)
+      if (SCM_UNLIKELY (byte_read == EOF))
        {
-         if (bytes_consumed == 0)
-           {
-             *codepoint = (scm_t_wchar) EOF;
-             *len = 0;
-             return 0;
-           }
-         else
-           continue;
+          if (SCM_LIKELY (input_size == 0))
+            {
+              *codepoint = (scm_t_wchar) EOF;
+              *len = input_size;
+              return 0;
+            }
+          else
+            /* EOF found in the middle of a multibyte character. */
+            return EILSEQ;
        }
 
-      buf[bytes_consumed] = byte_read;
+      buf[input_size++] = byte_read;
 
       input = buf;
-      input_left = bytes_consumed + 1;
+      input_left = input_size;
+      output = (char *) utf8_buf;
       output_left = sizeof (utf8_buf);
 
       done = iconv (id->input_cd, &input, &input_left, &output, &output_left);
+
       if (done == (size_t) -1)
        {
-         err = errno;
-         if (err == EINVAL)
-           /* Missing input: keep trying.  */
-           err = 0;
+         int err = errno;
+         if (SCM_LIKELY (err == EINVAL))
+            /* The input byte sequence did not form a complete
+               character.  Read another byte and try again. */
+            continue;
+          else
+            return err;
        }
       else
-       output_size = sizeof (utf8_buf) - output_left;
-    }
-
-  if (SCM_UNLIKELY (output_size == 0))
-    /* An unterminated sequence.  */
-    err = EILSEQ;
-  else if (SCM_LIKELY (err == 0))
-    {
-      /* Convert the UTF8_BUF sequence to a Unicode code point.  */
-      *codepoint = utf8_to_codepoint (utf8_buf, output_size);
-      *len = bytes_consumed;
+        {
+          size_t output_size = sizeof (utf8_buf) - output_left;
+          if (SCM_LIKELY (output_size > 0))
+            {
+              /* iconv generated output.  Convert the UTF8_BUF sequence
+                 to a Unicode code point.  */
+              *codepoint = utf8_to_codepoint (utf8_buf, output_size);
+              *len = input_size;
+              return 0;
+            }
+          else
+            {
+              /* iconv consumed some bytes without producing any output.
+                 Most likely this means that a Unicode byte-order mark
+                 (BOM) was consumed, which should not be included in the
+                 returned buf.  Shift any remaining bytes to the beginning
+                 of buf, and continue the loop. */
+              memmove (buf, input, input_left);
+              input_size = input_left;
+              continue;
+            }
+        }
     }
-
-  return err;
 }
 
 /* Read a codepoint from PORT and return it in *CODEPOINT.  Fill BUF
@@ -1419,8 +1427,8 @@ scm_getc (SCM port)
 /* this should only be called when the read buffer is empty.  it
    tries to refill the read buffer.  it returns the first char from
    the port, which is either EOF or *(pt->read_pos).  */
-int
-scm_fill_input (SCM port)
+static int
+scm_i_fill_input (SCM port)
 {
   scm_t_port *pt = SCM_PTAB_ENTRY (port);
 
@@ -1439,6 +1447,54 @@ scm_fill_input (SCM port)
   return scm_ptobs[SCM_PTOBNUM (port)].fill_input (port);
 }
 
+int
+scm_fill_input (SCM port)
+{
+  return scm_i_fill_input (port);
+}
+
+/* Slow-path fallback for 'scm_get_byte_or_eof' in inline.h */
+int
+scm_i_get_byte_or_eof (SCM port)
+{
+  scm_t_port *pt = SCM_PTAB_ENTRY (port);
+
+  if (pt->rw_active == SCM_PORT_WRITE)
+    scm_flush (port);
+
+  if (pt->rw_random)
+    pt->rw_active = SCM_PORT_READ;
+
+  if (pt->read_pos >= pt->read_end)
+    {
+      if (SCM_UNLIKELY (scm_i_fill_input (port) == EOF))
+       return EOF;
+    }
+
+  return *pt->read_pos++;
+}
+
+/* Slow-path fallback for 'scm_peek_byte_or_eof' in inline.h */
+int
+scm_i_peek_byte_or_eof (SCM port)
+{
+  scm_t_port *pt = SCM_PTAB_ENTRY (port);
+
+  if (pt->rw_active == SCM_PORT_WRITE)
+    scm_flush (port);
+
+  if (pt->rw_random)
+    pt->rw_active = SCM_PORT_READ;
+
+  if (pt->read_pos >= pt->read_end)
+    {
+      if (SCM_UNLIKELY (scm_i_fill_input (port) == EOF))
+       return EOF;
+    }
+
+  return *pt->read_pos;
+}
+
 
 /* scm_lfwrite
  *
@@ -1547,8 +1603,8 @@ scm_c_read (SCM port, void *buffer, size_t size)
   if (size == 0)
     return n_read;
 
-  /* Now we will call scm_fill_input repeatedly until we have read the
-     requested number of bytes.  (Note that a single scm_fill_input
+  /* Now we will call scm_i_fill_input repeatedly until we have read the
+     requested number of bytes.  (Note that a single scm_i_fill_input
      call does not guarantee to fill the whole of the port's read
      buffer.) */
   if (pt->read_buf_size <= 1 && pt->encoding == NULL)
@@ -1556,12 +1612,12 @@ scm_c_read (SCM port, void *buffer, size_t size)
       /* The port that we are reading from is unbuffered - i.e. does
         not have its own persistent buffer - but we have a buffer,
         provided by our caller, that is the right size for the data
-        that is wanted.  For the following scm_fill_input calls,
+        that is wanted.  For the following scm_i_fill_input calls,
         therefore, we use the buffer in hand as the port's read
         buffer.
 
         We need to make sure that the port's normal (1 byte) buffer
-        is reinstated in case one of the scm_fill_input () calls
+        is reinstated in case one of the scm_i_fill_input () calls
         throws an exception; we use the scm_dynwind_* API to achieve
         that. 
 
@@ -1578,9 +1634,9 @@ scm_c_read (SCM port, void *buffer, size_t size)
       scm_dynwind_rewind_handler (swap_buffer, &psb, SCM_F_WIND_EXPLICITLY);
       scm_dynwind_unwind_handler (swap_buffer, &psb, SCM_F_WIND_EXPLICITLY);
 
-      /* Call scm_fill_input until we have all the bytes that we need,
+      /* Call scm_i_fill_input until we have all the bytes that we need,
         or we hit EOF. */
-      while (pt->read_buf_size && (scm_fill_input (port) != EOF))
+      while (pt->read_buf_size && (scm_i_fill_input (port) != EOF))
        {
          pt->read_buf_size -= (pt->read_end - pt->read_pos);
          pt->read_pos = pt->read_buf = pt->read_end;
@@ -1604,7 +1660,7 @@ scm_c_read (SCM port, void *buffer, size_t size)
         that a custom port implementation's entry points (in
         particular, fill_input) can rely on the buffer always being
         the same as they first set up. */
-      while (size && (scm_fill_input (port) != EOF))
+      while (size && (scm_i_fill_input (port) != EOF))
        {
          n_available = min (size, pt->read_end - pt->read_pos);
          memcpy (buffer, pt->read_pos, n_available);
diff --git a/libguile/ports.h b/libguile/ports.h
index 53d5081..54bf595 100644
--- a/libguile/ports.h
+++ b/libguile/ports.h
@@ -328,6 +328,8 @@ scm_i_default_port_conversion_handler (void);
 /* Use HANDLER as the default conversion strategy for future ports.  */
 SCM_INTERNAL void
 scm_i_set_default_port_conversion_handler (scm_t_string_failed_conversion_handler);
+SCM_INTERNAL int scm_i_get_byte_or_eof (SCM port);
+SCM_INTERNAL int scm_i_peek_byte_or_eof (SCM port);
 
 SCM_API SCM scm_port_conversion_strategy (SCM port);
 SCM_API SCM scm_set_port_conversion_strategy_x (SCM port, SCM behavior);
diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test
index 886ab24..c73e6be 100644
--- a/test-suite/tests/ports.test
+++ b/test-suite/tests/ports.test
@@ -1149,6 +1149,98 @@
 
 
 
+(define (bv-read-test encoding bv)
+  (let ((port (open-bytevector-input-port bv)))
+    (set-port-encoding! port encoding)
+    (read-string port)))
+
+(with-test-prefix "unicode byte-order marks (BOMs)"
+
+  (pass-if-equal "BOM not discarded from Latin-1 stream"
+      "\xEF\xBB\xBF\x61"
+    (bv-read-test "ISO-8859-1" #vu8(#xEF #xBB #xBF #x61)))
+
+  (pass-if-equal "BOM not discarded from Latin-2 stream"
+      "\u010F\u0165\u017C\x61"
+    (bv-read-test "ISO-8859-2" #vu8(#xEF #xBB #xBF #x61)))
+
+  (pass-if-equal "BOM not discarded from UTF-16BE stream"
+      "\uFEFF\x61"
+    (bv-read-test "UTF-16BE" #vu8(#xFE #xFF #x00 #x61)))
+
+  (pass-if-equal "BOM not discarded from UTF-16LE stream"
+      "\uFEFF\x61"
+    (bv-read-test "UTF-16LE" #vu8(#xFF #xFE #x61 #x00)))
+
+  (pass-if-equal "BOM not discarded from UTF-32BE stream"
+      "\uFEFF\x61"
+    (bv-read-test "UTF-32BE" #vu8(#x00 #x00 #xFE #xFF
+                                  #x00 #x00 #x00 #x61)))
+
+  (pass-if-equal "BOM not discarded from UTF-32LE stream"
+      "\uFEFF\x61"
+    (bv-read-test "UTF-32LE" #vu8(#xFF #xFE #x00 #x00
+                                  #x61 #x00 #x00 #x00)))
+
+  (pass-if-equal "BOM discarded from start of UTF-16 stream (BE)"
+      "a"
+    (bv-read-test "UTF-16" #vu8(#xFE #xFF #x00 #x61)))
+
+  (pass-if-equal "Only one BOM discarded from start of UTF-16 stream (BE)"
+      "\uFEFFa"
+    (bv-read-test "UTF-16" #vu8(#xFE #xFF #xFE #xFF #x00 #x61)))
+
+  (pass-if-equal "BOM not discarded unless at start of UTF-16 stream"
+      "a\uFEFFb"
+    (let ((be (bv-read-test "UTF-16" #vu8(#x00 #x61 #xFE #xFF #x00 #x62)))
+          (le (bv-read-test "UTF-16" #vu8(#x61 #x00 #xFF #xFE #x62 #x00))))
+      (if (char=? #\a (string-ref be 0))
+          be
+          le)))
+
+  (pass-if-equal "BOM discarded from start of UTF-16 stream (LE)"
+      "a"
+    (bv-read-test "UTF-16" #vu8(#xFF #xFE #x61 #x00)))
+
+  (pass-if-equal "Only one BOM discarded from start of UTF-16 stream (LE)"
+      "\uFEFFa"
+    (bv-read-test "UTF-16" #vu8(#xFF #xFE #xFF #xFE #x61 #x00)))
+
+  (pass-if-equal "BOM discarded from start of UTF-32 stream (BE)"
+      "a"
+    (bv-read-test "UTF-32" #vu8(#x00 #x00 #xFE #xFF #x00 #x00 #x00 #x61)))
+
+  (pass-if-equal "Only one BOM discarded from start of UTF-32 stream (BE)"
+      "\uFEFFa"
+    (bv-read-test "UTF-32" #vu8(#x00 #x00 #xFE #xFF
+                                #x00 #x00 #xFE #xFF
+                                #x00 #x00 #x00 #x61)))
+
+  (pass-if-equal "BOM not discarded unless at start of UTF-32 stream"
+      "a\uFEFFb"
+    (let ((be (bv-read-test "UTF-32" #vu8(#x00 #x00 #x00 #x61
+                                          #x00 #x00 #xFE #xFF
+                                          #x00 #x00 #x00 #x62)))
+          (le (bv-read-test "UTF-32" #vu8(#x61 #x00 #x00 #x00
+                                          #xFF #xFE #x00 #x00
+                                          #x62 #x00 #x00 #x00))))
+      (if (char=? #\a (string-ref be 0))
+          be
+          le)))
+
+  (pass-if-equal "BOM discarded from start of UTF-32 stream (LE)"
+      "a"
+    (bv-read-test "UTF-32" #vu8(#xFF #xFE #x00 #x00
+                                #x61 #x00 #x00 #x00)))
+
+  (pass-if-equal "Only one BOM discarded from start of UTF-32 stream (LE)"
+      "\uFEFFa"
+    (bv-read-test "UTF-32" #vu8(#xFF #xFE #x00 #x00
+                                #xFF #xFE #x00 #x00
+                                #x61 #x00 #x00 #x00))))
+
+
+
 (define-syntax-rule (with-load-path path body ...)
   (let ((new path)
         (old %load-path))


hooks/post-receive
-- 
GNU Guile
[Prev in Thread]
Current Thread
[Next in Thread]
[Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.7-274-g1ee237d, Mark H Weaver <=
Prev by Date: [Guile-commits] Hydra job gnu:guile-2-0:xbuild_mipsel_linux_gnu on x86_64-linux, build 4589354: Failed
Next by Date: [Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.7-275-g1fa89aa
Previous by thread: [Guile-commits] Hydra job gnu:guile-2-0:xbuild_mipsel_linux_gnu on x86_64-linux, build 4589354: Failed
Next by thread: [Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.7-275-g1fa89aa
Index(es):
- Date
- Thread