guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] 01/03: Handle BOM around fill/flush instead of peek/put


From: Andy Wingo
Subject: [Guile-commits] 01/03: Handle BOM around fill/flush instead of peek/put
Date: Thu, 05 May 2016 20:59:10 +0000

wingo pushed a commit to branch wip-port-refactor
in repository guile.

commit 86267af8b30be0901b6bb29cbdc730c0a5844bfd
Author: Andy Wingo <address@hidden>
Date:   Thu May 5 13:00:19 2016 +0200

    Handle BOM around fill/flush instead of peek/put
    
    * libguile/print.c (display_string_using_iconv): Remove BOM handling;
      this is now handled by scm_lfwrite.
    * libguile/ports.c (open_iconv_descriptors): Refactor to take encoding
      as a symbol.
      (prepare_iconv_descriptors): New helper.
      (scm_i_port_iconv_descriptors): Remove scm_t_port_rw_active argument,
      and don't sniff UTF-16/UTF-32 byte orders here.  Instead BOM handlers
      will call prepare_iconv_descriptors.
      (scm_c_read_bytes): Call new port_clear_stream_start_for_bom_read
      helper.
      (port_maybe_consume_initial_byte_order_mark)
      (scm_port_maybe_consume_initial_byte_order_mark): Remove.  Leaves
      Scheme %peek-char broken but it's unused currently so that's OK.
      (peek_iconv_codepoint): Fetch iconv descriptors after doing fill-input
      because it's fill-input that will sniff the BOM.
      (peek_codepoint): Instead of handling BOM at every character, handle
      in fill-input instead.
      (maybe_consume_bom, port_clear_stream_start_for_bom_read)
      (port_clear_stream_start_for_bom_write): New helpers.
      (scm_fill_input): Slurp a BOM if needed.
      (scm_i_write): Clear the start-of-stream-for-bom-write flag.
      (scm_lfwrite): Write a BOM if needed.
---
 libguile/ports-internal.h |   10 +-
 libguile/ports.c          |  307 ++++++++++++++++++++++++++-------------------
 libguile/print.c          |   18 +--
 3 files changed, 184 insertions(+), 151 deletions(-)

diff --git a/libguile/ports-internal.h b/libguile/ports-internal.h
index 526337d..689e61c 100644
--- a/libguile/ports-internal.h
+++ b/libguile/ports-internal.h
@@ -210,6 +210,13 @@ typedef enum scm_port_encoding_mode scm_t_port_encoding_mode;
    cause finalizers to be registered.  */
 struct scm_iconv_descriptors
 {
+  /* This is the same as pt->encoding, except if pt->encoding is UTF-16
+     or UTF-32, in which case this is UTF-16LE or a similar
+     byte-order-specialized version of UTF-16 or UTF-32.  We don't re-set
+     pt->encoding because being just plain UTF-16 or UTF-32 has an
+     additional meaning, being that we should consume and produce byte
+     order marker codepoints as appropriate. */
+  SCM precise_encoding;
   /* input/output iconv conversion descriptors */
   void *input_cd;
   void *output_cd;
@@ -237,7 +244,6 @@ typedef enum scm_t_port_rw_active {
   SCM_PORT_WRITE = 2
 } scm_t_port_rw_active;
 
-SCM_INTERNAL scm_t_iconv_descriptors *
-scm_i_port_iconv_descriptors (SCM port, scm_t_port_rw_active mode);
+SCM_INTERNAL scm_t_iconv_descriptors * scm_i_port_iconv_descriptors (SCM port);
 
 #endif
diff --git a/libguile/ports.c b/libguile/ports.c
index da1af2f..e8c79bc 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1028,63 +1028,12 @@ scm_i_set_default_port_conversion_strategy (SCM sym)
   scm_fluid_set_x (SCM_VARIABLE_REF (default_conversion_strategy_var), sym);
 }
 
-/* If the next LEN bytes from PORT are equal to those in BYTES, then
-   return 1, else return 0.  Leave the port position unchanged.  */
-static int
-looking_at_bytes (SCM port, const unsigned char *bytes, int len)
-{
-  scm_t_port *pt = SCM_PTAB_ENTRY (port);
-  int i = 0;
-
-  while (i < len && scm_peek_byte_or_eof (port) == bytes[i])
-    {
-      scm_port_buffer_did_take (pt->read_buf, 1);
-      i++;
-    }
-  scm_unget_bytes (bytes, i, port);
-  return (i == len);
-}
-
 static const unsigned char scm_utf8_bom[3]    = {0xEF, 0xBB, 0xBF};
 static const unsigned char scm_utf16be_bom[2] = {0xFE, 0xFF};
 static const unsigned char scm_utf16le_bom[2] = {0xFF, 0xFE};
 static const unsigned char scm_utf32be_bom[4] = {0x00, 0x00, 0xFE, 0xFF};
 static const unsigned char scm_utf32le_bom[4] = {0xFF, 0xFE, 0x00, 0x00};
 
-/* Decide what byte order to use for a UTF-16 port.  Return "UTF-16BE"
-   or "UTF-16LE".  MODE must be either SCM_PORT_READ or SCM_PORT_WRITE,
-   and specifies which operation is about to be done.  The MODE
-   determines how we will decide the byte order.  We deliberately avoid
-   reading from the port unless the user is about to do so.  If the user
-   is about to read, then we look for a BOM, and if present, we use it
-   to determine the byte order.  Otherwise we choose big endian, as
-   recommended by the Unicode Standard.  Note that the BOM (if any) is
-   not consumed here.  */
-static SCM
-decide_utf16_encoding (SCM port, scm_t_port_rw_active mode)
-{
-  if (mode == SCM_PORT_READ
-      && SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read
-      && looking_at_bytes (port, scm_utf16le_bom, sizeof scm_utf16le_bom))
-    return sym_UTF_16LE;
-  else
-    return sym_UTF_16BE;
-}
-
-/* Decide what byte order to use for a UTF-32 port.  Return "UTF-32BE"
-   or "UTF-32LE".  See the comment above 'decide_utf16_encoding' for
-   details.  */
-static SCM
-decide_utf32_encoding (SCM port, scm_t_port_rw_active mode)
-{
-  if (mode == SCM_PORT_READ
-      && SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read
-      && looking_at_bytes (port, scm_utf32le_bom, sizeof scm_utf32le_bom))
-    return sym_UTF_32LE;
-  else
-    return sym_UTF_32BE;
-}
-
 static void
 finalize_iconv_descriptors (void *ptr, void *data)
 {
@@ -1092,8 +1041,9 @@ finalize_iconv_descriptors (void *ptr, void *data)
 }
 
 static scm_t_iconv_descriptors *
-open_iconv_descriptors (const char *encoding, int reading, int writing)
+open_iconv_descriptors (SCM precise_encoding, int reading, int writing)
 {
+  const char *encoding;
   scm_t_iconv_descriptors *id;
   iconv_t input_cd, output_cd;
   size_t i;
@@ -1101,6 +1051,7 @@ open_iconv_descriptors (const char *encoding, int reading, int writing)
   input_cd = (iconv_t) -1;
   output_cd = (iconv_t) -1;
 
+  encoding = scm_i_symbol_chars (precise_encoding);
   for (i = 0; encoding[i]; i++)
     if (encoding[i] > 127)
       goto invalid_encoding;
@@ -1138,6 +1089,7 @@ open_iconv_descriptors (const char *encoding, int reading, int writing)
     }
 
   id = scm_gc_malloc_pointerless (sizeof (*id), "iconv descriptors");
+  id->precise_encoding = precise_encoding;
   id->input_cd = input_cd;
   id->output_cd = output_cd;
 
@@ -1147,13 +1099,9 @@ open_iconv_descriptors (const char *encoding, int reading, int writing)
   return id;
 
  invalid_encoding:
-  {
-    SCM err;
-    err = scm_from_latin1_string (encoding);
-    scm_misc_error ("open_iconv_descriptors",
-                   "invalid or unknown character encoding ~s",
-                   scm_list_1 (err));
-  }
+  scm_misc_error ("open_iconv_descriptors",
+                  "invalid or unknown character encoding ~s",
+                  scm_list_1 (precise_encoding));
 }
 
 static void
@@ -1167,30 +1115,35 @@ close_iconv_descriptors (scm_t_iconv_descriptors *id)
   id->output_cd = (void *) -1;
 }
 
+static void
+prepare_iconv_descriptors (SCM port, SCM encoding)
+{
+  scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
+  scm_t_iconv_descriptors *desc = pti->iconv_descriptors;
+
+  /* If the specified encoding is UTF-16 or UTF-32, then default to
+     big-endian byte order.  This fallback isn't necessary if you read
+     on the port before writing to it, as the read will sniff the BOM if
+     any and specialize the encoding; see the manual.  */
+  if (scm_is_eq (encoding, sym_UTF_16))
+    encoding = sym_UTF_16BE;
+  else if (scm_is_eq (encoding, sym_UTF_32))
+    encoding = sym_UTF_32BE;
+
+  if (desc && scm_is_eq (desc->precise_encoding, encoding))
+    return;
+
+  pti->iconv_descriptors = open_iconv_descriptors
+    (encoding, SCM_INPUT_PORT_P (port), SCM_OUTPUT_PORT_P (port));
+}
+
 scm_t_iconv_descriptors *
-scm_i_port_iconv_descriptors (SCM port, scm_t_port_rw_active mode)
+scm_i_port_iconv_descriptors (SCM port)
 {
-  scm_t_port *pt = SCM_PTAB_ENTRY (port);
   scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
 
   if (!pti->iconv_descriptors)
-    {
-      SCM precise_encoding;
-
-      /* If the specified encoding is UTF-16 or UTF-32, then make
-         that more precise by deciding what byte order to use. */
-      if (scm_is_eq (pt->encoding, sym_UTF_16))
-        precise_encoding = decide_utf16_encoding (port, mode);
-      else if (scm_is_eq (pt->encoding, sym_UTF_32))
-        precise_encoding = decide_utf32_encoding (port, mode);
-      else
-        precise_encoding = pt->encoding;
-
-      pti->iconv_descriptors =
-        open_iconv_descriptors (scm_i_symbol_chars (precise_encoding),
-                                SCM_INPUT_PORT_P (port),
-                                SCM_OUTPUT_PORT_P (port));
-    }
+    prepare_iconv_descriptors (port, SCM_PTAB_ENTRY (port)->encoding);
 
   return pti->iconv_descriptors;
 }
@@ -1450,6 +1403,14 @@ scm_i_read_bytes (SCM port, SCM dst, size_t start, size_t count)
   return filled;
 }
 
+/* In text mode, we will slurp a BOM from the beginning of a UTF-8,
+   UTF-16, or UTF-32 stream, and write one at the beginning of a UTF-16
+   or UTF-32 stream.  In binary mode, we won't.  The mode depends on the
+   caller. */
+enum bom_io_mode { BOM_IO_TEXT, BOM_IO_BINARY };
+static size_t port_clear_stream_start_for_bom_read (SCM, enum bom_io_mode);
+static void port_clear_stream_start_for_bom_write (SCM, enum bom_io_mode);
+
 /* Used by an application to read arbitrary number of bytes from an SCM
    port.  Same semantics as libc read, except that scm_c_read_bytes only
    returns less than SIZE bytes if at end-of-file.
@@ -1472,6 +1433,8 @@ scm_c_read_bytes (SCM port, SCM dst, size_t start, size_t count)
   if (pt->rw_random)
     scm_flush (port);
 
+  port_clear_stream_start_for_bom_read (port, BOM_IO_BINARY);
+
   /* Take bytes first from the port's read buffer. */
   {
     size_t did_read = scm_port_buffer_take (read_buf, dst_ptr, to_read);
@@ -1553,50 +1516,6 @@ scm_c_read (SCM port, void *buffer, size_t size)
 }
 #undef FUNC_NAME
 
-static int
-port_maybe_consume_initial_byte_order_mark (SCM port, scm_t_wchar codepoint,
-                                            size_t len)
-{
-  scm_t_port *pt = SCM_PTAB_ENTRY (port);
-  scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
-
-  if (!pti->at_stream_start_for_bom_read) return 0;
-
-  /* Record that we're no longer at stream start. */
-  pti->at_stream_start_for_bom_read = 0;
-  if (pt->rw_random)
-    pti->at_stream_start_for_bom_write = 0;
-
-  if (codepoint != SCM_UNICODE_BOM) return 0;
-
-  /* If we just read a BOM in an encoding that recognizes them, then
-     silently consume it. */
-  if (scm_is_eq (pt->encoding, sym_UTF_8)
-      || scm_is_eq (pt->encoding, sym_UTF_16)
-      || scm_is_eq (pt->encoding, sym_UTF_32))
-    {
-      scm_port_buffer_did_take (pt->read_buf, len);
-      return 1;
-    }
-
-  return 0;
-}
-
-SCM_DEFINE (scm_port_maybe_consume_initial_byte_order_mark,
-            "port-maybe-consume-initial-byte-order-mark", 3, 0, 0,
-            (SCM port, SCM codepoint, SCM len),
-            "")
-#define FUNC_NAME s_scm_port_maybe_consume_initial_byte_order_mark
-{
-  SCM_VALIDATE_PORT (1, port);
-  return scm_from_bool
-    (port_maybe_consume_initial_byte_order_mark
-     (port,
-      SCM_CHARP (codepoint) ? SCM_CHAR (codepoint) : EOF,
-      scm_to_size_t (len)));
-}
-#undef FUNC_NAME
-
 /* Update the line and column number of PORT after consumption of C.  */
 static inline void
 update_port_lf (scm_t_wchar c, SCM port)
@@ -1790,8 +1709,6 @@ peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
   scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
   size_t input_size = 0;
 
-  id = scm_i_port_iconv_descriptors (port, SCM_PORT_READ);
-
   for (;;)
     {
       SCM read_buf;
@@ -1799,6 +1716,8 @@ peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
       size_t input_left, output_left, done;
 
       read_buf = scm_fill_input (port, input_size + 1);
+      id = scm_i_port_iconv_descriptors (port);
+
       if (scm_port_buffer_can_take (read_buf) <= input_size)
        {
           if (input_size == 0)
@@ -1867,12 +1786,7 @@ peek_codepoint (SCM port, scm_t_wchar *codepoint, size_t *len)
   else
     err = peek_iconv_codepoint (port, codepoint, len);
 
-  if (SCM_LIKELY (err == 0))
-    {
-      if (port_maybe_consume_initial_byte_order_mark (port, *codepoint, *len))
-        return peek_codepoint (port, codepoint, len);
-    }
-  else if (scm_is_eq (pt->conversion_strategy, sym_substitute))
+  if (err != 0 && scm_is_eq (pt->conversion_strategy, sym_substitute))
     {
       *codepoint = '?';
       err = 0;
@@ -2352,16 +2266,138 @@ scm_flush (SCM port)
     scm_i_write (port, buf);
 }
 
+/* Return number of bytes consumed, or zero if no BOM was consumed.  */
+static size_t
+maybe_consume_bom (SCM port, const unsigned char *bom, size_t bom_len)
+{
+  SCM read_buf;
+  const scm_t_uint8 *buf;
+
+  if (peek_byte_or_eof (port) != bom[0])
+    return 0;
+
+  /* Make sure there's enough space in the buffer for a BOM.  Now that
+     we matched the first byte, we know we're going to have to read this
+     many bytes anyway.  */
+  read_buf = scm_fill_input (port, bom_len);
+  buf = scm_port_buffer_take_pointer (read_buf);
+
+  if (scm_port_buffer_can_take (read_buf) < bom_len)
+    return 0;
+
+  if (memcmp (buf, bom, bom_len) != 0)
+    return 0;
+
+  scm_port_buffer_did_take (read_buf, bom_len);
+  return bom_len;
+}
+
+static size_t
+port_clear_stream_start_for_bom_read (SCM port, enum bom_io_mode io_mode)
+{
+  scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
+  scm_t_port *pt;
+
+  if (!pti->at_stream_start_for_bom_read)
+    return 0;
+
+  /* Maybe slurp off a byte-order marker.  */
+  pt = SCM_PTAB_ENTRY (port);
+  pti->at_stream_start_for_bom_read = 0;
+  if (pt->rw_random)
+    pti->at_stream_start_for_bom_write = 0;
+
+  if (io_mode == BOM_IO_BINARY)
+    return 0;
+
+  if (scm_is_eq (pt->encoding, sym_UTF_8))
+    return maybe_consume_bom (port, scm_utf8_bom, sizeof (scm_utf8_bom));
+
+  if (scm_is_eq (pt->encoding, sym_UTF_16))
+    {
+      if (maybe_consume_bom (port, scm_utf16le_bom, sizeof (scm_utf16le_bom)))
+        {
+          prepare_iconv_descriptors (port, sym_UTF_16LE);
+          return 2;
+        }
+      if (maybe_consume_bom (port, scm_utf16be_bom, sizeof (scm_utf16be_bom)))
+        {
+          prepare_iconv_descriptors (port, sym_UTF_16BE);
+          return 2;
+        }
+      /* Big-endian by default.  */
+      prepare_iconv_descriptors (port, sym_UTF_16BE);
+      return 0;
+    }
+
+  if (scm_is_eq (pt->encoding, sym_UTF_32))
+    {
+      if (maybe_consume_bom (port, scm_utf32le_bom, sizeof (scm_utf32le_bom)))
+        {
+          /* A little-endian BOM was found.  */
+          prepare_iconv_descriptors (port, sym_UTF_32LE);
+          return 4;
+        }
+      if (maybe_consume_bom (port, scm_utf32be_bom, sizeof (scm_utf32be_bom)))
+        {
+          prepare_iconv_descriptors (port, sym_UTF_32BE);
+          return 4;
+        }
+      /* Big-endian by default.  */
+      prepare_iconv_descriptors (port, sym_UTF_32BE);
+      return 0;
+    }
+
+  return 0;
+}
+
+static void
+port_clear_stream_start_for_bom_write (SCM port, enum bom_io_mode io_mode)
+{
+  scm_t_port *pt = SCM_PTAB_ENTRY (port);
+  scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
+
+  if (!pti->at_stream_start_for_bom_write)
+    return;
+
+  /* Record that we're no longer at stream start.  */
+  pti->at_stream_start_for_bom_write = 0;
+  if (pt->rw_random)
+    pti->at_stream_start_for_bom_read = 0;
+
+  /* Write a BOM if appropriate.  */
+  if (scm_is_eq (pt->encoding, sym_UTF_16))
+    {
+      scm_t_iconv_descriptors *id = scm_i_port_iconv_descriptors (port);
+      if (scm_is_eq (id->precise_encoding, sym_UTF_16LE))
+        scm_c_write (port, scm_utf16le_bom, sizeof (scm_utf16le_bom));
+      else
+        scm_c_write (port, scm_utf16be_bom, sizeof (scm_utf16be_bom));
+    }
+  else if (scm_is_eq (pt->encoding, sym_UTF_32))
+    {
+      scm_t_iconv_descriptors *id = scm_i_port_iconv_descriptors (port);
+      if (scm_is_eq (id->precise_encoding, sym_UTF_32LE))
+        scm_c_write (port, scm_utf32le_bom, sizeof (scm_utf32le_bom));
+      else
+        scm_c_write (port, scm_utf32be_bom, sizeof (scm_utf32be_bom));
+    }
+}
+
 SCM
 scm_fill_input (SCM port, size_t minimum_size)
 {
   scm_t_port *pt = SCM_PTAB_ENTRY (port);
-  SCM read_buf = pt->read_buf;
-  size_t buffered = scm_port_buffer_can_take (read_buf);
+  SCM read_buf;
+  size_t buffered;
 
   if (minimum_size == 0)
     minimum_size = 1;
 
+  port_clear_stream_start_for_bom_read (port, BOM_IO_TEXT);
+  read_buf = pt->read_buf;
+  buffered = scm_port_buffer_can_take (read_buf);
+
   if (buffered >= minimum_size
       || scm_is_true (scm_port_buffer_has_eof_p (read_buf)))
     return read_buf;
@@ -2525,6 +2561,8 @@ scm_i_write (SCM port, SCM buf)
 {
   size_t start, count;
 
+  port_clear_stream_start_for_bom_write (port, BOM_IO_BINARY);
+
   /* Update cursors before attempting to write, assuming that I/O errors
      are sticky.  That way if the write throws an error, causing the
      computation to abort, and possibly causing the port to be collected
@@ -2633,6 +2671,11 @@ scm_lfwrite (const char *ptr, size_t size, SCM port)
 {
   int saved_line;
 
+  if (size == 0)
+    return;
+
+  port_clear_stream_start_for_bom_write (port, BOM_IO_TEXT);
+
   scm_c_write (port, ptr, size);
 
   saved_line = SCM_LINUM (port);
diff --git a/libguile/print.c b/libguile/print.c
index 0b2d193..8dcd375 100644
--- a/libguile/print.c
+++ b/libguile/print.c
@@ -1027,24 +1027,8 @@ display_string_using_iconv (const void *str, int narrow_p, size_t len,
 {
   size_t printed;
   scm_t_iconv_descriptors *id;
-  scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
 
-  id = scm_i_port_iconv_descriptors (port, SCM_PORT_WRITE);
-
-  if (SCM_UNLIKELY (pti->at_stream_start_for_bom_write && len > 0))
-    {
-      scm_t_port *pt = SCM_PTAB_ENTRY (port);
-
-      /* Record that we're no longer at stream start.  */
-      pti->at_stream_start_for_bom_write = 0;
-      if (pt->rw_random)
-        pti->at_stream_start_for_bom_read = 0;
-
-      /* Write a BOM if appropriate.  */
-      if (SCM_UNLIKELY (scm_is_eq (pt->encoding, sym_UTF_16)
-                        || scm_is_eq (pt->encoding, sym_UTF_32)))
-        display_character (SCM_UNICODE_BOM, port, iconveh_error);
-    }
+  id = scm_i_port_iconv_descriptors (port);
 
   printed = 0;
 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]