[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Guile-commits] 01/03: Handle BOM around fill/flush instead of peek/put
From: |
Andy Wingo |
Subject: |
[Guile-commits] 01/03: Handle BOM around fill/flush instead of peek/put |
Date: |
Thu, 05 May 2016 20:59:10 +0000 |
wingo pushed a commit to branch wip-port-refactor
in repository guile.
commit 86267af8b30be0901b6bb29cbdc730c0a5844bfd
Author: Andy Wingo <address@hidden>
Date: Thu May 5 13:00:19 2016 +0200
Handle BOM around fill/flush instead of peek/put
* libguile/print.c (display_string_using_iconv): Remove BOM handling;
this is now handled by scm_lfwrite.
* libguile/ports.c (open_iconv_descriptors): Refactor to take encoding
as a symbol.
(prepare_iconv_descriptors): New helper.
(scm_i_port_iconv_descriptors): Remove scm_t_port_rw_active argument,
and don't sniff UTF-16/UTF-32 byte orders here. Instead BOM handlers
will call prepare_iconv_descriptors.
(scm_c_read_bytes): Call new port_clear_stream_start_for_bom_read
helper.
(port_maybe_consume_initial_byte_order_mark)
(scm_port_maybe_consume_initial_byte_order_mark): Remove. Leaves
Scheme %peek-char broken but it's unused currently so that's OK.
(peek_iconv_codepoint): Fetch iconv descriptors after doing fill-input
because it's fill-input that will sniff the BOM.
(peek_codepoint): Instead of handling BOM at every character, handle
in fill-input instead.
(maybe_consume_bom, port_clear_stream_start_for_bom_read)
(port_clear_stream_start_for_bom_write): New helpers.
(scm_fill_input): Slurp a BOM if needed.
(scm_i_write): Clear the start-of-stream-for-bom-write flag.
(scm_lfwrite): Write a BOM if needed.
---
libguile/ports-internal.h | 10 +-
libguile/ports.c | 307 ++++++++++++++++++++++++++-------------------
libguile/print.c | 18 +--
3 files changed, 184 insertions(+), 151 deletions(-)
diff --git a/libguile/ports-internal.h b/libguile/ports-internal.h
index 526337d..689e61c 100644
--- a/libguile/ports-internal.h
+++ b/libguile/ports-internal.h
@@ -210,6 +210,13 @@ typedef enum scm_port_encoding_mode
scm_t_port_encoding_mode;
cause finalizers to be registered. */
struct scm_iconv_descriptors
{
+ /* This is the same as pt->encoding, except if pt->encoding is UTF-16
+ or UTF-32, in which case this is UTF-16LE or a similar
+ byte-order-specialed version of UTF-16 or UTF-32. We don't re-set
+ pt->encoding because being just plain UTF-16 or UTF-32 has an
+ additional meaning, being that we should consume and produce byte
+ order marker codepoints as appropriate. */
+ SCM precise_encoding;
/* input/output iconv conversion descriptors */
void *input_cd;
void *output_cd;
@@ -237,7 +244,6 @@ typedef enum scm_t_port_rw_active {
SCM_PORT_WRITE = 2
} scm_t_port_rw_active;
-SCM_INTERNAL scm_t_iconv_descriptors *
-scm_i_port_iconv_descriptors (SCM port, scm_t_port_rw_active mode);
+SCM_INTERNAL scm_t_iconv_descriptors * scm_i_port_iconv_descriptors (SCM port);
#endif
diff --git a/libguile/ports.c b/libguile/ports.c
index da1af2f..e8c79bc 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1028,63 +1028,12 @@ scm_i_set_default_port_conversion_strategy (SCM sym)
scm_fluid_set_x (SCM_VARIABLE_REF (default_conversion_strategy_var), sym);
}
-/* If the next LEN bytes from PORT are equal to those in BYTES, then
- return 1, else return 0. Leave the port position unchanged. */
-static int
-looking_at_bytes (SCM port, const unsigned char *bytes, int len)
-{
- scm_t_port *pt = SCM_PTAB_ENTRY (port);
- int i = 0;
-
- while (i < len && scm_peek_byte_or_eof (port) == bytes[i])
- {
- scm_port_buffer_did_take (pt->read_buf, 1);
- i++;
- }
- scm_unget_bytes (bytes, i, port);
- return (i == len);
-}
-
static const unsigned char scm_utf8_bom[3] = {0xEF, 0xBB, 0xBF};
static const unsigned char scm_utf16be_bom[2] = {0xFE, 0xFF};
static const unsigned char scm_utf16le_bom[2] = {0xFF, 0xFE};
static const unsigned char scm_utf32be_bom[4] = {0x00, 0x00, 0xFE, 0xFF};
static const unsigned char scm_utf32le_bom[4] = {0xFF, 0xFE, 0x00, 0x00};
-/* Decide what byte order to use for a UTF-16 port. Return "UTF-16BE"
- or "UTF-16LE". MODE must be either SCM_PORT_READ or SCM_PORT_WRITE,
- and specifies which operation is about to be done. The MODE
- determines how we will decide the byte order. We deliberately avoid
- reading from the port unless the user is about to do so. If the user
- is about to read, then we look for a BOM, and if present, we use it
- to determine the byte order. Otherwise we choose big endian, as
- recommended by the Unicode Standard. Note that the BOM (if any) is
- not consumed here. */
-static SCM
-decide_utf16_encoding (SCM port, scm_t_port_rw_active mode)
-{
- if (mode == SCM_PORT_READ
- && SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read
- && looking_at_bytes (port, scm_utf16le_bom, sizeof scm_utf16le_bom))
- return sym_UTF_16LE;
- else
- return sym_UTF_16BE;
-}
-
-/* Decide what byte order to use for a UTF-32 port. Return "UTF-32BE"
- or "UTF-32LE". See the comment above 'decide_utf16_encoding' for
- details. */
-static SCM
-decide_utf32_encoding (SCM port, scm_t_port_rw_active mode)
-{
- if (mode == SCM_PORT_READ
- && SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read
- && looking_at_bytes (port, scm_utf32le_bom, sizeof scm_utf32le_bom))
- return sym_UTF_32LE;
- else
- return sym_UTF_32BE;
-}
-
static void
finalize_iconv_descriptors (void *ptr, void *data)
{
@@ -1092,8 +1041,9 @@ finalize_iconv_descriptors (void *ptr, void *data)
}
static scm_t_iconv_descriptors *
-open_iconv_descriptors (const char *encoding, int reading, int writing)
+open_iconv_descriptors (SCM precise_encoding, int reading, int writing)
{
+ const char *encoding;
scm_t_iconv_descriptors *id;
iconv_t input_cd, output_cd;
size_t i;
@@ -1101,6 +1051,7 @@ open_iconv_descriptors (const char *encoding, int
reading, int writing)
input_cd = (iconv_t) -1;
output_cd = (iconv_t) -1;
+ encoding = scm_i_symbol_chars (precise_encoding);
for (i = 0; encoding[i]; i++)
if (encoding[i] > 127)
goto invalid_encoding;
@@ -1138,6 +1089,7 @@ open_iconv_descriptors (const char *encoding, int
reading, int writing)
}
id = scm_gc_malloc_pointerless (sizeof (*id), "iconv descriptors");
+ id->precise_encoding = precise_encoding;
id->input_cd = input_cd;
id->output_cd = output_cd;
@@ -1147,13 +1099,9 @@ open_iconv_descriptors (const char *encoding, int
reading, int writing)
return id;
invalid_encoding:
- {
- SCM err;
- err = scm_from_latin1_string (encoding);
- scm_misc_error ("open_iconv_descriptors",
- "invalid or unknown character encoding ~s",
- scm_list_1 (err));
- }
+ scm_misc_error ("open_iconv_descriptors",
+ "invalid or unknown character encoding ~s",
+ scm_list_1 (precise_encoding));
}
static void
@@ -1167,30 +1115,35 @@ close_iconv_descriptors (scm_t_iconv_descriptors *id)
id->output_cd = (void *) -1;
}
+static void
+prepare_iconv_descriptors (SCM port, SCM encoding)
+{
+ scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
+ scm_t_iconv_descriptors *desc = pti->iconv_descriptors;
+
+ /* If the specified encoding is UTF-16 or UTF-32, then default to
+ big-endian byte order. This fallback isn't necessary if you read
+ on the port before writing to it, as the read will sniff the BOM if
+ any and specialize the encoding; see the manual. */
+ if (scm_is_eq (encoding, sym_UTF_16))
+ encoding = sym_UTF_16BE;
+ else if (scm_is_eq (encoding, sym_UTF_32))
+ encoding = sym_UTF_32BE;
+
+ if (desc && scm_is_eq (desc->precise_encoding, encoding))
+ return;
+
+ pti->iconv_descriptors = open_iconv_descriptors
+ (encoding, SCM_INPUT_PORT_P (port), SCM_OUTPUT_PORT_P (port));
+}
+
scm_t_iconv_descriptors *
-scm_i_port_iconv_descriptors (SCM port, scm_t_port_rw_active mode)
+scm_i_port_iconv_descriptors (SCM port)
{
- scm_t_port *pt = SCM_PTAB_ENTRY (port);
scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
if (!pti->iconv_descriptors)
- {
- SCM precise_encoding;
-
- /* If the specified encoding is UTF-16 or UTF-32, then make
- that more precise by deciding what byte order to use. */
- if (scm_is_eq (pt->encoding, sym_UTF_16))
- precise_encoding = decide_utf16_encoding (port, mode);
- else if (scm_is_eq (pt->encoding, sym_UTF_32))
- precise_encoding = decide_utf32_encoding (port, mode);
- else
- precise_encoding = pt->encoding;
-
- pti->iconv_descriptors =
- open_iconv_descriptors (scm_i_symbol_chars (precise_encoding),
- SCM_INPUT_PORT_P (port),
- SCM_OUTPUT_PORT_P (port));
- }
+ prepare_iconv_descriptors (port, SCM_PTAB_ENTRY (port)->encoding);
return pti->iconv_descriptors;
}
@@ -1450,6 +1403,14 @@ scm_i_read_bytes (SCM port, SCM dst, size_t start,
size_t count)
return filled;
}
+/* In text mode, we will slurp a BOM from the beginning of a UTF-8,
+ UTF-16, or UTF-32 stream, and write one at the beginning of a UTF-16
+ or UTF-32 stream. In binary mode, we won't. The mode depends on the
+ caller. */
+enum bom_io_mode { BOM_IO_TEXT, BOM_IO_BINARY };
+static size_t port_clear_stream_start_for_bom_read (SCM, enum bom_io_mode);
+static void port_clear_stream_start_for_bom_write (SCM, enum bom_io_mode);
+
/* Used by an application to read arbitrary number of bytes from an SCM
port. Same semantics as libc read, except that scm_c_read_bytes only
returns less than SIZE bytes if at end-of-file.
@@ -1472,6 +1433,8 @@ scm_c_read_bytes (SCM port, SCM dst, size_t start, size_t
count)
if (pt->rw_random)
scm_flush (port);
+ port_clear_stream_start_for_bom_read (port, BOM_IO_BINARY);
+
/* Take bytes first from the port's read buffer. */
{
size_t did_read = scm_port_buffer_take (read_buf, dst_ptr, to_read);
@@ -1553,50 +1516,6 @@ scm_c_read (SCM port, void *buffer, size_t size)
}
#undef FUNC_NAME
-static int
-port_maybe_consume_initial_byte_order_mark (SCM port, scm_t_wchar codepoint,
- size_t len)
-{
- scm_t_port *pt = SCM_PTAB_ENTRY (port);
- scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
-
- if (!pti->at_stream_start_for_bom_read) return 0;
-
- /* Record that we're no longer at stream start. */
- pti->at_stream_start_for_bom_read = 0;
- if (pt->rw_random)
- pti->at_stream_start_for_bom_write = 0;
-
- if (codepoint != SCM_UNICODE_BOM) return 0;
-
- /* If we just read a BOM in an encoding that recognizes them, then
- silently consume it. */
- if (scm_is_eq (pt->encoding, sym_UTF_8)
- || scm_is_eq (pt->encoding, sym_UTF_16)
- || scm_is_eq (pt->encoding, sym_UTF_32))
- {
- scm_port_buffer_did_take (pt->read_buf, len);
- return 1;
- }
-
- return 0;
-}
-
-SCM_DEFINE (scm_port_maybe_consume_initial_byte_order_mark,
- "port-maybe-consume-initial-byte-order-mark", 3, 0, 0,
- (SCM port, SCM codepoint, SCM len),
- "")
-#define FUNC_NAME s_scm_port_maybe_consume_initial_byte_order_mark
-{
- SCM_VALIDATE_PORT (1, port);
- return scm_from_bool
- (port_maybe_consume_initial_byte_order_mark
- (port,
- SCM_CHARP (codepoint) ? SCM_CHAR (codepoint) : EOF,
- scm_to_size_t (len)));
-}
-#undef FUNC_NAME
-
/* Update the line and column number of PORT after consumption of C. */
static inline void
update_port_lf (scm_t_wchar c, SCM port)
@@ -1790,8 +1709,6 @@ peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
size_t *len)
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
size_t input_size = 0;
- id = scm_i_port_iconv_descriptors (port, SCM_PORT_READ);
-
for (;;)
{
SCM read_buf;
@@ -1799,6 +1716,8 @@ peek_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
size_t *len)
size_t input_left, output_left, done;
read_buf = scm_fill_input (port, input_size + 1);
+ id = scm_i_port_iconv_descriptors (port);
+
if (scm_port_buffer_can_take (read_buf) <= input_size)
{
if (input_size == 0)
@@ -1867,12 +1786,7 @@ peek_codepoint (SCM port, scm_t_wchar *codepoint, size_t
*len)
else
err = peek_iconv_codepoint (port, codepoint, len);
- if (SCM_LIKELY (err == 0))
- {
- if (port_maybe_consume_initial_byte_order_mark (port, *codepoint, *len))
- return peek_codepoint (port, codepoint, len);
- }
- else if (scm_is_eq (pt->conversion_strategy, sym_substitute))
+ if (err != 0 && scm_is_eq (pt->conversion_strategy, sym_substitute))
{
*codepoint = '?';
err = 0;
@@ -2352,16 +2266,138 @@ scm_flush (SCM port)
scm_i_write (port, buf);
}
+/* Return number of bytes consumed, or zero if no BOM was consumed. */
+static size_t
+maybe_consume_bom (SCM port, const unsigned char *bom, size_t bom_len)
+{
+ SCM read_buf;
+ const scm_t_uint8 *buf;
+
+ if (peek_byte_or_eof (port) != bom[0])
+ return 0;
+
+ /* Make sure there's enough space in the buffer for a BOM. Now that
+ we matched the first byte, we know we're going to have to read this
+ many bytes anyway. */
+ read_buf = scm_fill_input (port, bom_len);
+ buf = scm_port_buffer_take_pointer (read_buf);
+
+ if (scm_port_buffer_can_take (read_buf) < bom_len)
+ return 0;
+
+ if (memcmp (buf, bom, bom_len) != 0)
+ return 0;
+
+ scm_port_buffer_did_take (read_buf, bom_len);
+ return bom_len;
+}
+
+static size_t
+port_clear_stream_start_for_bom_read (SCM port, enum bom_io_mode io_mode)
+{
+ scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
+ scm_t_port *pt;
+
+ if (!pti->at_stream_start_for_bom_read)
+ return 0;
+
+ /* Maybe slurp off a byte-order marker. */
+ pt = SCM_PTAB_ENTRY (port);
+ pti->at_stream_start_for_bom_read = 0;
+ if (pt->rw_random)
+ pti->at_stream_start_for_bom_write = 0;
+
+ if (io_mode == BOM_IO_BINARY)
+ return 0;
+
+ if (scm_is_eq (pt->encoding, sym_UTF_8))
+ return maybe_consume_bom (port, scm_utf8_bom, sizeof (scm_utf8_bom));
+
+ if (scm_is_eq (pt->encoding, sym_UTF_16))
+ {
+ if (maybe_consume_bom (port, scm_utf16le_bom, sizeof (scm_utf16le_bom)))
+ {
+ prepare_iconv_descriptors (port, sym_UTF_16LE);
+ return 2;
+ }
+ if (maybe_consume_bom (port, scm_utf16be_bom, sizeof (scm_utf16be_bom)))
+ {
+ prepare_iconv_descriptors (port, sym_UTF_16BE);
+ return 2;
+ }
+ /* Big-endian by default. */
+ prepare_iconv_descriptors (port, sym_UTF_16BE);
+ return 0;
+ }
+
+ if (scm_is_eq (pt->encoding, sym_UTF_32))
+ {
+ if (maybe_consume_bom (port, scm_utf32le_bom, sizeof (scm_utf32le_bom)))
+ {
+ /* Big-endian by default. */
+ prepare_iconv_descriptors (port, sym_UTF_32LE);
+ return 4;
+ }
+ if (maybe_consume_bom (port, scm_utf32be_bom, sizeof (scm_utf32be_bom)))
+ {
+ prepare_iconv_descriptors (port, sym_UTF_32BE);
+ return 4;
+ }
+ /* Big-endian by default. */
+ prepare_iconv_descriptors (port, sym_UTF_32BE);
+ return 0;
+ }
+
+ return 0;
+}
+
+static void
+port_clear_stream_start_for_bom_write (SCM port, enum bom_io_mode io_mode)
+{
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
+
+ if (!pti->at_stream_start_for_bom_write)
+ return;
+
+ /* Record that we're no longer at stream start. */
+ pti->at_stream_start_for_bom_write = 0;
+ if (pt->rw_random)
+ pti->at_stream_start_for_bom_read = 0;
+
+ /* Write a BOM if appropriate. */
+ if (scm_is_eq (pt->encoding, sym_UTF_16))
+ {
+ scm_t_iconv_descriptors *id = scm_i_port_iconv_descriptors (port);
+ if (scm_is_eq (id->precise_encoding, sym_UTF_16LE))
+ scm_c_write (port, scm_utf16le_bom, sizeof (scm_utf16le_bom));
+ else
+ scm_c_write (port, scm_utf16be_bom, sizeof (scm_utf16be_bom));
+ }
+ else if (scm_is_eq (pt->encoding, sym_UTF_32))
+ {
+ scm_t_iconv_descriptors *id = scm_i_port_iconv_descriptors (port);
+ if (scm_is_eq (id->precise_encoding, sym_UTF_32LE))
+ scm_c_write (port, scm_utf32le_bom, sizeof (scm_utf32le_bom));
+ else
+ scm_c_write (port, scm_utf32be_bom, sizeof (scm_utf32be_bom));
+ }
+}
+
SCM
scm_fill_input (SCM port, size_t minimum_size)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
- SCM read_buf = pt->read_buf;
- size_t buffered = scm_port_buffer_can_take (read_buf);
+ SCM read_buf;
+ size_t buffered;
if (minimum_size == 0)
minimum_size = 1;
+ port_clear_stream_start_for_bom_read (port, BOM_IO_TEXT);
+ read_buf = pt->read_buf;
+ buffered = scm_port_buffer_can_take (read_buf);
+
if (buffered >= minimum_size
|| scm_is_true (scm_port_buffer_has_eof_p (read_buf)))
return read_buf;
@@ -2525,6 +2561,8 @@ scm_i_write (SCM port, SCM buf)
{
size_t start, count;
+ port_clear_stream_start_for_bom_write (port, BOM_IO_BINARY);
+
/* Update cursors before attempting to write, assuming that I/O errors
are sticky. That way if the write throws an error, causing the
computation to abort, and possibly causing the port to be collected
@@ -2633,6 +2671,11 @@ scm_lfwrite (const char *ptr, size_t size, SCM port)
{
int saved_line;
+ if (size == 0)
+ return;
+
+ port_clear_stream_start_for_bom_write (port, BOM_IO_TEXT);
+
scm_c_write (port, ptr, size);
saved_line = SCM_LINUM (port);
diff --git a/libguile/print.c b/libguile/print.c
index 0b2d193..8dcd375 100644
--- a/libguile/print.c
+++ b/libguile/print.c
@@ -1027,24 +1027,8 @@ display_string_using_iconv (const void *str, int
narrow_p, size_t len,
{
size_t printed;
scm_t_iconv_descriptors *id;
- scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
- id = scm_i_port_iconv_descriptors (port, SCM_PORT_WRITE);
-
- if (SCM_UNLIKELY (pti->at_stream_start_for_bom_write && len > 0))
- {
- scm_t_port *pt = SCM_PTAB_ENTRY (port);
-
- /* Record that we're no longer at stream start. */
- pti->at_stream_start_for_bom_write = 0;
- if (pt->rw_random)
- pti->at_stream_start_for_bom_read = 0;
-
- /* Write a BOM if appropriate. */
- if (SCM_UNLIKELY (scm_is_eq (pt->encoding, sym_UTF_16)
- || scm_is_eq (pt->encoding, sym_UTF_32)))
- display_character (SCM_UNICODE_BOM, port, iconveh_error);
- }
+ id = scm_i_port_iconv_descriptors (port);
printed = 0;