[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.1-39-g7be1705
From: |
Ludovic Courtès |
Subject: |
[Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.1-39-g7be1705 |
Date: |
Sat, 07 May 2011 20:49:54 +0000 |
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU Guile".
http://git.savannah.gnu.org/cgit/guile.git/commit/?id=7be1705dbda377780335ecbcbfce04de523f2671
The branch, stable-2.0 has been updated
via 7be1705dbda377780335ecbcbfce04de523f2671 (commit)
via 452c5ad912baee9fa64298b6a8905681557ad3ae (commit)
from 040dfa6f3727342a9596b4cb0625f0e171c3d612 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit 7be1705dbda377780335ecbcbfce04de523f2671
Author: Ludovic Courtès <address@hidden>
Date: Sat May 7 22:46:38 2011 +0200
Fix `get_utf8_codepoint' to not consume valid starting bytes.
Thanks to Mark H. Weaver for pointing this out.
* libguile/ports.c (CONSUME_PEEKED_BYTE): New macro.
(get_utf8_codepoint): New variable `pt'. Use
`scm_peek_byte_or_eof'/`CONSUME_PEEKED_BYTE' pairs instead of
`scm_get_byte_or_eof'.
* test-suite/tests/ports.test ("string ports")[#xc2 #x41 #x42, #xe0 #xa0
#x41 #x42, #xf0 #x88 #x88 #x88]: Fix to conform to Unicode 6.0.0.
[#xe0 #x88 #x88]: Remove test.
[#xf0 #x80 #x80 #x41]: New test.
commit 452c5ad912baee9fa64298b6a8905681557ad3ae
Author: Ludovic Courtès <address@hidden>
Date: Sat May 7 22:41:32 2011 +0200
Add `scm_peek_byte_or_eof'.
* libguile/inline.h (scm_get_byte_or_eof): Add `SCM_UNLIKELY' for EOF.
(scm_peek_byte_or_eof): New function.
* libguile/r6rs-ports.c (scm_lookahead_u8): Use `scm_peek_byte_or_eof'.
-----------------------------------------------------------------------
Summary of changes:
libguile/inline.h | 34 +++++++++++++++-
libguile/ports.c | 92 +++++++++++++++++++-----------------------
libguile/r6rs-ports.c | 7 +--
test-suite/tests/ports.test | 31 ++++++++++-----
4 files changed, 97 insertions(+), 67 deletions(-)
diff --git a/libguile/inline.h b/libguile/inline.h
index 1eae2e4..51a4db0 100644
--- a/libguile/inline.h
+++ b/libguile/inline.h
@@ -3,7 +3,8 @@
#ifndef SCM_INLINE_H
#define SCM_INLINE_H
-/* Copyright (C) 2001, 2002, 2003, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
+/* Copyright (C) 2001, 2002, 2003, 2004, 2006, 2008, 2009, 2010,
+ * 2011 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
@@ -98,6 +99,7 @@ SCM_API int scm_is_pair (SCM x);
SCM_API int scm_is_string (SCM x);
SCM_API int scm_get_byte_or_eof (SCM port);
+SCM_API int scm_peek_byte_or_eof (SCM port);
SCM_API void scm_putc (char c, SCM port);
SCM_API void scm_puts (const char *str_data, SCM port);
@@ -362,7 +364,7 @@ scm_get_byte_or_eof (SCM port)
if (pt->read_pos >= pt->read_end)
{
- if (scm_fill_input (port) == EOF)
+ if (SCM_UNLIKELY (scm_fill_input (port) == EOF))
return EOF;
}
@@ -371,6 +373,34 @@ scm_get_byte_or_eof (SCM port)
return c;
}
+/* Like `scm_get_byte_or_eof' but does not change PORT's `read_pos'. */
+#ifndef SCM_INLINE_C_INCLUDING_INLINE_H
+SCM_C_EXTERN_INLINE
+#endif
+int
+scm_peek_byte_or_eof (SCM port)
+{
+ int c;
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+
+ if (pt->rw_active == SCM_PORT_WRITE)
+ /* may be marginally faster than calling scm_flush. */
+ scm_ptobs[SCM_PTOBNUM (port)].flush (port);
+
+ if (pt->rw_random)
+ pt->rw_active = SCM_PORT_READ;
+
+ if (pt->read_pos >= pt->read_end)
+ {
+ if (SCM_UNLIKELY (scm_fill_input (port) == EOF))
+ return EOF;
+ }
+
+ c = *pt->read_pos;
+
+ return c;
+}
+
#ifndef SCM_INLINE_C_INCLUDING_INLINE_H
SCM_C_EXTERN_INLINE
#endif
diff --git a/libguile/ports.c b/libguile/ports.c
index 767e086..926149b 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1127,10 +1127,14 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
#define ASSERT_NOT_EOF(b) \
if (SCM_UNLIKELY ((b) == EOF)) \
goto invalid_seq
+#define CONSUME_PEEKED_BYTE() \
+ pt->read_pos++
int byte;
+ scm_t_port *pt;
*len = 0;
+ pt = SCM_PTAB_ENTRY (port);
byte = scm_get_byte_or_eof (port);
if (byte == EOF)
@@ -1148,49 +1152,44 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
else if (buf[0] >= 0xc2 && buf[0] <= 0xdf)
{
/* 2-byte form. */
- byte = scm_get_byte_or_eof (port);
+ byte = scm_peek_byte_or_eof (port);
ASSERT_NOT_EOF (byte);
- buf[1] = (scm_t_uint8) byte;
- *len = 2;
-
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
goto invalid_seq;
+ CONSUME_PEEKED_BYTE ();
+ buf[1] = (scm_t_uint8) byte;
+ *len = 2;
+
*codepoint = ((scm_t_wchar) buf[0] & 0x1f) << 6UL
| (buf[1] & 0x3f);
}
else if ((buf[0] & 0xf0) == 0xe0)
{
/* 3-byte form. */
- byte = scm_get_byte_or_eof (port);
- if (SCM_UNLIKELY (byte == EOF))
- goto invalid_seq;
-
- buf[1] = (scm_t_uint8) byte;
- *len = 2;
+ byte = scm_peek_byte_or_eof (port);
+ ASSERT_NOT_EOF (byte);
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80
|| (buf[0] == 0xe0 && byte < 0xa0)
|| (buf[0] == 0xed && byte > 0x9f)))
- {
- /* Swallow the 3rd byte. */
- byte = scm_get_byte_or_eof (port);
- ASSERT_NOT_EOF (byte);
- *len = 3, buf[2] = byte;
- goto invalid_seq;
- }
+ goto invalid_seq;
+ CONSUME_PEEKED_BYTE ();
+ buf[1] = (scm_t_uint8) byte;
+ *len = 2;
- byte = scm_get_byte_or_eof (port);
+ byte = scm_peek_byte_or_eof (port);
ASSERT_NOT_EOF (byte);
- buf[2] = (scm_t_uint8) byte;
- *len = 3;
-
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
goto invalid_seq;
+ CONSUME_PEEKED_BYTE ();
+ buf[2] = (scm_t_uint8) byte;
+ *len = 3;
+
*codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL
| ((scm_t_wchar) buf[1] & 0x3f) << 6UL
| (buf[2] & 0x3f);
@@ -1198,51 +1197,38 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
else if (buf[0] >= 0xf0 && buf[0] <= 0xf4)
{
/* 4-byte form. */
- byte = scm_get_byte_or_eof (port);
+ byte = scm_peek_byte_or_eof (port);
ASSERT_NOT_EOF (byte);
- buf[1] = (scm_t_uint8) byte;
- *len = 2;
-
if (SCM_UNLIKELY (((byte & 0xc0) != 0x80)
|| (buf[0] == 0xf0 && byte < 0x90)
|| (buf[0] == 0xf4 && byte > 0x8f)))
- {
- /* Swallow the 3rd and 4th bytes. */
- byte = scm_get_byte_or_eof (port);
- ASSERT_NOT_EOF (byte);
- *len = 3, buf[2] = byte;
-
- byte = scm_get_byte_or_eof (port);
- ASSERT_NOT_EOF (byte);
- *len = 4, buf[3] = byte;
- goto invalid_seq;
- }
+ goto invalid_seq;
- byte = scm_get_byte_or_eof (port);
+ CONSUME_PEEKED_BYTE ();
+ buf[1] = (scm_t_uint8) byte;
+ *len = 2;
+
+ byte = scm_peek_byte_or_eof (port);
ASSERT_NOT_EOF (byte);
+ if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+ goto invalid_seq;
+
+ CONSUME_PEEKED_BYTE ();
buf[2] = (scm_t_uint8) byte;
*len = 3;
- if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
- {
- /* Swallow the 4th byte. */
- byte = scm_get_byte_or_eof (port);
- ASSERT_NOT_EOF (byte);
- *len = 4, buf[3] = byte;
- goto invalid_seq;
- }
-
- byte = scm_get_byte_or_eof (port);
+ byte = scm_peek_byte_or_eof (port);
ASSERT_NOT_EOF (byte);
- buf[3] = (scm_t_uint8) byte;
- *len = 4;
-
if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
goto invalid_seq;
+ CONSUME_PEEKED_BYTE ();
+ buf[3] = (scm_t_uint8) byte;
+ *len = 4;
+
*codepoint = ((scm_t_wchar) buf[0] & 0x07) << 18UL
| ((scm_t_wchar) buf[1] & 0x3f) << 12UL
| ((scm_t_wchar) buf[2] & 0x3f) << 6UL
@@ -1254,8 +1240,14 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
return 0;
invalid_seq:
+ /* Here we could choose to consume the faulty byte when it's not a
+ valid starting byte, but it's not a requirement. What Section 3.9
+ of Unicode 6.0.0 mandates, though, is to not consume a byte that
+ would otherwise be a valid starting byte. */
+
return EILSEQ;
+#undef CONSUME_PEEKED_BYTE
#undef ASSERT_NOT_EOF
}
diff --git a/libguile/r6rs-ports.c b/libguile/r6rs-ports.c
index b9d5282..f45dfc1 100644
--- a/libguile/r6rs-ports.c
+++ b/libguile/r6rs-ports.c
@@ -460,14 +460,11 @@ SCM_DEFINE (scm_lookahead_u8, "lookahead-u8", 1, 0, 0,
SCM_VALIDATE_BINARY_INPUT_PORT (1, port);
- u8 = scm_get_byte_or_eof (port);
+ u8 = scm_peek_byte_or_eof (port);
if (u8 == EOF)
result = SCM_EOF_VAL;
else
- {
- scm_unget_byte (u8, port);
- result = SCM_I_MAKINUM ((scm_t_uint8) u8);
- }
+ result = SCM_I_MAKINUM ((scm_t_uint8) u8);
return result;
}
diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test
index c1ee7d1..d4a333f 100644
--- a/test-suite/tests/ports.test
+++ b/test-suite/tests/ports.test
@@ -572,29 +572,40 @@
eof))
(test-decoding-error (#xc2 #x41 #x42) "UTF-8"
- ;; FIXME: This is the behavior of glibc/libiconv but it does not
- ;; conform to the Unicode 6.0.0 recommendation: according to it,
- ;; the #\A should not be swallowed (Section 3.9 reads:
- ;; "If the converter encounters an ill-formed UTF-8 code unit
- ;; sequence which starts with a valid first byte, but which does
- ;; not continue with valid successor bytes (see Table 3-7), it
- ;; must not consume the successor bytes".)
-
- (error ;; 41: should be in the 80..BF range
+ ;; Section 3.9 of Unicode 6.0.0 reads:
+ ;; "If the converter encounters an ill-formed UTF-8 code unit
+ ;; sequence which starts with a valid first byte, but which does
+ ;; not continue with valid successor bytes (see Table 3-7), it
+ ;; must not consume the successor bytes".
+ ;; Glibc/libiconv do not conform to it and instead swallow the
+ ;; #x41. This example appears literally in Section 3.9.
+ (error ;; 41: invalid successor
+ #\A ;; 41: valid starting byte
#\B
eof))
- (test-decoding-error (#xe0 #x88 #x88) "UTF-8"
+ (test-decoding-error (#xf0 #x80 #x80 #x41) "UTF-8"
+ ;; According to Unicode 6.0.0, Section 3.9, "the only formal
+ ;; requirement mandated by Unicode conformance for a converter is
+ ;; that the <41> be processed and correctly interpreted as
+ ;; <U+0041>".
(error ;; 2nd byte should be in the A0..BF range
+ error ;; 80: not a valid starting byte
+ error ;; 80: not a valid starting byte
+ #\A
eof))
(test-decoding-error (#xe0 #xa0 #x41 #x42) "UTF-8"
(error ;; 3rd byte should be in the 80..BF range
+ #\A
#\B
eof))
(test-decoding-error (#xf0 #x88 #x88 #x88) "UTF-8"
(error ;; 2nd byte should be in the 90..BF range
+ error ;; 88: not a valid starting byte
+ error ;; 88: not a valid starting byte
+ error ;; 88: not a valid starting byte
eof))))
(with-test-prefix "call-with-output-string"
hooks/post-receive
--
GNU Guile
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.1-39-g7be1705,
Ludovic Courtès <=