guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.7-71-g1260fd


From: Andy Wingo
Subject: [Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.7-71-g1260fd0
Date: Wed, 30 Jan 2013 12:57:00 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU Guile".

http://git.savannah.gnu.org/cgit/guile.git/commit/?id=1260fd0b2c4ce1d0d7e7b17df924c245f67f9058

The branch, stable-2.0 has been updated
       via  1260fd0b2c4ce1d0d7e7b17df924c245f67f9058 (commit)
       via  7e0f26eb0d5a9316daad680f62168beffd050632 (commit)
       via  b2cb557d75e4daf8c7c8cd43313f4cc51d9a3f1b (commit)
      from  a8fa310b0493cd2e88a7d7f08b1ee3183a81b455 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 1260fd0b2c4ce1d0d7e7b17df924c245f67f9058
Author: Andy Wingo <address@hidden>
Date:   Wed Jan 30 13:55:59 2013 +0100

    quick fix to ssax.scm
    
    * module/sxml/ssax.scm: Fix previous commit.

commit 7e0f26eb0d5a9316daad680f62168beffd050632
Author: Andy Wingo <address@hidden>
Date:   Wed Jan 30 13:52:47 2013 +0100

    fix ssax:warn to work as intended
    
    * module/sxml/ssax.scm (ssax:warn): Fix to make more sense.

commit b2cb557d75e4daf8c7c8cd43313f4cc51d9a3f1b
Author: Andy Wingo <address@hidden>
Date:   Wed Jan 30 10:17:25 2013 +0100

    detect and consume byte-order marks for textual ports
    
    * libguile/ports.h:
    * libguile/ports.c (scm_consume_byte_order_mark): New procedure.
    
    * libguile/fports.c (scm_open_file): Call consume-byte-order-mark if we
      are opening a file in "r" mode.
    
    * libguile/read.c (scm_i_scan_for_encoding): Don't do anything about
      byte-order marks.
    
    * libguile/load.c (scm_primitive_load): Add a note about the duplicate
      encoding scan.
    
    * test-suite/tests/filesys.test: Add tests for UTF-8, UTF-16BE, and
      UTF-16LE BOM handling.

-----------------------------------------------------------------------

Summary of changes:
 libguile/fports.c             |   35 ++++++++++-------
 libguile/load.c               |    3 +
 libguile/ports.c              |   85 ++++++++++++++++++++++++++++++++++++++++-
 libguile/ports.h              |    3 +-
 libguile/read.c               |   14 +------
 module/sxml/ssax.scm          |    9 +++-
 test-suite/tests/filesys.test |   59 ++++++++++++++++++++++++++++-
 7 files changed, 175 insertions(+), 33 deletions(-)

diff --git a/libguile/fports.c b/libguile/fports.c
index 10cf671..fbc0530 100644
--- a/libguile/fports.c
+++ b/libguile/fports.c
@@ -1,5 +1,5 @@
 /* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
- *   2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, 
Inc.
+ *   2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software 
Foundation, Inc.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public License
@@ -399,7 +399,7 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
 #define FUNC_NAME s_scm_open_file
 {
   SCM port;
-  int fdes, flags = 0, use_encoding = 1;
+  int fdes, flags = 0, scan_for_encoding = 0, consume_bom = 0, binary = 0;
   unsigned int retries;
   char *file, *md, *ptr;
 
@@ -415,6 +415,8 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
     {
     case 'r':
       flags |= O_RDONLY;
+      consume_bom = 1;
+      scan_for_encoding = 1;
       break;
     case 'w':
       flags |= O_WRONLY | O_CREAT | O_TRUNC;
@@ -432,9 +434,12 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
        {
        case '+':
          flags = (flags & ~(O_RDONLY | O_WRONLY)) | O_RDWR;
+          consume_bom = 0;
          break;
        case 'b':
-         use_encoding = 0;
+         scan_for_encoding = 0;
+          consume_bom = 0;
+          binary = 1;
 #if defined (O_BINARY)
          flags |= O_BINARY;
 #endif
@@ -473,21 +478,21 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
   port = scm_i_fdes_to_port (fdes, scm_i_mode_bits (mode),
                              fport_canonicalize_filename (filename));
 
-  if (use_encoding)
-    {
-      /* If this file has a coding declaration, use that as the port
-        encoding.  */
-      if (SCM_INPUT_PORT_P (port))
-       {
-         char *enc = scm_i_scan_for_encoding (port);
-         if (enc != NULL)
-           scm_i_set_port_encoding_x (port, enc);
-       }
-    }
-  else
+  if (consume_bom) 
+    scm_consume_byte_order_mark (port);
+
+  if (binary)
     /* If this is a binary file, use the binary-friendly ISO-8859-1
        encoding.  */
     scm_i_set_port_encoding_x (port, NULL);
+  else if (scan_for_encoding)
+    /* If this is an input port and the file has a coding declaration,
+       use that as the port encoding.  */
+    {
+      char *enc = scm_i_scan_for_encoding (port);
+      if (enc != NULL)
+        scm_i_set_port_encoding_x (port, enc);
+    }
 
   scm_dynwind_end ();
 
diff --git a/libguile/load.c b/libguile/load.c
index 84b6705..476461c 100644
--- a/libguile/load.c
+++ b/libguile/load.c
@@ -106,6 +106,9 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0,
     scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE);
     scm_i_dynwind_current_load_port (port);
 
+    /* FIXME: For better or for worse, scm_open_file already scans the
+       file for an encoding.  This scans again; necessary for this
+       logic, but unnecessary overall.  */
     encoding = scm_i_scan_for_encoding (port);
     if (encoding)
       scm_i_set_port_encoding_x (port, encoding);
diff --git a/libguile/ports.c b/libguile/ports.c
index 55808e2..9b1be9b 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1,5 +1,5 @@
 /* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
- *   2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ *   2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, 
Inc.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public License
@@ -2153,6 +2153,89 @@ SCM_DEFINE (scm_set_port_filename_x, 
"set-port-filename!", 2, 0, 0,
 }
 #undef FUNC_NAME
 
+SCM_DEFINE (scm_consume_byte_order_mark, "consume-byte-order-mark", 1, 0, 0,
+            (SCM port),
+            "Peek ahead in @var{port} for a byte-order mark (\\uFEFF) 
encoded\n"
+            "in UTF-8 or in UTF-16.  If found, consume the byte-order mark\n"
+            "and set the port to the indicated encoding.\n"
+            "\n"
+            "As a special case, if the port's encoding is already UTF-16LE\n"
+            "or UTF-16BE (as opposed to UTF-16), we consider that the user\n"
+            "has already asked for an explicit byte order.  In this case no\n"
+            "scan is performed, and the byte-order mark (if any) is left in\n"
+            "the port.\n"
+            "\n"
+            "Return @code{#t} if a byte-order mark was consumed, and\n"
+            "@code{#f} otherwise.")
+#define FUNC_NAME s_scm_consume_byte_order_mark
+{
+  scm_t_port *pt;
+  const char *enc;
+
+  SCM_VALIDATE_PORT (1, port);
+
+  pt = SCM_PTAB_ENTRY (port);
+  enc = pt->encoding;
+
+  if (enc && strcasecmp (enc, "UTF-16BE") == 0)
+    return SCM_BOOL_F;
+
+  if (enc && strcasecmp (enc, "UTF-16LE") == 0)
+    return SCM_BOOL_F;
+
+  switch (scm_peek_byte_or_eof (port))
+    {
+    case 0xEF:
+      scm_get_byte_or_eof (port);
+      switch (scm_peek_byte_or_eof (port))
+        {
+        case 0xBB:
+          scm_get_byte_or_eof (port);
+          switch (scm_peek_byte_or_eof (port))
+            {
+            case 0xBF:
+              scm_get_byte_or_eof (port);
+              scm_i_set_port_encoding_x (port, "UTF-8");
+              return SCM_BOOL_T;
+            default:
+              scm_unget_byte (0xBB, port);
+              scm_unget_byte (0xEF, port);
+              return SCM_BOOL_F;
+            }
+        default:
+          scm_unget_byte (0xEF, port);
+          return SCM_BOOL_F;
+        }
+    case 0xFE:
+      scm_get_byte_or_eof (port);
+      switch (scm_peek_byte_or_eof (port))
+        {
+        case 0xFF:
+          scm_get_byte_or_eof (port);
+          scm_i_set_port_encoding_x (port, "UTF-16BE");
+          return SCM_BOOL_T;
+        default:
+          scm_unget_byte (0xFE, port);
+          return SCM_BOOL_F;
+        }
+    case 0xFF:
+      scm_get_byte_or_eof (port);
+      switch (scm_peek_byte_or_eof (port))
+        {
+        case 0xFE:
+          scm_get_byte_or_eof (port);
+          scm_i_set_port_encoding_x (port, "UTF-16LE");
+          return SCM_BOOL_T;
+        default:
+          scm_unget_byte (0xFF, port);
+          return SCM_BOOL_F;
+        }
+    default:
+      return SCM_BOOL_F;
+    }
+}
+#undef FUNC_NAME
+
 /* A fluid specifying the default encoding for newly created ports.  If it is
    a string, that is the encoding.  If it is #f, it is in the "native"
    (Latin-1) encoding.  */
diff --git a/libguile/ports.h b/libguile/ports.h
index d4d59b7..2f32345 100644
--- a/libguile/ports.h
+++ b/libguile/ports.h
@@ -4,7 +4,7 @@
 #define SCM_PORTS_H
 
 /* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
- *   2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ *   2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public License
@@ -303,6 +303,7 @@ SCM_API SCM scm_port_column (SCM port);
 SCM_API SCM scm_set_port_column_x (SCM port, SCM line);
 SCM_API SCM scm_port_filename (SCM port);
 SCM_API SCM scm_set_port_filename_x (SCM port, SCM filename);
+SCM_API SCM scm_consume_byte_order_mark (SCM port);
 SCM_INTERNAL const char *scm_i_default_port_encoding (void);
 SCM_INTERNAL void scm_i_set_default_port_encoding (const char *);
 SCM_INTERNAL void scm_i_set_port_encoding_x (SCM port, const char *str);
diff --git a/libguile/read.c b/libguile/read.c
index 222891b..a8f7744 100644
--- a/libguile/read.c
+++ b/libguile/read.c
@@ -1,5 +1,5 @@
 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
- *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ *   2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
  * 
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public License
@@ -1985,7 +1985,6 @@ scm_i_scan_for_encoding (SCM port)
   char header[SCM_ENCODING_SEARCH_SIZE+1];
   size_t bytes_read, encoding_length, i;
   char *encoding = NULL;
-  int utf8_bom = 0;
   char *pos, *encoding_start;
   int in_comment;
 
@@ -2027,13 +2026,9 @@ scm_i_scan_for_encoding (SCM port)
 
       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
       header[bytes_read] = '\0';
-      scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
+      scm_seek (port, scm_from_int (-bytes_read), scm_from_int (SEEK_CUR));
     }
 
-  if (bytes_read > 3 
-      && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
-    utf8_bom = 1;
-
   /* search past "coding[:=]" */
   pos = header;
   while (1)
@@ -2102,11 +2097,6 @@ scm_i_scan_for_encoding (SCM port)
     /* This wasn't in a comment */
     return NULL;
 
-  if (utf8_bom && strcmp(encoding, "UTF-8"))
-    scm_misc_error (NULL,
-                   "the port input declares the encoding ~s but is encoded as 
UTF-8",
-                   scm_list_1 (scm_from_locale_string (encoding)));
-
   return encoding;
 }
 
diff --git a/module/sxml/ssax.scm b/module/sxml/ssax.scm
index 474247b..f750c93 100644
--- a/module/sxml/ssax.scm
+++ b/module/sxml/ssax.scm
@@ -180,9 +180,12 @@
   (parameterize ((current-ssax-error-port port))
     (thunk)))
 
-(define (ssax:warn port msg . args)
-  (format (current-ssax-error-port)
-          ";;; SSAX warning: ~a ~a\n" msg args))
+(define (ssax:warn port . args)
+  (with-output-to-port (current-ssax-error-port)
+    (lambda ()
+      (display ";;; SSAX warning: ")
+      (for-each display args)
+      (newline))))
 
 (define (ucscode->string codepoint)
   (string (integer->char codepoint)))
diff --git a/test-suite/tests/filesys.test b/test-suite/tests/filesys.test
index a6bfb6e..8bd974d 100644
--- a/test-suite/tests/filesys.test
+++ b/test-suite/tests/filesys.test
@@ -1,6 +1,6 @@
 ;;;; filesys.test --- test file system functions -*- scheme -*-
 ;;;; 
-;;;; Copyright (C) 2004, 2006 Free Software Foundation, Inc.
+;;;; Copyright (C) 2004, 2006, 2013 Free Software Foundation, Inc.
 ;;;; 
 ;;;; This library is free software; you can redistribute it and/or
 ;;;; modify it under the terms of the GNU Lesser General Public
@@ -17,6 +17,8 @@
 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
USA
 
 (define-module (test-suite test-filesys)
+  #:use-module (ice-9 rdelim)
+  #:use-module (ice-9 binary-ports)
   #:use-module (test-suite lib)
   #:use-module (test-suite guile-test))
 
@@ -127,3 +129,58 @@
 
 (delete-file (test-file))
 (delete-file (test-symlink))
+
+(let ((s "\ufeffHello, world!"))
+  (define* (test-encoding encoding #:optional (ambient "ISO-8859-1"))
+    (with-fluids ((%default-port-encoding ambient))
+      (let* ((bytes (catch 'misc-error
+                      (lambda ()
+                        (call-with-values open-bytevector-output-port
+                          (lambda (port get-bytevector)
+                            (set-port-encoding! port encoding)
+                            (display s port)
+                            (get-bytevector))))
+                      (lambda args
+                        (throw 'unresolved))))
+             (name (string-copy "myfile-XXXXXX"))
+             (port (mkstemp! name)))
+        (put-bytevector port bytes)
+        (close-port port)
+        (let ((contents (call-with-input-file name read-string)))
+          (delete-file name)
+          contents))))
+
+  (pass-if "UTF-8"
+    (equal? (test-encoding "UTF-8")
+            "Hello, world!"))
+
+  (pass-if "UTF-16BE"
+    (equal? (test-encoding "UTF-16BE")
+            "Hello, world!"))
+
+  (pass-if "UTF-16LE"
+    (equal? (test-encoding "UTF-16LE")
+            "Hello, world!"))
+
+  (pass-if "UTF-8 (ambient)"
+    (equal? (test-encoding "UTF-8" "UTF-8")
+            "Hello, world!"))
+
+  (pass-if "UTF-8 (UTF-16 ambient)"
+    (equal? (test-encoding "UTF-8" "UTF-16")
+            "Hello, world!"))
+
+  ;; Unicode 6.2 section 16.8:
+  ;;
+  ;; For compatibility with versions of the Unicode Standard prior to
+  ;; Version 3.2, the code point U+FEFF has the word-joining semantics
+  ;; of zero width no-break space when it is not used as a BOM.  [...]
+  ;;
+  ;; Where the byte order is explicitly specified, such as in UTF-16BE
+  ;; or UTF-16LE, then all U+FEFF characters -- even at the very
+  ;; beginning of the text -- are to be interpreted as zero width
+  ;; no-break spaces.
+  ;;
+  (pass-if "UTF-16LE (ambient)"
+    (equal? (test-encoding "UTF-16LE" "UTF-16LE")
+            "\ufeffHello, world!")))


hooks/post-receive
-- 
GNU Guile



reply via email to

[Prev in Thread] Current Thread [Next in Thread]