[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH] mbrtowc: work around glibc bug#19932
From: |
Paul Eggert |
Subject: |
[PATCH] mbrtowc: work around glibc bug#19932 |
Date: |
Sat, 9 Apr 2016 01:29:33 -0700 |
From: Paul Eggert <address@hidden>
Fix mbrtowc so that it never returns -1 in the C locale,
as this conflicts with a future version of POSIX
http://austingroupbugs.net/view.php?id=663#c2738
and causes problems with GNU grep: http://bugs.gnu.org/23234
See glibc bug 19932:
https://sourceware.org/bugzilla/show_bug.cgi?id=19932
* doc/posix-functions/mbrlen.texi (mbrlen):
* doc/posix-functions/mbrtowc.texi (mbrtowc):
Document the glibc bug.
* lib/mbrtowc.c [C_LOCALE_MAYBE_EILSEQ]:
Include hard-locale.h, locale.h.
(rpl_mbrtowc): Work around the C_LOCALE_MAYBE_EILSEQ bug,
if the bug is possible.
* m4/mbrtowc.m4 (gl_MBRTOWC_C_LOCALE): New macro.
(gl_FUNC_MBRTOWC): Use it, and define C_LOCALE_MAYBE_EILSEQ as needed.
* modules/hard-locale (License): Now LGPLv2+, for mbrtowc.
* modules/mbrtowc (Depends-on): Add hard-locale.
* modules/mbrtowc-tests (Files, TESTS): Add tests/test-mbrtowc5.sh.
* tests/test-mbrtowc.c (main): Test for bug fix if arg is '5'.
* tests/test-mbrtowc5.sh: New file.
---
ChangeLog | 24 ++++++++++++++++++
doc/posix-functions/mbrlen.texi | 4 +++
doc/posix-functions/mbrtowc.texi | 4 +++
lib/mbrtowc.c | 54 ++++++++++++++++++++--------------------
m4/mbrtowc.m4 | 50 ++++++++++++++++++++++++++++++++++++-
modules/hard-locale | 2 +-
modules/mbrtowc | 1 +
modules/mbrtowc-tests | 3 ++-
tests/test-mbrtowc.c | 11 +++++++-
tests/test-mbrtowc5.sh | 6 +++++
10 files changed, 128 insertions(+), 31 deletions(-)
create mode 100755 tests/test-mbrtowc5.sh
diff --git a/ChangeLog b/ChangeLog
index 980cfaa..77f1be9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,27 @@
+2016-04-09 Paul Eggert <address@hidden>
+
+ mbrtowc: work around glibc bug#19932
+ Fix mbrtowc so that it never returns -1 in the C locale,
+ as this conflicts with a future version of POSIX
+ http://austingroupbugs.net/view.php?id=663#c2738
+ and causes problems with GNU grep: http://bugs.gnu.org/23234
+ See glibc bug 19932:
+ https://sourceware.org/bugzilla/show_bug.cgi?id=19932
+ * doc/posix-functions/mbrlen.texi (mbrlen):
+ * doc/posix-functions/mbrtowc.texi (mbrtowc):
+ Document the glibc bug.
+ * lib/mbrtowc.c [C_LOCALE_MAYBE_EILSEQ]:
+ Include hard-locale.h, locale.h.
+ (rpl_mbrtowc): Work around the C_LOCALE_MAYBE_EILSEQ bug,
+ if the bug is possible.
+ * m4/mbrtowc.m4 (gl_MBRTOWC_C_LOCALE): New macro.
+ (gl_FUNC_MBRTOWC): Use it, and define C_LOCALE_MAYBE_EILSEQ as needed.
+ * modules/hard-locale (License): Now LGPLv2+, for mbrtowc.
+ * modules/mbrtowc (Depends-on): Add hard-locale.
+ * modules/mbrtowc-tests (Files, TESTS): Add tests/test-mbrtowc5.sh.
+ * tests/test-mbrtowc.c (main): Test for bug fix if arg is '5'.
+ * tests/test-mbrtowc5.sh: New file.
+
2016-04-03 Pedro Alves <address@hidden>
stdint: detect good enough pre-C++11 stdint.h in C++ mode
diff --git a/doc/posix-functions/mbrlen.texi b/doc/posix-functions/mbrlen.texi
index 7db550e..3f1d472 100644
--- a/doc/posix-functions/mbrlen.texi
+++ b/doc/posix-functions/mbrlen.texi
@@ -12,6 +12,10 @@ Portability problems fixed by Gnulib:
This function is missing on some platforms:
Minix 3.1.8, HP-UX 11.00, IRIX 6.5, Solaris 2.6, mingw, Interix 3.5.
@item
+In the C or POSIX locales, this function can return @code{(size_t) -1}
+and set @code{errno} to @code{EILSEQ}:
+glibc 2.23.
+@item
This function returns 0 instead of @code{(size_t) -2} when the input
is empty:
glibc 2.19.
diff --git a/doc/posix-functions/mbrtowc.texi b/doc/posix-functions/mbrtowc.texi
index 7c7f5fd..ad5c671 100644
--- a/doc/posix-functions/mbrtowc.texi
+++ b/doc/posix-functions/mbrtowc.texi
@@ -12,6 +12,10 @@ Portability problems fixed by Gnulib:
This function is missing on some platforms:
Minix 3.1.8, HP-UX 11.00, IRIX 6.5, Solaris 2.6, mingw, Interix 3.5.
@item
+In the C or POSIX locales, this function can return @code{(size_t) -1}
+and set @code{errno} to @code{EILSEQ}:
+glibc 2.23.
+@item
This function returns 0 instead of @code{(size_t) -2} when the input
is empty:
glibc 2.19.
diff --git a/lib/mbrtowc.c b/lib/mbrtowc.c
index 864e006..cdd874b 100644
--- a/lib/mbrtowc.c
+++ b/lib/mbrtowc.c
@@ -20,6 +20,11 @@
/* Specification. */
#include <wchar.h>
+#if C_LOCALE_MAYBE_EILSEQ
+# include "hard-locale.h"
+# include <locale.h>
+#endif
+
#if GNULIB_defined_mbstate_t
/* Implement mbrtowc() on top of mbtowc(). */
@@ -328,6 +333,9 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
+ size_t ret;
+ wchar_t wc;
+
# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
if (s == NULL)
{
@@ -342,6 +350,9 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
return (size_t) -2;
# endif
+ if (! pwc)
+ pwc = &wc;
+
# if MBRTOWC_RETVAL_BUG
{
static mbstate_t internal_state;
@@ -357,8 +368,7 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
size_t count = 0;
for (; n > 0; s++, n--)
{
- wchar_t wc;
- size_t ret = mbrtowc (&wc, s, 1, ps);
+ ret = mbrtowc (&wc, s, 1, ps);
if (ret == (size_t)(-1))
return (size_t)(-1);
@@ -366,8 +376,7 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
if (ret != (size_t)(-2))
{
/* The multibyte character has been completed. */
- if (pwc != NULL)
- *pwc = wc;
+ *pwc = wc;
return (wc == 0 ? 0 : count);
}
}
@@ -376,32 +385,23 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
}
# endif
-# if MBRTOWC_NUL_RETVAL_BUG
- {
- wchar_t wc;
- size_t ret = mbrtowc (&wc, s, n, ps);
+ ret = mbrtowc (pwc, s, n, ps);
- if (ret != (size_t)(-1) && ret != (size_t)(-2))
- {
- if (pwc != NULL)
- *pwc = wc;
- if (wc == 0)
- ret = 0;
- }
- return ret;
- }
-# else
- {
-# if MBRTOWC_NULL_ARG1_BUG
- wchar_t dummy;
-
- if (pwc == NULL)
- pwc = &dummy;
-# endif
+# if MBRTOWC_NUL_RETVAL_BUG
+ if (ret < (size_t) -2 && !*pwc)
+ return 0;
+# endif
- return mbrtowc (pwc, s, n, ps);
- }
+# if C_LOCALE_MAYBE_EILSEQ
+ if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
+ {
+ unsigned char uc = *s;
+ *pwc = uc;
+ return 1;
+ }
# endif
+
+ return ret;
}
#endif
diff --git a/m4/mbrtowc.m4 b/m4/mbrtowc.m4
index e8c7eeb..d370fcc 100644
--- a/m4/mbrtowc.m4
+++ b/m4/mbrtowc.m4
@@ -1,4 +1,4 @@
-# mbrtowc.m4 serial 26 -*- coding: utf-8 -*-
+# mbrtowc.m4 serial 27 -*- coding: utf-8 -*-
dnl Copyright (C) 2001-2002, 2004-2005, 2008-2016 Free Software Foundation,
dnl Inc.
dnl This file is free software; the Free Software Foundation
@@ -40,6 +40,7 @@ AC_DEFUN([gl_FUNC_MBRTOWC],
gl_MBRTOWC_RETVAL
gl_MBRTOWC_NUL_RETVAL
gl_MBRTOWC_EMPTY_INPUT
+ gl_MBRTOWC_C_LOCALE
case "$gl_cv_func_mbrtowc_null_arg1" in
*yes) ;;
*) AC_DEFINE([MBRTOWC_NULL_ARG1_BUG], [1],
@@ -76,6 +77,13 @@ AC_DEFUN([gl_FUNC_MBRTOWC],
REPLACE_MBRTOWC=1
;;
esac
+ case $gl_cv_C_locale_sans_EILSEQ in
+ *yes) ;;
+ *) AC_DEFINE([C_LOCALE_MAYBE_EILSEQ], [1],
+ [Define to 1 if the C locale may have encoding errors.])
+ REPLACE_MBRTOWC=1
+ ;;
+ esac
fi
fi
])
@@ -577,6 +585,46 @@ changequote([,])dnl
])
])
+dnl Test whether mbrtowc reports encoding errors in the C locale.
+dnl Although POSIX was never intended to allow this, the GNU C Library
+dnl and other implementations do it. See:
+dnl https://sourceware.org/bugzilla/show_bug.cgi?id=19932
+
+AC_DEFUN([gl_MBRTOWC_C_LOCALE],
+[
+ AC_CACHE_CHECK([whether the C locale is free of encoding errors],
+ [gl_cv_C_locale_sans_EILSEQ],
+ [
+ dnl Initial guess, used when cross-compiling or when no suitable locale
+ dnl is present.
+ gl_cv_C_locale_sans_EILSEQ="guessing no"
+
+ AC_RUN_IFELSE(
+ [AC_LANG_PROGRAM(
+ [[#include <limits.h>
+ #include <locale.h>
+ #include <wchar.h>
+ ]], [[
+ int i;
+ char *locale = setlocale (LC_ALL, "C");
+ if (! locale)
+ return 1;
+ for (i = CHAR_MIN; i <= CHAR_MAX; i++)
+ {
+ char c = i;
+ wchar_t wc;
+ mbstate_t mbs = { 0, };
+ size_t ss = mbrtowc (&wc, &c, 1, &mbs);
+ if (1 < ss)
+ return 1;
+ }
+ return 0;
+ ]])],
+ [gl_cv_C_locale_sans_EILSEQ=yes],
+ [gl_cv_C_locale_sans_EILSEQ=no],
+ [:])])
+])
+
# Prerequisites of lib/mbrtowc.c.
AC_DEFUN([gl_PREREQ_MBRTOWC], [
:
diff --git a/modules/hard-locale b/modules/hard-locale
index 88dff8e..76c6edd 100644
--- a/modules/hard-locale
+++ b/modules/hard-locale
@@ -20,7 +20,7 @@ Include:
"hard-locale.h"
License:
-GPL
+LGPLv2+
Maintainer:
Paul Eggert
diff --git a/modules/mbrtowc b/modules/mbrtowc
index 4e90b67..bd951ae 100644
--- a/modules/mbrtowc
+++ b/modules/mbrtowc
@@ -13,6 +13,7 @@ m4/codeset.m4
Depends-on:
wchar
extensions
+hard-locale [test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1]
mbsinit [test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1]
localcharset [test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1]
streq [test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1]
diff --git a/modules/mbrtowc-tests b/modules/mbrtowc-tests
index bbd2213..fe948c3 100644
--- a/modules/mbrtowc-tests
+++ b/modules/mbrtowc-tests
@@ -3,6 +3,7 @@ tests/test-mbrtowc1.sh
tests/test-mbrtowc2.sh
tests/test-mbrtowc3.sh
tests/test-mbrtowc4.sh
+tests/test-mbrtowc5.sh
tests/test-mbrtowc.c
tests/test-mbrtowc-w32-1.sh
tests/test-mbrtowc-w32-2.sh
@@ -31,6 +32,7 @@ gt_LOCALE_ZH_CN
Makefile.am:
TESTS += \
test-mbrtowc1.sh test-mbrtowc2.sh test-mbrtowc3.sh test-mbrtowc4.sh \
+ test-mbrtowc5.sh \
test-mbrtowc-w32-1.sh test-mbrtowc-w32-2.sh test-mbrtowc-w32-3.sh \
test-mbrtowc-w32-4.sh test-mbrtowc-w32-5.sh
TESTS_ENVIRONMENT += \
@@ -39,4 +41,3 @@ TESTS_ENVIRONMENT += \
LOCALE_JA='@LOCALE_JA@' \
LOCALE_ZH_CN='@LOCALE_ZH_CN@'
check_PROGRAMS += test-mbrtowc test-mbrtowc-w32
-
diff --git a/tests/test-mbrtowc.c b/tests/test-mbrtowc.c
index 831836e..f7fed6a 100644
--- a/tests/test-mbrtowc.c
+++ b/tests/test-mbrtowc.c
@@ -72,6 +72,10 @@ main (int argc, char *argv[])
for (c = 0; c < 0x100; c++)
switch (c)
{
+ default:
+ if (! (c && 1 < argc && argv[1][0] == '5'))
+ break;
+ /* Fall through. */
case '\t': case '\v': case '\f':
case ' ': case '!': case '"': case '#': case '%':
case '&': case '\'': case '(': case ')': case '*':
@@ -93,7 +97,8 @@ main (int argc, char *argv[])
case 'p': case 'q': case 'r': case 's': case 't':
case 'u': case 'v': case 'w': case 'x': case 'y':
case 'z': case '{': case '|': case '}': case '~':
- /* c is in the ISO C "basic character set". */
+ /* c is in the ISO C "basic character set", or argv[1] starts
+ with '5' so we are testing all nonnull bytes. */
buf[0] = c;
wc = (wchar_t) 0xBADFACE;
ret = mbrtowc (&wc, buf, 1, &state);
@@ -334,6 +339,10 @@ main (int argc, char *argv[])
ASSERT (mbsinit (&state));
}
return 0;
+
+ case '5':
+ /* C locale; tested above. */
+ return 0;
}
return 1;
diff --git a/tests/test-mbrtowc5.sh b/tests/test-mbrtowc5.sh
new file mode 100755
index 0000000..c10b228
--- /dev/null
+++ b/tests/test-mbrtowc5.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+# Test whether the POSIX locale has encoding errors.
+LC_ALL=C \
+./test-mbrtowc${EXEEXT} 5 || exit
+LC_ALL=POSIX \
+./test-mbrtowc${EXEEXT} 5
--
2.5.5
- [PATCH] mbrtowc: work around glibc bug#19932,
Paul Eggert <=