emacs-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] Improve error reporting when serializing non-Unicode strings to JSON


From: Philipp Stephani
Subject: [PATCH] Improve error reporting when serializing non-Unicode strings to JSON
Date: Fri, 22 Dec 2017 22:00:31 +0100

* admin/merge-gnulib (GNULIB_MODULES): Add unistr modules.

* lib/Makefile.in (.c.o): Fix output file for files in subdirectories.
(${DEPDIR}/unistr, unistr/u8-check.o): Create missing deps directory.

* src/json.c (json_check_utf8): New helper function.
(lisp_to_json_toplevel_1, lisp_to_json): Use it.  To save a bit of
time, check for invalid UTF-8 strings only after encountering an
error, since Jansson already rejects them.

* test/src/json-tests.el (json-serialize/invalid-unicode): Adapt
expected error symbol.
---
 .gitignore              |   2 +
 admin/merge-gnulib      |   2 +-
 lib/Makefile.in         |   7 +-
 lib/gnulib.mk.in        |  65 ++++-
 lib/unistr.in.h         | 746 ++++++++++++++++++++++++++++++++++++++++++++++++
 lib/unistr/u8-check.c   |  77 +++++
 lib/unitypes.in.h       |  46 +++
 lib/unused-parameter.h  |  36 +++
 m4/gnulib-comp.m4       |  15 +
 m4/inline.m4            |  40 +++
 m4/libunistring-base.m4 | 141 +++++++++
 src/json.c              |  35 ++-
 test/src/json-tests.el  |  10 +-
 13 files changed, 1207 insertions(+), 15 deletions(-)
 create mode 100644 lib/unistr.in.h
 create mode 100644 lib/unistr/u8-check.c
 create mode 100644 lib/unitypes.in.h
 create mode 100644 lib/unused-parameter.h
 create mode 100644 m4/inline.m4
 create mode 100644 m4/libunistring-base.m4

diff --git a/.gitignore b/.gitignore
index 7426082906..9a6f06c33f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -68,6 +68,8 @@ lib/string.h
 lib/sys/
 lib/time.h
 lib/unistd.h
+lib/unistr.h
+lib/unitypes.h
 src/buildobj.h
 src/globals.h
 src/lisp.mk
diff --git a/admin/merge-gnulib b/admin/merge-gnulib
index 4b1dc592b9..bd41f90f1a 100755
--- a/admin/merge-gnulib
+++ b/admin/merge-gnulib
@@ -41,7 +41,7 @@ GNULIB_MODULES=
   sig2str socklen stat-time std-gnu11 stdalign stddef stdio
   stpcpy strtoimax symlink sys_stat sys_time
   tempname time time_r time_rz timegm timer-time timespec-add timespec-sub
-  update-copyright unlocked-io utimens
+  update-copyright unistr/base unistr/u8-check unlocked-io utimens
   vla warnings
 '
 
diff --git a/lib/Makefile.in b/lib/Makefile.in
index 1f5b154f35..7b9294c2e6 100644
--- a/lib/Makefile.in
+++ b/lib/Makefile.in
@@ -86,12 +86,17 @@ libegnu_a_OBJECTS =
 $(libegnu_a_OBJECTS) $(libgnu_a_OBJECTS): $(BUILT_SOURCES)
 
 .c.o:
-       $(AM_V_CC)$(CC) -c $(CPPFLAGS) $(ALL_CFLAGS) $<
+       $(AM_V_CC)$(CC) -c $(CPPFLAGS) $(ALL_CFLAGS) -o $@ $<
 e-%.o: %.c
        $(AM_V_CC)$(CC) -c $(CPPFLAGS) $(ALL_CFLAGS) -Demacs -o $@ $<
 
 all: libgnu.a $(if $(HYBRID_MALLOC),libegnu.a)
 
+unistr/u8-check.o: ${DEPDIR}/unistr
+
+${DEPDIR}/unistr:
+       ${AM_V_at}mkdir -p ${DEPDIR}/unistr
+
 libgnu.a: $(libgnu_a_OBJECTS)
        $(AM_V_at)rm -f $@
        $(AM_V_AR)$(AR) $(ARFLAGS) $@ $(libgnu_a_OBJECTS)
diff --git a/lib/gnulib.mk.in b/lib/gnulib.mk.in
index a7b33ba34e..9917d5ba7f 100644
--- a/lib/gnulib.mk.in
+++ b/lib/gnulib.mk.in
@@ -21,7 +21,7 @@
 # the same distribution terms as the rest of that program.
 #
 # Generated by gnulib-tool.
-# Reproduce by: gnulib-tool --import --lib=libgnu --source-base=lib --m4-base=m4 --doc-base=doc --tests-base=tests --aux-dir=build-aux --avoid=close --avoid=dup --avoid=fchdir --avoid=fstat --avoid=malloc-posix --avoid=msvc-inval --avoid=msvc-nothrow --avoid=openat-die --avoid=opendir --avoid=raise --avoid=save-cwd --avoid=select --avoid=setenv --avoid=sigprocmask --avoid=stat --avoid=stdarg --avoid=stdbool --avoid=threadlib --avoid=tzset --avoid=unsetenv --avoid=utime --avoid=utime-h --gnu-make --makefile-name=gnulib.mk.in --conditional-dependencies --no-libtool --macro-prefix=gl --no-vc-files alloca-opt binary-io byteswap c-ctype c-strcase careadlinkat close-stream count-leading-zeros count-one-bits count-trailing-zeros crypto/md5 crypto/sha1 crypto/sha256 crypto/sha512 d-type diffseq dtoastr dtotimespec dup2 environ execinfo explicit_bzero faccessat fcntl fcntl-h fdatasync fdopendir filemode filevercmp flexmember fstatat fsusage fsync getloadavg getopt-gnu gettime gettimeofday gitlog-to-changelog ignore-value intprops largefile lstat manywarnings memrchr minmax mkostemp mktime nstrftime pipe2 pselect pthread_sigmask putenv qcopy-acl readlink readlinkat sig2str socklen stat-time std-gnu11 stdalign stddef stdio stpcpy strtoimax symlink sys_stat sys_time tempname time time_r time_rz timegm timer-time timespec-add timespec-sub unlocked-io update-copyright utimens vla warnings
+# Reproduce by: gnulib-tool --import --lib=libgnu --source-base=lib --m4-base=m4 --doc-base=doc --tests-base=tests --aux-dir=build-aux --avoid=close --avoid=dup --avoid=fchdir --avoid=fstat --avoid=malloc-posix --avoid=msvc-inval --avoid=msvc-nothrow --avoid=openat-die --avoid=opendir --avoid=raise --avoid=save-cwd --avoid=select --avoid=setenv --avoid=sigprocmask --avoid=stat --avoid=stdarg --avoid=stdbool --avoid=threadlib --avoid=tzset --avoid=unsetenv --avoid=utime --avoid=utime-h --gnu-make --makefile-name=gnulib.mk.in --conditional-dependencies --no-libtool --macro-prefix=gl --no-vc-files alloca-opt binary-io byteswap c-ctype c-strcase careadlinkat close-stream count-leading-zeros count-one-bits count-trailing-zeros crypto/md5 crypto/sha1 crypto/sha256 crypto/sha512 d-type diffseq dtoastr dtotimespec dup2 environ execinfo explicit_bzero faccessat fcntl fcntl-h fdatasync fdopendir filemode filevercmp flexmember fstatat fsusage fsync getloadavg getopt-gnu gettime gettimeofday gitlog-to-changelog ignore-value intprops largefile lstat manywarnings memrchr minmax mkostemp mktime nstrftime pipe2 pselect pthread_sigmask putenv qcopy-acl readlink readlinkat sig2str socklen stat-time std-gnu11 stdalign stddef stdio stpcpy strtoimax symlink sys_stat sys_time tempname time time_r time_rz timegm timer-time timespec-add timespec-sub unistr/base unistr/u8-check unlocked-io update-copyright utimens vla warnings
 
 
 MOSTLYCLEANFILES += core *.stackdump
@@ -576,6 +576,9 @@ LIBS_MAIL = @LIBS_MAIL@
 LIBS_SYSTEM = @LIBS_SYSTEM@
 LIBS_TERMCAP = @LIBS_TERMCAP@
 LIBTIFF = @LIBTIFF@
+LIBUNISTRING_COMPILE_UNISTR_U8_CHECK = @LIBUNISTRING_COMPILE_UNISTR_U8_CHECK@
+LIBUNISTRING_UNISTR_H = @LIBUNISTRING_UNISTR_H@
+LIBUNISTRING_UNITYPES_H = @LIBUNISTRING_UNITYPES_H@
 LIBXMENU = @LIBXMENU@
 LIBXML2_CFLAGS = @LIBXML2_CFLAGS@
 LIBXML2_LIBS = @LIBXML2_LIBS@
@@ -2051,6 +2054,20 @@ EXTRA_DIST += c++defs.h
 endif
 ## end   gnulib module snippet/c++defs
 
+## begin gnulib module snippet/unused-parameter
+ifeq (,$(OMIT_GNULIB_MODULE_snippet/unused-parameter))
+
+# Because this Makefile snippet defines a variable used by other
+# gnulib Makefile snippets, it must be present in all makefiles that
+# need it. This is ensured by the applicability 'all' defined above.
+
+UNUSED_PARAMETER_H=$(srcdir)/unused-parameter.h
+
+EXTRA_DIST += unused-parameter.h
+
+endif
+## end   gnulib module snippet/unused-parameter
+
 ## begin gnulib module snippet/warn-on-use
 ifeq (,$(OMIT_GNULIB_MODULE_snippet/warn-on-use))
 
@@ -3041,6 +3058,52 @@ EXTRA_DIST += unistd.in.h
 endif
 ## end   gnulib module unistd
 
+## begin gnulib module unistr/base
+ifeq (,$(OMIT_GNULIB_MODULE_unistr/base))
+
+BUILT_SOURCES += $(LIBUNISTRING_UNISTR_H)
+
+unistr.h: unistr.in.h
+       $(AM_V_GEN)rm -f $@-t $@ && \
+       { echo '/* DO NOT EDIT! GENERATED AUTOMATICALLY! */'; \
+         cat $(srcdir)/unistr.in.h; \
+       } > $@-t && \
+       mv -f $@-t $@
+MOSTLYCLEANFILES += unistr.h unistr.h-t
+
+EXTRA_DIST += unistr.in.h
+
+endif
+## end   gnulib module unistr/base
+
+## begin gnulib module unistr/u8-check
+ifeq (,$(OMIT_GNULIB_MODULE_unistr/u8-check))
+
+ifneq (,$(LIBUNISTRING_COMPILE_UNISTR_U8_CHECK))
+libgnu_a_SOURCES += unistr/u8-check.c
+endif
+
+endif
+## end   gnulib module unistr/u8-check
+
+## begin gnulib module unitypes
+ifeq (,$(OMIT_GNULIB_MODULE_unitypes))
+
+BUILT_SOURCES += $(LIBUNISTRING_UNITYPES_H)
+
+unitypes.h: unitypes.in.h
+       $(AM_V_GEN)rm -f $@-t $@ && \
+       { echo '/* DO NOT EDIT! GENERATED AUTOMATICALLY! */'; \
+         cat $(srcdir)/unitypes.in.h; \
+       } > $@-t && \
+       mv -f $@-t $@
+MOSTLYCLEANFILES += unitypes.h unitypes.h-t
+
+EXTRA_DIST += unitypes.in.h
+
+endif
+## end   gnulib module unitypes
+
 ## begin gnulib module unlocked-io
 ifeq (,$(OMIT_GNULIB_MODULE_unlocked-io))
 
diff --git a/lib/unistr.in.h b/lib/unistr.in.h
new file mode 100644
index 0000000000..233f96025f
--- /dev/null
+++ b/lib/unistr.in.h
@@ -0,0 +1,746 @@
+/* Elementary Unicode string functions.
+   Copyright (C) 2001-2002, 2005-2017 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#ifndef _UNISTR_H
+#define _UNISTR_H
+
+#include "unitypes.h"
+
+/* Get common macros for C.  */
+#include "unused-parameter.h"
+
+/* Get bool.  */
+#include <stdbool.h>
+
+/* Get size_t.  */
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Conventions:
+
+   All functions prefixed with u8_ operate on UTF-8 encoded strings.
+   Their unit is an uint8_t (1 byte).
+
+   All functions prefixed with u16_ operate on UTF-16 encoded strings.
+   Their unit is an uint16_t (a 2-byte word).
+
+   All functions prefixed with u32_ operate on UCS-4 encoded strings.
+   Their unit is an uint32_t (a 4-byte word).
+
+   All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
+   n units.
+
+   All arguments starting with "str" and the arguments of functions starting
+   with u8_str/u16_str/u32_str denote a NUL terminated string, i.e. a string
+   which terminates at the first NUL unit.  This termination unit is
+   considered part of the string for all memory allocation purposes, but
+   is not considered part of the string for all other logical purposes.
+
+   Functions returning a string result take a (resultbuf, lengthp) argument
+   pair.  If resultbuf is not NULL and the result fits into *lengthp units,
+   it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
+   allocated string is returned.  In both cases, *lengthp is set to the
+   length (number of units) of the returned string.  In case of error,
+   NULL is returned and errno is set.  */
+
+
+/* Elementary string checks.  */
+
+/* Check whether an UTF-8 string is well-formed.
+   Return NULL if valid, or a pointer to the first invalid unit otherwise.  */
+extern const uint8_t *
+       u8_check (const uint8_t *s, size_t n)
+       _UC_ATTRIBUTE_PURE;
+
+/* Check whether an UTF-16 string is well-formed.
+   Return NULL if valid, or a pointer to the first invalid unit otherwise.  */
+extern const uint16_t *
+       u16_check (const uint16_t *s, size_t n)
+       _UC_ATTRIBUTE_PURE;
+
+/* Check whether an UCS-4 string is well-formed.
+   Return NULL if valid, or a pointer to the first invalid unit otherwise.  */
+extern const uint32_t *
+       u32_check (const uint32_t *s, size_t n)
+       _UC_ATTRIBUTE_PURE;
+
+
+/* Elementary string conversions.  */
+
+/* Convert an UTF-8 string to an UTF-16 string.  */
+extern uint16_t *
+       u8_to_u16 (const uint8_t *s, size_t n, uint16_t *resultbuf,
+                  size_t *lengthp);
+
+/* Convert an UTF-8 string to an UCS-4 string.  */
+extern uint32_t *
+       u8_to_u32 (const uint8_t *s, size_t n, uint32_t *resultbuf,
+                  size_t *lengthp);
+
+/* Convert an UTF-16 string to an UTF-8 string.  */
+extern uint8_t *
+       u16_to_u8 (const uint16_t *s, size_t n, uint8_t *resultbuf,
+                  size_t *lengthp);
+
+/* Convert an UTF-16 string to an UCS-4 string.  */
+extern uint32_t *
+       u16_to_u32 (const uint16_t *s, size_t n, uint32_t *resultbuf,
+                   size_t *lengthp);
+
+/* Convert an UCS-4 string to an UTF-8 string.  */
+extern uint8_t *
+       u32_to_u8 (const uint32_t *s, size_t n, uint8_t *resultbuf,
+                  size_t *lengthp);
+
+/* Convert an UCS-4 string to an UTF-16 string.  */
+extern uint16_t *
+       u32_to_u16 (const uint32_t *s, size_t n, uint16_t *resultbuf,
+                   size_t *lengthp);
+
+
+/* Elementary string functions.  */
+
+/* Return the length (number of units) of the first character in S, which is
+   no longer than N.  Return 0 if it is the NUL character.  Return -1 upon
+   failure.  */
+/* Similar to mblen(), except that s must not be NULL.  */
+extern int
+       u8_mblen (const uint8_t *s, size_t n)
+       _UC_ATTRIBUTE_PURE;
+extern int
+       u16_mblen (const uint16_t *s, size_t n)
+       _UC_ATTRIBUTE_PURE;
+extern int
+       u32_mblen (const uint32_t *s, size_t n)
+       _UC_ATTRIBUTE_PURE;
+
+/* Return the length (number of units) of the first character in S, putting
+   its 'ucs4_t' representation in *PUC.  Upon failure, *PUC is set to 0xfffd,
+   and an appropriate number of units is returned.
+   The number of available units, N, must be > 0.  */
+/* Similar to mbtowc(), except that puc and s must not be NULL, n must be > 0,
+   and the NUL character is not treated specially.  */
+/* The variants with _unsafe suffix are for backward compatibility with
+   libunistring versions < 0.9.7.  */
+
+#if GNULIB_UNISTR_U8_MBTOUC_UNSAFE || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+       u8_mbtouc_unsafe (ucs4_t *puc, const uint8_t *s, size_t n);
+# else
+extern int
+       u8_mbtouc_unsafe_aux (ucs4_t *puc, const uint8_t *s, size_t n);
+static inline int
+u8_mbtouc_unsafe (ucs4_t *puc, const uint8_t *s, size_t n)
+{
+  uint8_t c = *s;
+
+  if (c < 0x80)
+    {
+      *puc = c;
+      return 1;
+    }
+  else
+    return u8_mbtouc_unsafe_aux (puc, s, n);
+}
+# endif
+#endif
+
+#if GNULIB_UNISTR_U16_MBTOUC_UNSAFE || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+       u16_mbtouc_unsafe (ucs4_t *puc, const uint16_t *s, size_t n);
+# else
+extern int
+       u16_mbtouc_unsafe_aux (ucs4_t *puc, const uint16_t *s, size_t n);
+static inline int
+u16_mbtouc_unsafe (ucs4_t *puc, const uint16_t *s, size_t n)
+{
+  uint16_t c = *s;
+
+  if (c < 0xd800 || c >= 0xe000)
+    {
+      *puc = c;
+      return 1;
+    }
+  else
+    return u16_mbtouc_unsafe_aux (puc, s, n);
+}
+# endif
+#endif
+
+#if GNULIB_UNISTR_U32_MBTOUC_UNSAFE || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+       u32_mbtouc_unsafe (ucs4_t *puc, const uint32_t *s, size_t n);
+# else
+static inline int
+u32_mbtouc_unsafe (ucs4_t *puc,
+                   const uint32_t *s, size_t n _GL_UNUSED_PARAMETER)
+{
+  uint32_t c = *s;
+
+  if (c < 0xd800 || (c >= 0xe000 && c < 0x110000))
+    *puc = c;
+  else
+    /* invalid multibyte character */
+    *puc = 0xfffd;
+  return 1;
+}
+# endif
+#endif
+
+#if GNULIB_UNISTR_U8_MBTOUC || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+       u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n);
+# else
+extern int
+       u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n);
+static inline int
+u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n)
+{
+  uint8_t c = *s;
+
+  if (c < 0x80)
+    {
+      *puc = c;
+      return 1;
+    }
+  else
+    return u8_mbtouc_aux (puc, s, n);
+}
+# endif
+#endif
+
+#if GNULIB_UNISTR_U16_MBTOUC || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+       u16_mbtouc (ucs4_t *puc, const uint16_t *s, size_t n);
+# else
+extern int
+       u16_mbtouc_aux (ucs4_t *puc, const uint16_t *s, size_t n);
+static inline int
+u16_mbtouc (ucs4_t *puc, const uint16_t *s, size_t n)
+{
+  uint16_t c = *s;
+
+  if (c < 0xd800 || c >= 0xe000)
+    {
+      *puc = c;
+      return 1;
+    }
+  else
+    return u16_mbtouc_aux (puc, s, n);
+}
+# endif
+#endif
+
+#if GNULIB_UNISTR_U32_MBTOUC || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+       u32_mbtouc (ucs4_t *puc, const uint32_t *s, size_t n);
+# else
+static inline int
+u32_mbtouc (ucs4_t *puc, const uint32_t *s, size_t n _GL_UNUSED_PARAMETER)
+{
+  uint32_t c = *s;
+
+  if (c < 0xd800 || (c >= 0xe000 && c < 0x110000))
+    *puc = c;
+  else
+    /* invalid multibyte character */
+    *puc = 0xfffd;
+  return 1;
+}
+# endif
+#endif
+
+/* Return the length (number of units) of the first character in S, putting
+   its 'ucs4_t' representation in *PUC.  Upon failure, *PUC is set to 0xfffd,
+   and -1 is returned for an invalid sequence of units, -2 is returned for an
+   incomplete sequence of units.
+   The number of available units, N, must be > 0.  */
+/* Similar to u*_mbtouc(), except that the return value gives more details
+   about the failure, similar to mbrtowc().  */
+
+#if GNULIB_UNISTR_U8_MBTOUCR || HAVE_LIBUNISTRING
+extern int
+       u8_mbtoucr (ucs4_t *puc, const uint8_t *s, size_t n);
+#endif
+
+#if GNULIB_UNISTR_U16_MBTOUCR || HAVE_LIBUNISTRING
+extern int
+       u16_mbtoucr (ucs4_t *puc, const uint16_t *s, size_t n);
+#endif
+
+#if GNULIB_UNISTR_U32_MBTOUCR || HAVE_LIBUNISTRING
+extern int
+       u32_mbtoucr (ucs4_t *puc, const uint32_t *s, size_t n);
+#endif
+
+/* Put the multibyte character represented by UC in S, returning its
+   length.  Return -1 upon failure, -2 if the number of available units, N,
+   is too small.  The latter case cannot occur if N >= 6/2/1, respectively.  */
+/* Similar to wctomb(), except that s must not be NULL, and the argument n
+   must be specified.  */
+
+#if GNULIB_UNISTR_U8_UCTOMB || HAVE_LIBUNISTRING
+/* Auxiliary function, also used by u8_chr, u8_strchr, u8_strrchr.  */
+extern int
+       u8_uctomb_aux (uint8_t *s, ucs4_t uc, int n);
+# if !HAVE_INLINE
+extern int
+       u8_uctomb (uint8_t *s, ucs4_t uc, int n);
+# else
+static inline int
+u8_uctomb (uint8_t *s, ucs4_t uc, int n)
+{
+  if (uc < 0x80 && n > 0)
+    {
+      s[0] = uc;
+      return 1;
+    }
+  else
+    return u8_uctomb_aux (s, uc, n);
+}
+# endif
+#endif
+
+#if GNULIB_UNISTR_U16_UCTOMB || HAVE_LIBUNISTRING
+/* Auxiliary function, also used by u16_chr, u16_strchr, u16_strrchr.  */
+extern int
+       u16_uctomb_aux (uint16_t *s, ucs4_t uc, int n);
+# if !HAVE_INLINE
+extern int
+       u16_uctomb (uint16_t *s, ucs4_t uc, int n);
+# else
+static inline int
+u16_uctomb (uint16_t *s, ucs4_t uc, int n)
+{
+  if (uc < 0xd800 && n > 0)
+    {
+      s[0] = uc;
+      return 1;
+    }
+  else
+    return u16_uctomb_aux (s, uc, n);
+}
+# endif
+#endif
+
+#if GNULIB_UNISTR_U32_UCTOMB || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+       u32_uctomb (uint32_t *s, ucs4_t uc, int n);
+# else
+static inline int
+u32_uctomb (uint32_t *s, ucs4_t uc, int n)
+{
+  if (uc < 0xd800 || (uc >= 0xe000 && uc < 0x110000))
+    {
+      if (n > 0)
+        {
+          *s = uc;
+          return 1;
+        }
+      else
+        return -2;
+    }
+  else
+    return -1;
+}
+# endif
+#endif
+
+/* Copy N units from SRC to DEST.  */
+/* Similar to memcpy().  */
+extern uint8_t *
+       u8_cpy (uint8_t *dest, const uint8_t *src, size_t n);
+extern uint16_t *
+       u16_cpy (uint16_t *dest, const uint16_t *src, size_t n);
+extern uint32_t *
+       u32_cpy (uint32_t *dest, const uint32_t *src, size_t n);
+
+/* Copy N units from SRC to DEST, guaranteeing correct behavior for
+   overlapping memory areas.  */
+/* Similar to memmove().  */
+extern uint8_t *
+       u8_move (uint8_t *dest, const uint8_t *src, size_t n);
+extern uint16_t *
+       u16_move (uint16_t *dest, const uint16_t *src, size_t n);
+extern uint32_t *
+       u32_move (uint32_t *dest, const uint32_t *src, size_t n);
+
+/* Set the first N characters of S to UC.  UC should be a character that
+   occupies only 1 unit.  */
+/* Similar to memset().  */
+extern uint8_t *
+       u8_set (uint8_t *s, ucs4_t uc, size_t n);
+extern uint16_t *
+       u16_set (uint16_t *s, ucs4_t uc, size_t n);
+extern uint32_t *
+       u32_set (uint32_t *s, ucs4_t uc, size_t n);
+
+/* Compare S1 and S2, each of length N.  */
+/* Similar to memcmp().  */
+extern int
+       u8_cmp (const uint8_t *s1, const uint8_t *s2, size_t n)
+       _UC_ATTRIBUTE_PURE;
+extern int
+       u16_cmp (const uint16_t *s1, const uint16_t *s2, size_t n)
+       _UC_ATTRIBUTE_PURE;
+extern int
+       u32_cmp (const uint32_t *s1, const uint32_t *s2, size_t n)
+       _UC_ATTRIBUTE_PURE;
+
+/* Compare S1 and S2.  */
+/* Similar to the gnulib function memcmp2().  */
+extern int
+       u8_cmp2 (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2)
+       _UC_ATTRIBUTE_PURE;
+extern int
+       u16_cmp2 (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2)
+       _UC_ATTRIBUTE_PURE;
+extern int
+       u32_cmp2 (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2)
+       _UC_ATTRIBUTE_PURE;
+
+/* Search the string at S for UC.  */
+/* Similar to memchr().  */
+extern uint8_t *
+       u8_chr (const uint8_t *s, size_t n, ucs4_t uc)
+       _UC_ATTRIBUTE_PURE;
+extern uint16_t *
+       u16_chr (const uint16_t *s, size_t n, ucs4_t uc)
+       _UC_ATTRIBUTE_PURE;
+extern uint32_t *
+       u32_chr (const uint32_t *s, size_t n, ucs4_t uc)
+       _UC_ATTRIBUTE_PURE;
+
+/* Count the number of Unicode characters in the N units from S.  */
+/* Similar to mbsnlen().  */
+extern size_t
+       u8_mbsnlen (const uint8_t *s, size_t n)
+       _UC_ATTRIBUTE_PURE;
+extern size_t
+       u16_mbsnlen (const uint16_t *s, size_t n)
+       _UC_ATTRIBUTE_PURE;
+extern size_t
+       u32_mbsnlen (const uint32_t *s, size_t n)
+       _UC_ATTRIBUTE_PURE;
+
+/* Elementary string functions with memory allocation.  */
+
+/* Make a freshly allocated copy of S, of length N.  */
+extern uint8_t *
+       u8_cpy_alloc (const uint8_t *s, size_t n);
+extern uint16_t *
+       u16_cpy_alloc (const uint16_t *s, size_t n);
+extern uint32_t *
+       u32_cpy_alloc (const uint32_t *s, size_t n);
+
+/* Elementary string functions on NUL terminated strings.  */
+
+/* Return the length (number of units) of the first character in S.
+   Return 0 if it is the NUL character.  Return -1 upon failure.  */
+extern int
+       u8_strmblen (const uint8_t *s)
+       _UC_ATTRIBUTE_PURE;
+extern int
+       u16_strmblen (const uint16_t *s)
+       _UC_ATTRIBUTE_PURE;
+extern int
+       u32_strmblen (const uint32_t *s)
+       _UC_ATTRIBUTE_PURE;
+
+/* Return the length (number of units) of the first character in S, putting
+   its 'ucs4_t' representation in *PUC.  Return 0 if it is the NUL
+   character.  Return -1 upon failure.  */
+extern int
+       u8_strmbtouc (ucs4_t *puc, const uint8_t *s);
+extern int
+       u16_strmbtouc (ucs4_t *puc, const uint16_t *s);
+extern int
+       u32_strmbtouc (ucs4_t *puc, const uint32_t *s);
+
+/* Forward iteration step.  Advances the pointer past the next character,
+   or returns NULL if the end of the string has been reached.  Puts the
+   character's 'ucs4_t' representation in *PUC.  */
+extern const uint8_t *
+       u8_next (ucs4_t *puc, const uint8_t *s);
+extern const uint16_t *
+       u16_next (ucs4_t *puc, const uint16_t *s);
+extern const uint32_t *
+       u32_next (ucs4_t *puc, const uint32_t *s);
+
+/* Backward iteration step.  Advances the pointer to point to the previous
+   character, or returns NULL if the beginning of the string had been reached.
+   Puts the character's 'ucs4_t' representation in *PUC.  */
+extern const uint8_t *
+       u8_prev (ucs4_t *puc, const uint8_t *s, const uint8_t *start);
+extern const uint16_t *
+       u16_prev (ucs4_t *puc, const uint16_t *s, const uint16_t *start);
+extern const uint32_t *
+       u32_prev (ucs4_t *puc, const uint32_t *s, const uint32_t *start);
+
+/* Return the number of units in S.  */
+/* Similar to strlen(), wcslen().  */
+extern size_t
+       u8_strlen (const uint8_t *s)
+       _UC_ATTRIBUTE_PURE;
+extern size_t
+       u16_strlen (const uint16_t *s)
+       _UC_ATTRIBUTE_PURE;
+extern size_t
+       u32_strlen (const uint32_t *s)
+       _UC_ATTRIBUTE_PURE;
+
+/* Return the number of units in S, but at most MAXLEN.  */
+/* Similar to strnlen(), wcsnlen().  */
+extern size_t
+       u8_strnlen (const uint8_t *s, size_t maxlen)
+       _UC_ATTRIBUTE_PURE;
+extern size_t
+       u16_strnlen (const uint16_t *s, size_t maxlen)
+       _UC_ATTRIBUTE_PURE;
+extern size_t
+       u32_strnlen (const uint32_t *s, size_t maxlen)
+       _UC_ATTRIBUTE_PURE;
+
+/* Copy SRC to DEST.  */
+/* Similar to strcpy(), wcscpy().  */
+extern uint8_t *
+       u8_strcpy (uint8_t *dest, const uint8_t *src);
+extern uint16_t *
+       u16_strcpy (uint16_t *dest, const uint16_t *src);
+extern uint32_t *
+       u32_strcpy (uint32_t *dest, const uint32_t *src);
+
+/* Copy SRC to DEST, returning the address of the terminating NUL in DEST.  */
+/* Similar to stpcpy().  */
+extern uint8_t *
+       u8_stpcpy (uint8_t *dest, const uint8_t *src);
+extern uint16_t *
+       u16_stpcpy (uint16_t *dest, const uint16_t *src);
+extern uint32_t *
+       u32_stpcpy (uint32_t *dest, const uint32_t *src);
+
+/* Copy no more than N units of SRC to DEST.  */
+/* Similar to strncpy(), wcsncpy().  */
+extern uint8_t *
+       u8_strncpy (uint8_t *dest, const uint8_t *src, size_t n);
+extern uint16_t *
+       u16_strncpy (uint16_t *dest, const uint16_t *src, size_t n);
+extern uint32_t *
+       u32_strncpy (uint32_t *dest, const uint32_t *src, size_t n);
+
+/* Copy no more than N units of SRC to DEST.  Return a pointer past the last
+   non-NUL unit written into DEST.  */
+/* Similar to stpncpy().  */
+extern uint8_t *
+       u8_stpncpy (uint8_t *dest, const uint8_t *src, size_t n);
+extern uint16_t *
+       u16_stpncpy (uint16_t *dest, const uint16_t *src, size_t n);
+extern uint32_t *
+       u32_stpncpy (uint32_t *dest, const uint32_t *src, size_t n);
+
+/* Append SRC onto DEST.  */
+/* Similar to strcat(), wcscat().  */
+extern uint8_t *
+       u8_strcat (uint8_t *dest, const uint8_t *src);
+extern uint16_t *
+       u16_strcat (uint16_t *dest, const uint16_t *src);
+extern uint32_t *
+       u32_strcat (uint32_t *dest, const uint32_t *src);
+
+/* Append no more than N units of SRC onto DEST.  */
+/* Similar to strncat(), wcsncat().  */
+extern uint8_t *
+       u8_strncat (uint8_t *dest, const uint8_t *src, size_t n);
+extern uint16_t *
+       u16_strncat (uint16_t *dest, const uint16_t *src, size_t n);
+extern uint32_t *
+       u32_strncat (uint32_t *dest, const uint32_t *src, size_t n);
+
+/* Compare S1 and S2.  */
+/* Similar to strcmp(), wcscmp().  */
+#ifdef __sun
+/* Avoid a collision with the u8_strcmp() function in Solaris 11 libc.  */
+extern int
+       u8_strcmp_gnu (const uint8_t *s1, const uint8_t *s2)
+       _UC_ATTRIBUTE_PURE;
+# define u8_strcmp u8_strcmp_gnu
+#else
+extern int
+       u8_strcmp (const uint8_t *s1, const uint8_t *s2)
+       _UC_ATTRIBUTE_PURE;
+#endif
+extern int
+       u16_strcmp (const uint16_t *s1, const uint16_t *s2)
+       _UC_ATTRIBUTE_PURE;
+extern int
+       u32_strcmp (const uint32_t *s1, const uint32_t *s2)
+       _UC_ATTRIBUTE_PURE;
+
+/* Compare S1 and S2 using the collation rules of the current locale.
+   Return -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2.
+   Upon failure, set errno and return any value.  */
+/* Similar to strcoll(), wcscoll().  */
+extern int
+       u8_strcoll (const uint8_t *s1, const uint8_t *s2);
+extern int
+       u16_strcoll (const uint16_t *s1, const uint16_t *s2);
+extern int
+       u32_strcoll (const uint32_t *s1, const uint32_t *s2);
+
+/* Compare no more than N units of S1 and S2.  */
+/* Similar to strncmp(), wcsncmp().  */
+extern int
+       u8_strncmp (const uint8_t *s1, const uint8_t *s2, size_t n)
+       _UC_ATTRIBUTE_PURE;
+extern int
+       u16_strncmp (const uint16_t *s1, const uint16_t *s2, size_t n)
+       _UC_ATTRIBUTE_PURE;
+extern int
+       u32_strncmp (const uint32_t *s1, const uint32_t *s2, size_t n)
+       _UC_ATTRIBUTE_PURE;
+
+/* Duplicate S, returning an identical malloc'd string.  */
+/* Similar to strdup(), wcsdup().  */
+extern uint8_t *
+       u8_strdup (const uint8_t *s);
+extern uint16_t *
+       u16_strdup (const uint16_t *s);
+extern uint32_t *
+       u32_strdup (const uint32_t *s);
+
+/* Find the first occurrence of UC in STR.  */
+/* Similar to strchr(), wcschr().  */
+extern uint8_t *
+       u8_strchr (const uint8_t *str, ucs4_t uc)
+       _UC_ATTRIBUTE_PURE;
+extern uint16_t *
+       u16_strchr (const uint16_t *str, ucs4_t uc)
+       _UC_ATTRIBUTE_PURE;
+extern uint32_t *
+       u32_strchr (const uint32_t *str, ucs4_t uc)
+       _UC_ATTRIBUTE_PURE;
+
+/* Find the last occurrence of UC in STR.  */
+/* Similar to strrchr(), wcsrchr().  */
+extern uint8_t *
+       u8_strrchr (const uint8_t *str, ucs4_t uc)
+       _UC_ATTRIBUTE_PURE;
+extern uint16_t *
+       u16_strrchr (const uint16_t *str, ucs4_t uc)
+       _UC_ATTRIBUTE_PURE;
+extern uint32_t *
+       u32_strrchr (const uint32_t *str, ucs4_t uc)
+       _UC_ATTRIBUTE_PURE;
+
+/* Return the length of the initial segment of STR which consists entirely
+   of Unicode characters not in REJECT.  */
+/* Similar to strcspn(), wcscspn().  */
+extern size_t
+       u8_strcspn (const uint8_t *str, const uint8_t *reject)
+       _UC_ATTRIBUTE_PURE;
+extern size_t
+       u16_strcspn (const uint16_t *str, const uint16_t *reject)
+       _UC_ATTRIBUTE_PURE;
+extern size_t
+       u32_strcspn (const uint32_t *str, const uint32_t *reject)
+       _UC_ATTRIBUTE_PURE;
+
+/* Return the length of the initial segment of STR which consists entirely
+   of Unicode characters in ACCEPT.  */
+/* Similar to strspn(), wcsspn().  */
+extern size_t
+       u8_strspn (const uint8_t *str, const uint8_t *accept)
+       _UC_ATTRIBUTE_PURE;
+extern size_t
+       u16_strspn (const uint16_t *str, const uint16_t *accept)
+       _UC_ATTRIBUTE_PURE;
+extern size_t
+       u32_strspn (const uint32_t *str, const uint32_t *accept)
+       _UC_ATTRIBUTE_PURE;
+
+/* Find the first occurrence in STR of any character in ACCEPT.  */
+/* Similar to strpbrk(), wcspbrk().  */
+extern uint8_t *
+       u8_strpbrk (const uint8_t *str, const uint8_t *accept)
+       _UC_ATTRIBUTE_PURE;
+extern uint16_t *
+       u16_strpbrk (const uint16_t *str, const uint16_t *accept)
+       _UC_ATTRIBUTE_PURE;
+extern uint32_t *
+       u32_strpbrk (const uint32_t *str, const uint32_t *accept)
+       _UC_ATTRIBUTE_PURE;
+
+/* Find the first occurrence of NEEDLE in HAYSTACK.  */
+/* Similar to strstr(), wcsstr().  */
+extern uint8_t *
+       u8_strstr (const uint8_t *haystack, const uint8_t *needle)
+       _UC_ATTRIBUTE_PURE;
+extern uint16_t *
+       u16_strstr (const uint16_t *haystack, const uint16_t *needle)
+       _UC_ATTRIBUTE_PURE;
+extern uint32_t *
+       u32_strstr (const uint32_t *haystack, const uint32_t *needle)
+       _UC_ATTRIBUTE_PURE;
+
+/* Test whether STR starts with PREFIX.  */
+extern bool
+       u8_startswith (const uint8_t *str, const uint8_t *prefix)
+       _UC_ATTRIBUTE_PURE;
+extern bool
+       u16_startswith (const uint16_t *str, const uint16_t *prefix)
+       _UC_ATTRIBUTE_PURE;
+extern bool
+       u32_startswith (const uint32_t *str, const uint32_t *prefix)
+       _UC_ATTRIBUTE_PURE;
+
+/* Test whether STR ends with SUFFIX.  */
+extern bool
+       u8_endswith (const uint8_t *str, const uint8_t *suffix)
+       _UC_ATTRIBUTE_PURE;
+extern bool
+       u16_endswith (const uint16_t *str, const uint16_t *suffix)
+       _UC_ATTRIBUTE_PURE;
+extern bool
+       u32_endswith (const uint32_t *str, const uint32_t *suffix)
+       _UC_ATTRIBUTE_PURE;
+
+/* Divide STR into tokens separated by characters in DELIM.
+   This interface is actually more similar to wcstok than to strtok.  */
+/* Similar to strtok_r(), wcstok().  */
+extern uint8_t *
+       u8_strtok (uint8_t *str, const uint8_t *delim, uint8_t **ptr);
+extern uint16_t *
+       u16_strtok (uint16_t *str, const uint16_t *delim, uint16_t **ptr);
+extern uint32_t *
+       u32_strtok (uint32_t *str, const uint32_t *delim, uint32_t **ptr);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _UNISTR_H */
diff --git a/lib/unistr/u8-check.c b/lib/unistr/u8-check.c
new file mode 100644
index 0000000000..4f8e33eb74
--- /dev/null
+++ b/lib/unistr/u8-check.c
@@ -0,0 +1,77 @@
+/* Check UTF-8 string.
+   Copyright (C) 2002, 2006-2007, 2009-2017 Free Software Foundation, Inc.
+   Written by Bruno Haible <address@hidden>, 2002.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "unistr.h"
+
+const uint8_t *
+u8_check (const uint8_t *s, size_t n)
+{
+  const uint8_t *s_end = s + n;  /* one past the last unit to examine */
+
+  while (s < s_end)
+    {
+      /* Keep in sync with unistr.h and u8-mbtouc-aux.c.  */
+      uint8_t c = *s;
+
+      if (c < 0x80)  /* single-unit (ASCII) character */
+        {
+          s++;
+          continue;
+        }
+      if (c >= 0xc2)  /* 0x80..0xc1 never start a valid sequence */
+        {
+          if (c < 0xe0)  /* lead unit of a 2-unit sequence */
+            {
+              if (s + 2 <= s_end
+                  && (s[1] ^ 0x80) < 0x40)  /* s[1] in 0x80..0xbf */
+                {
+                  s += 2;
+                  continue;
+                }
+            }
+          else if (c < 0xf0)  /* lead unit of a 3-unit sequence */
+            {
+              if (s + 3 <= s_end
+                  && (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
+                  && (c >= 0xe1 || s[1] >= 0xa0)  /* no overlong form */
+                  && (c != 0xed || s[1] < 0xa0))  /* no surrogates */
+                {
+                  s += 3;
+                  continue;
+                }
+            }
+          else if (c < 0xf8)  /* lead unit of a 4-unit sequence */
+            {
+              if (s + 4 <= s_end
+                  && (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
+                  && (s[3] ^ 0x80) < 0x40
+                  && (c >= 0xf1 || s[1] >= 0x90)  /* no overlong form */
+                  && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))  /* <= U+10FFFF */
+                {
+                  s += 4;
+                  continue;
+                }
+            }
+        }
+      /* invalid or incomplete multibyte character */
+      return s;  /* points at the first offending unit */
+    }
+  return NULL;  /* every unit in S[0..N-1] was accepted */
+}
diff --git a/lib/unitypes.in.h b/lib/unitypes.in.h
new file mode 100644
index 0000000000..01e0495d1c
--- /dev/null
+++ b/lib/unitypes.in.h
@@ -0,0 +1,46 @@
+/* Elementary types and macros for the GNU UniString library.
+   Copyright (C) 2002, 2005-2006, 2009-2017 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#ifndef _UNITYPES_H
+#define _UNITYPES_H
+
+/* Get uint8_t, uint16_t, uint32_t.  */
+#include <stdint.h>
+
+/* Type representing a Unicode character.  */
+typedef uint32_t ucs4_t;
+
+/* Attribute of a function whose result depends only on the arguments
+   (not pointers!) and which has no side effects.  */
+#ifndef _UC_ATTRIBUTE_CONST
+# if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95)
+#  define _UC_ATTRIBUTE_CONST __attribute__ ((__const__))
+# else
+#  define _UC_ATTRIBUTE_CONST
+# endif
+#endif
+
+/* Attribute of a function whose result depends only on the arguments
+   (possibly pointers) and global memory, and which has no side effects.  */
+#ifndef _UC_ATTRIBUTE_PURE
+# if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
+#  define _UC_ATTRIBUTE_PURE __attribute__ ((__pure__))
+# else
+#  define _UC_ATTRIBUTE_PURE
+# endif
+#endif
+
+#endif /* _UNITYPES_H */
diff --git a/lib/unused-parameter.h b/lib/unused-parameter.h
new file mode 100644
index 0000000000..8bd04b1fba
--- /dev/null
+++ b/lib/unused-parameter.h
@@ -0,0 +1,36 @@
+/* A C macro for declaring that specific function parameters are not used.
+   Copyright (C) 2008-2017 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* _GL_UNUSED_PARAMETER is a marker that can be appended to function parameter
+   declarations for parameters that are not used.  This helps to reduce
+   warnings, such as from GCC -Wunused-parameter.  The syntax is as follows:
+       type param _GL_UNUSED_PARAMETER
+   or more generally
+       param_decl _GL_UNUSED_PARAMETER
+   For example:
+       int param _GL_UNUSED_PARAMETER
+       int *(*param)(void) _GL_UNUSED_PARAMETER
+   Other possible, but obscure and discouraged syntaxes:
+       int _GL_UNUSED_PARAMETER *(*param)(void)
+       _GL_UNUSED_PARAMETER int *(*param)(void)
+ */
+#ifndef _GL_UNUSED_PARAMETER
+# if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)
+#  define _GL_UNUSED_PARAMETER __attribute__ ((__unused__))
+# else
+#  define _GL_UNUSED_PARAMETER
+# endif
+#endif
diff --git a/m4/gnulib-comp.m4 b/m4/gnulib-comp.m4
index 61d39ebda2..9c4dfe4bf5 100644
--- a/m4/gnulib-comp.m4
+++ b/m4/gnulib-comp.m4
@@ -101,6 +101,7 @@ AC_DEFUN
   # Code from module group-member:
   # Code from module ignore-value:
   # Code from module include_next:
+  # Code from module inline:
   # Code from module intprops:
   # Code from module inttypes-incomplete:
   # Code from module largefile:
@@ -132,6 +133,7 @@ AC_DEFUN
   # Code from module snippet/_Noreturn:
   # Code from module snippet/arg-nonnull:
   # Code from module snippet/c++defs:
+  # Code from module snippet/unused-parameter:
   # Code from module snippet/warn-on-use:
   # Code from module socklen:
   # Code from module ssize_t:
@@ -162,6 +164,9 @@ AC_DEFUN
   # Code from module timespec-sub:
   # Code from module u64:
   # Code from module unistd:
+  # Code from module unistr/base:
+  # Code from module unistr/u8-check:
+  # Code from module unitypes:
   # Code from module unlocked-io:
   # Code from module update-copyright:
   # Code from module utimens:
@@ -293,6 +298,7 @@ AC_DEFUN
     gl_PREREQ_GETTIMEOFDAY
   fi
   gl_SYS_TIME_MODULE_INDICATOR([gettimeofday])
+  gl_INLINE
   gl_INTTYPES_INCOMPLETE
   AC_REQUIRE([gl_LARGEFILE])
   gl_LIMITS_H
@@ -417,6 +423,9 @@ AC_DEFUN
   gl_TIMER_TIME
   gl_TIMESPEC
   gl_UNISTD_H
+  gl_LIBUNISTRING_LIBHEADER([0.9.4], [unistr.h])
+  gl_LIBUNISTRING_MODULE([0.9], [unistr/u8-check])
+  gl_LIBUNISTRING_LIBHEADER([0.9.4], [unitypes.h])
   gl_FUNC_GLIBC_UNLOCKED_IO
   gl_UTIMENS
   AC_C_VARARRAYS
@@ -964,7 +973,11 @@ AC_DEFUN
   lib/u64.h
   lib/unistd.c
   lib/unistd.in.h
+  lib/unistr.in.h
+  lib/unistr/u8-check.c
+  lib/unitypes.in.h
   lib/unlocked-io.h
+  lib/unused-parameter.h
   lib/utimens.c
   lib/utimens.h
   lib/verify.h
@@ -1015,8 +1028,10 @@ AC_DEFUN
   m4/gnulib-common.m4
   m4/group-member.m4
   m4/include_next.m4
+  m4/inline.m4
   m4/inttypes.m4
   m4/largefile.m4
+  m4/libunistring-base.m4
   m4/limits-h.m4
   m4/localtime-buffer.m4
   m4/longlong.m4
diff --git a/m4/inline.m4 b/m4/inline.m4
new file mode 100644
index 0000000000..f00572e0ce
--- /dev/null
+++ b/m4/inline.m4
@@ -0,0 +1,40 @@
+# inline.m4 serial 4
+dnl Copyright (C) 2006, 2009-2017 Free Software Foundation, Inc.
+dnl This file is free software; the Free Software Foundation
+dnl gives unlimited permission to copy and/or distribute it,
+dnl with or without modifications, as long as this notice is preserved.
+
+dnl Test for the 'inline' keyword or equivalent.
+dnl Define 'inline' to a supported equivalent, or to nothing if not supported,
+dnl like AC_C_INLINE does.  Also, define HAVE_INLINE if 'inline' or an
+dnl equivalent is effectively supported, i.e. if the compiler is likely to
+dnl drop unused 'static inline' functions.
+AC_DEFUN([gl_INLINE],
+[
+  AC_REQUIRE([AC_C_INLINE])
+  AC_CACHE_CHECK([whether the compiler generally respects inline],
+    [gl_cv_c_inline_effective],
+    [if test $ac_cv_c_inline = no; then
+       gl_cv_c_inline_effective=no
+     else
+       dnl GCC defines __NO_INLINE__ if not optimizing or if -fno-inline is
+       dnl specified.
+       dnl Use AC_COMPILE_IFELSE here, not AC_EGREP_CPP, because the result
+       dnl depends on optimization flags, which can be in CFLAGS.
+       dnl (AC_EGREP_CPP looks only at the CPPFLAGS.)
+       AC_COMPILE_IFELSE(
+         [AC_LANG_PROGRAM([[]],
+           [[#ifdef __NO_INLINE__
+               #error "inline is not effective"
+             #endif]])],
+         [gl_cv_c_inline_effective=yes],
+         [gl_cv_c_inline_effective=no])
+     fi
+    ])
+  if test $gl_cv_c_inline_effective = yes; then
+    AC_DEFINE([HAVE_INLINE], [1],
+      [Define to 1 if the compiler supports one of the keywords
+       'inline', '__inline__', '__inline' and effectively inlines
+       functions marked as such.])
+  fi
+])
diff --git a/m4/libunistring-base.m4 b/m4/libunistring-base.m4
new file mode 100644
index 0000000000..ee648f272a
--- /dev/null
+++ b/m4/libunistring-base.m4
@@ -0,0 +1,141 @@
+# libunistring-base.m4 serial 5
+dnl Copyright (C) 2010-2017 Free Software Foundation, Inc.
+dnl This file is free software; the Free Software Foundation
+dnl gives unlimited permission to copy and/or distribute it,
+dnl with or without modifications, as long as this notice is preserved.
+
+dnl From Paolo Bonzini and Bruno Haible.
+
+dnl gl_LIBUNISTRING_MODULE([VERSION], [Module])
+dnl Declares that the source files of Module should be compiled, unless we
+dnl are linking with libunistring and its version is >= the given VERSION.
+dnl Defines an automake conditional LIBUNISTRING_COMPILE_$MODULE that is
+dnl true if the source files of Module should be compiled.
+dnl This macro is to be used for public libunistring API, not for
+dnl undocumented API.
+dnl
+dnl You have to bump the VERSION argument to the next projected version
+dnl number each time you make a change that affects the behaviour of the
+dnl functions defined in Module (even if the sources of Module itself do not
+dnl change).
+
+AC_DEFUN([gl_LIBUNISTRING_MODULE],
+[
+  AC_REQUIRE([gl_LIBUNISTRING_LIB_PREPARE])
+  dnl Use the variables HAVE_LIBUNISTRING, LIBUNISTRING_VERSION from
+  dnl gl_LIBUNISTRING_CORE if that macro has been run.
+  AM_CONDITIONAL(AS_TR_CPP([LIBUNISTRING_COMPILE_$2]),
+    [gl_LIBUNISTRING_VERSION_CMP([$1])])
+])
+
+dnl gl_LIBUNISTRING_LIBHEADER([VERSION], [HeaderFile])
+dnl Declares that HeaderFile should be created, unless we are linking
+dnl with libunistring and its version is >= the given VERSION.
+dnl HeaderFile should be relative to the lib directory and end in '.h'.
+dnl Prepares for substituting LIBUNISTRING_HEADERFILE (to HeaderFile or empty).
+dnl
+dnl When we are linking with the already installed libunistring and its version
+dnl is < VERSION, we create HeaderFile here, because we may compile functions
+dnl (via gl_LIBUNISTRING_MODULE above) that are not contained in the installed
+dnl version.
+dnl When we are linking with the already installed libunistring and its version
+dnl is >= VERSION, we don't create HeaderFile here: it could cause compilation
+dnl errors in other libunistring header files if some types are missing.
+dnl
+dnl You have to bump the VERSION argument to the next projected version
+dnl number each time you make a non-comment change to the HeaderFile.
+
+AC_DEFUN([gl_LIBUNISTRING_LIBHEADER],
+[
+  AC_REQUIRE([gl_LIBUNISTRING_LIB_PREPARE])
+  dnl Use the variables HAVE_LIBUNISTRING, LIBUNISTRING_VERSION from
+  dnl gl_LIBUNISTRING_CORE if that macro has been run.
+  if gl_LIBUNISTRING_VERSION_CMP([$1]); then
+    LIBUNISTRING_[]AS_TR_CPP([$2])='$2'
+  else
+    LIBUNISTRING_[]AS_TR_CPP([$2])=
+  fi
+  AC_SUBST([LIBUNISTRING_]AS_TR_CPP([$2]))
+])
+
+dnl Miscellaneous preparations: version-parsing sed scripts and variables.
+
+AC_DEFUN([gl_LIBUNISTRING_LIB_PREPARE],
+[
+  dnl Ensure that HAVE_LIBUNISTRING is fully determined at this point.
+  m4_ifdef([gl_LIBUNISTRING], [AC_REQUIRE([gl_LIBUNISTRING])])
+
+  AC_REQUIRE([AC_PROG_AWK])
+
+dnl Sed expressions to extract the parts of a version number.
+changequote(,)
+gl_libunistring_sed_extract_major='/^[0-9]/{s/^\([0-9]*\).*/\1/p;q;}
+i\
+0
+q
+'
+gl_libunistring_sed_extract_minor='/^[0-9][0-9]*[.][0-9]/{s/^[0-9]*[.]\([0-9]*\).*/\1/p;q;}
+i\
+0
+q
+'
+gl_libunistring_sed_extract_subminor='/^[0-9][0-9]*[.][0-9][0-9]*[.][0-9]/{s/^[0-9]*[.][0-9]*[.]\([0-9]*\).*/\1/p;q;}
+i\
+0
+q
+'
+changequote([,])
+
+  if test "$HAVE_LIBUNISTRING" = yes; then
+    LIBUNISTRING_VERSION_MAJOR=`echo "$LIBUNISTRING_VERSION" | sed -n -e "$gl_libunistring_sed_extract_major"`
+    LIBUNISTRING_VERSION_MINOR=`echo "$LIBUNISTRING_VERSION" | sed -n -e "$gl_libunistring_sed_extract_minor"`
+    LIBUNISTRING_VERSION_SUBMINOR=`echo "$LIBUNISTRING_VERSION" | sed -n -e "$gl_libunistring_sed_extract_subminor"`
+  fi
+])
+
+dnl gl_LIBUNISTRING_VERSION_CMP([VERSION])
+dnl Expands to a shell statement that evaluates to true if HAVE_LIBUNISTRING
+dnl is not 'yes' or LIBUNISTRING_VERSION is less than the VERSION argument.
+AC_DEFUN([gl_LIBUNISTRING_VERSION_CMP],
+[ { test "$HAVE_LIBUNISTRING" != yes \
+    || {
+         dnl AS_LITERAL_IF exists and works fine since autoconf-2.59 at least.
+         AS_LITERAL_IF([$1],
+           [dnl This is the optimized variant, that assumes the argument is a literal:
+            m4_pushdef([requested_version_major],
+              [gl_LIBUNISTRING_ARG_OR_ZERO(m4_bpatsubst([$1], [^\([0-9]*\).*], [\1]), [])])
+            m4_pushdef([requested_version_minor],
+              [gl_LIBUNISTRING_ARG_OR_ZERO(m4_bpatsubst([$1], [^[0-9]*[.]\([0-9]*\).*], [\1]), [$1])])
+            m4_pushdef([requested_version_subminor],
+              [gl_LIBUNISTRING_ARG_OR_ZERO(m4_bpatsubst([$1], [^[0-9]*[.][0-9]*[.]\([0-9]*\).*], [\1]), [$1])])
+            test $LIBUNISTRING_VERSION_MAJOR -lt requested_version_major \
+            || { test $LIBUNISTRING_VERSION_MAJOR -eq requested_version_major \
+                 && { test $LIBUNISTRING_VERSION_MINOR -lt requested_version_minor \
+                      || { test $LIBUNISTRING_VERSION_MINOR -eq requested_version_minor \
+                           && test $LIBUNISTRING_VERSION_SUBMINOR -lt requested_version_subminor
+                         }
+                    }
+               }
+            m4_popdef([requested_version_subminor])
+            m4_popdef([requested_version_minor])
+            m4_popdef([requested_version_major])
+           ],
+           [dnl This is the unoptimized variant:
+            requested_version_major=`echo '$1' | sed -n -e "$gl_libunistring_sed_extract_major"`
+            requested_version_minor=`echo '$1' | sed -n -e "$gl_libunistring_sed_extract_minor"`
+            requested_version_subminor=`echo '$1' | sed -n -e "$gl_libunistring_sed_extract_subminor"`
+            test $LIBUNISTRING_VERSION_MAJOR -lt $requested_version_major \
+            || { test $LIBUNISTRING_VERSION_MAJOR -eq $requested_version_major \
+                 && { test $LIBUNISTRING_VERSION_MINOR -lt $requested_version_minor \
+                      || { test $LIBUNISTRING_VERSION_MINOR -eq $requested_version_minor \
+                           && test $LIBUNISTRING_VERSION_SUBMINOR -lt $requested_version_subminor
+                         }
+                    }
+               }
+           ])
+       }
+  }])
+
+dnl gl_LIBUNISTRING_ARG_OR_ZERO([ARG], [ORIG]) expands to ARG if it is not the
+dnl same as ORIG, otherwise to 0.
+m4_define([gl_LIBUNISTRING_ARG_OR_ZERO], [m4_if([$1], [$2], [0], [$1])])
diff --git a/src/json.c b/src/json.c
index 689f6ac510..5d0c518138 100644
--- a/src/json.c
+++ b/src/json.c
@@ -26,6 +26,8 @@ along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
 
 #include <jansson.h>
 
+#include "unistr.h"
+
 #include "lisp.h"
 #include "buffer.h"
 #include "coding.h"
@@ -313,6 +315,17 @@ json_check (json_t *object)
   return object;
 }
 
+/* Unless unibyte STRING is valid UTF-8, signal `wrong-type-argument'
+   with predicate `utf-8-string-p'.  STRING must be a unibyte string.  */
+
+static void
+json_check_utf8 (Lisp_Object string)
+{
+  eassert (!STRING_MULTIBYTE (string));
+  CHECK_TYPE (u8_check (SDATA (string), SBYTES (string)) == NULL,
+              Qutf_8_string_p, string);
+}
+
 static json_t *lisp_to_json (Lisp_Object);
 
 /* Convert a Lisp object to a toplevel JSON object (array or object).
@@ -355,9 +368,12 @@ lisp_to_json_toplevel_1 (Lisp_Object lisp, json_t **json)
             int status = json_object_set_new (*json, SSDATA (key),
                                               lisp_to_json (HASH_VALUE (h, i)));
             if (status == -1)
-              /* FIXME: A failure here might also indicate that the
-                 key is not a valid Unicode string.  */
-              json_out_of_memory ();
+              {
+                /* A failure can be caused either by an invalid key or
+                   by low memory.  */
+                json_check_utf8 (key);
+                json_out_of_memory ();
+              }
           }
       clear_unwind_protect (count);
       return unbind_to (count, Qnil);
@@ -403,9 +419,15 @@ lisp_to_json (Lisp_Object lisp)
   else if (STRINGP (lisp))
     {
       Lisp_Object encoded = json_encode (lisp);
-      /* FIXME: We might throw an out-of-memory error here if the
-         string is not valid Unicode.  */
-      return json_check (json_stringn (SSDATA (encoded), SBYTES (encoded)));
+      json_t *json = json_stringn (SSDATA (encoded), SBYTES (encoded));
+      if (json == NULL)
+        {
+          /* A failure can be caused either by an invalid string or by
+             low memory.  */
+          json_check_utf8 (encoded);
+          json_out_of_memory ();
+        }
+      return json;
     }
 
   /* LISP now must be a vector or hashtable.  */
@@ -818,6 +840,7 @@ syms_of_json (void)
 
   DEFSYM (Qstring_without_embedded_nulls_p, "string-without-embedded-nulls-p");
   DEFSYM (Qjson_value_p, "json-value-p");
+  DEFSYM (Qutf_8_string_p, "utf-8-string-p");
 
   DEFSYM (Qutf_8_unix, "utf-8-unix");
 
diff --git a/test/src/json-tests.el b/test/src/json-tests.el
index 9884e9a2d5..9bdb639423 100644
--- a/test/src/json-tests.el
+++ b/test/src/json-tests.el
@@ -84,12 +84,10 @@
 
 (ert-deftest json-serialize/invalid-unicode ()
   (skip-unless (fboundp 'json-serialize))
-  ;; FIXME: "out of memory" is the wrong error signal, but we don't
-  ;; currently distinguish between error types when serializing.
-  (should-error (json-serialize ["a\uDBBBb"]) :type 'json-out-of-memory)
-  (should-error (json-serialize ["u\x110000v"]) :type 'json-out-of-memory)
-  (should-error (json-serialize ["u\x3FFFFFv"]) :type 'json-out-of-memory)
-  (should-error (json-serialize ["u\xCCv"]) :type 'json-out-of-memory))
+  (should-error (json-serialize ["a\uDBBBb"]) :type 'wrong-type-argument)
+  (should-error (json-serialize ["u\x110000v"]) :type 'wrong-type-argument)
+  (should-error (json-serialize ["u\x3FFFFFv"]) :type 'wrong-type-argument)
+  (should-error (json-serialize ["u\xCCv"]) :type 'wrong-type-argument))
 
 (ert-deftest json-parse-string/null ()
   (skip-unless (fboundp 'json-parse-string))
-- 
2.15.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]