bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[resend] unistr: New modules for backward iteration in string.


From: Ben Pfaff
Subject: [resend] unistr: New modules for backward iteration in string.
Date: Mon, 14 Feb 2011 22:45:43 -0800

This is the third time that I have sent this patch.  The first time
was in November.  The second time, which was slightly refined
compared to the first version, was in January.  This third posting
is identical to the second version, except that I rebased it.  I
have not re-tested the rebased version.

I sent a request for libunistring copyright assignment papers to
address@hidden in December, but I did not receive any response.  I
re-sent my request a few minutes ago.  But that request would only
be necessary for documentation updates, since I already have a
copyright assignment for Gnulib.

Thanks,

Ben.

--8<--------------------------cut here-------------------------->8--

From: Ben Pfaff <address@hidden>
Date: Sat, 1 Jan 2011 14:51:16 -0800
Subject: unistr: New modules for backward iteration in string.

New module 'unistr/u8-mb-prev-uc'.
* lib/unistr.in.h (u8_mb_prev_uc): New declaration.
(u8_mb_prev_uc_aux): New declaration.
* lib/unistr/u8-mb-prev-uc.c: New file.
* lib/unistr/u8-mb-prev-uc-aux.c: New file.
* tests/unistr/test-u8-mb-prev-uc.c: New file.
* modules/unistr/u8-mb-prev-uc: New file.
* modules/unistr/u8-mb-prev-uc-tests: New file.

New module 'unistr/u16-mb-prev-uc'.
* lib/unistr.in.h (u16_mb_prev_uc): New declaration.
(u16_mb_prev_uc_aux): New declaration.
* lib/unistr/u16-mb-prev-uc.c: New file.
* lib/unistr/u16-mb-prev-uc-aux.c: New file.
* tests/unistr/test-u16-mb-prev-uc.c: New file.
* modules/unistr/u16-mb-prev-uc: New file.
* modules/unistr/u16-mb-prev-uc-tests: New file.

New module 'unistr/u32-mb-prev-uc'.
* lib/unistr.in.h (u32_mb_prev_uc): New declaration.
* lib/unistr/u32-mb-prev-uc.c: New file.
* tests/unistr/test-u32-mb-prev-uc.c: New file.
* modules/unistr/u32-mb-prev-uc: New file.
* modules/unistr/u32-mb-prev-uc-tests: New file.
---
 ChangeLog                           |   27 ++++
 lib/unistr.in.h                     |   71 +++++++++
 lib/unistr/u16-mb-prev-uc-aux.c     |   52 +++++++
 lib/unistr/u16-mb-prev-uc.c         |   62 ++++++++
 lib/unistr/u32-mb-prev-uc.c         |   43 ++++++
 lib/unistr/u8-mb-prev-uc-aux.c      |  128 ++++++++++++++++
 lib/unistr/u8-mb-prev-uc.c          |  139 +++++++++++++++++
 modules/unistr/u16-mb-prev-uc       |   28 ++++
 modules/unistr/u16-mb-prev-uc-tests |   12 ++
 modules/unistr/u32-mb-prev-uc       |   27 ++++
 modules/unistr/u32-mb-prev-uc-tests |   12 ++
 modules/unistr/u8-mb-prev-uc        |   28 ++++
 modules/unistr/u8-mb-prev-uc-tests  |   14 ++
 tests/unistr/test-u16-mb-prev-uc.c  |   89 +++++++++++
 tests/unistr/test-u32-mb-prev-uc.c  |   89 +++++++++++
 tests/unistr/test-u8-mb-prev-uc.c   |  279 +++++++++++++++++++++++++++++++++++
 16 files changed, 1100 insertions(+), 0 deletions(-)
 create mode 100644 lib/unistr/u16-mb-prev-uc-aux.c
 create mode 100644 lib/unistr/u16-mb-prev-uc.c
 create mode 100644 lib/unistr/u32-mb-prev-uc.c
 create mode 100644 lib/unistr/u8-mb-prev-uc-aux.c
 create mode 100644 lib/unistr/u8-mb-prev-uc.c
 create mode 100644 modules/unistr/u16-mb-prev-uc
 create mode 100644 modules/unistr/u16-mb-prev-uc-tests
 create mode 100644 modules/unistr/u32-mb-prev-uc
 create mode 100644 modules/unistr/u32-mb-prev-uc-tests
 create mode 100644 modules/unistr/u8-mb-prev-uc
 create mode 100644 modules/unistr/u8-mb-prev-uc-tests
 create mode 100644 tests/unistr/test-u16-mb-prev-uc.c
 create mode 100644 tests/unistr/test-u32-mb-prev-uc.c
 create mode 100644 tests/unistr/test-u8-mb-prev-uc.c

diff --git a/ChangeLog b/ChangeLog
index 3382c60..b55b4b9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,30 @@
+2011-01-01  Ben Pfaff  <address@hidden>
+
+       New module 'unistr/u8-mb-prev-uc'.
+       * lib/unistr.in.h (u8_mb_prev_uc): New declaration.
+       (u8_mb_prev_uc_aux): New declaration.
+       * lib/unistr/u8-mb-prev-uc.c: New file.
+       * lib/unistr/u8-mb-prev-uc-aux.c: New file.
+       * tests/unistr/test-u8-mb-prev-uc.c: New file.
+       * modules/unistr/u8-mb-prev-uc: New file.
+       * modules/unistr/u8-mb-prev-uc-tests: New file.
+
+       New module 'unistr/u16-mb-prev-uc'.
+       * lib/unistr.in.h (u16_mb_prev_uc): New declaration.
+       (u16_mb_prev_uc_aux): New declaration.
+       * lib/unistr/u16-mb-prev-uc.c: New file.
+       * lib/unistr/u16-mb-prev-uc-aux.c: New file.
+       * tests/unistr/test-u16-mb-prev-uc.c: New file.
+       * modules/unistr/u16-mb-prev-uc: New file.
+       * modules/unistr/u16-mb-prev-uc-tests: New file.
+
+       New module 'unistr/u32-mb-prev-uc'.
+       * lib/unistr.in.h (u32_mb_prev_uc): New declaration.
+       * lib/unistr/u32-mb-prev-uc.c: New file.
+       * tests/unistr/test-u32-mb-prev-uc.c: New file.
+       * modules/unistr/u32-mb-prev-uc: New file.
+       * modules/unistr/u32-mb-prev-uc-tests: New file.
+
 2011-02-13  Bruno Haible  <address@hidden>
 
        mbsinit: Don't crash for a NULL argument.
diff --git a/lib/unistr.in.h b/lib/unistr.in.h
index c665aa8..b899172 100644
--- a/lib/unistr.in.h
+++ b/lib/unistr.in.h
@@ -294,6 +294,77 @@ extern int
        u32_mbtoucr (ucs4_t *puc, const uint32_t *s, size_t n);
 #endif
 
+/* Return the length (number of units) of the last character in the N units
+   at S, putting its 'ucs4_t' representation in *PUC.  Upon failure, *PUC is
+   set to 0xfffd, and an appropriate number of units is returned.
+   The number of available units, N, must be > 0.  */
+
+#if GNULIB_UNISTR_U8_MB_PREV_UC || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+       u8_mb_prev_uc (ucs4_t *puc, const uint8_t *s, size_t n);
+# else
+extern int
+       u8_mb_prev_uc_aux (ucs4_t *puc, const uint8_t *s, size_t n);
+static inline int
+u8_mb_prev_uc (ucs4_t *puc, const uint8_t *s, size_t n)
+{
+  uint8_t c = s[n - 1];
+
+  if (c < 0x80)
+    {
+      *puc = c;
+      return 1;
+    }
+  else
+    return u8_mb_prev_uc_aux (puc, s, n);
+}
+# endif
+#endif
+
+#if GNULIB_UNISTR_U16_MB_PREV_UC || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+       u16_mb_prev_uc (ucs4_t *puc, const uint16_t *s, size_t n);
+# else
+extern int
+       u16_mb_prev_uc_aux (ucs4_t *puc, const uint16_t *s, size_t n);
+static inline int
+u16_mb_prev_uc (ucs4_t *puc, const uint16_t *s, size_t n)
+{
+  uint16_t c = s[n - 1];
+
+  if (c < 0xd800 || c >= 0xe000)
+    {
+      *puc = c;
+      return 1;
+    }
+  else
+    return u16_mb_prev_uc_aux (puc, s, n);
+}
+# endif
+#endif
+
+#if GNULIB_UNISTR_U32_MB_PREV_UC || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+       u32_mb_prev_uc (ucs4_t *puc, const uint32_t *s, size_t n);
+# else
+static inline int
+u32_mb_prev_uc (ucs4_t *puc, const uint32_t *s, size_t n)
+{
+  uint32_t c = s[n - 1];
+
+  if (c < 0xd800 || (c >= 0xe000 && c < 0x110000))
+    *puc = c;
+  else
+    /* surrogate or value above 0x10ffff: not a valid character */
+    *puc = 0xfffd;
+  return 1;
+}
+# endif
+#endif
+
 /* Put the multibyte character represented by UC in S, returning its
    length.  Return -1 upon failure, -2 if the number of available units, N,
    is too small.  The latter case cannot occur if N >= 6/2/1, respectively.  */
diff --git a/lib/unistr/u16-mb-prev-uc-aux.c b/lib/unistr/u16-mb-prev-uc-aux.c
new file mode 100644
index 0000000..eeab787
--- /dev/null
+++ b/lib/unistr/u16-mb-prev-uc-aux.c
@@ -0,0 +1,52 @@
+/* Look at last character in UTF-16 string.
+   Copyright (C) 1999-2002, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+   Written by Ben Pfaff <address@hidden>, 2011,
+   based on code by Bruno Haible <address@hidden>, 2001.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "unistr.h"
+
+#if defined IN_LIBUNISTRING || HAVE_INLINE
+
+int
+u16_mb_prev_uc_aux (ucs4_t *puc, const uint16_t *s, size_t n)
+{
+  uint16_t c = s[n - 1];
+
+  if (c >= 0xdc00)
+    {
+      if (n >= 2)
+        {
+          if (s[n - 2] >= 0xd800 && s[n - 2] < 0xdc00)
+            {
+              *puc = 0x10000 + ((s[n - 2] - 0xd800) << 10) + (c - 0xdc00);
+              return 2;
+            }
+          /* invalid multibyte character */
+        }
+      else
+        {
+          /* incomplete multibyte character */
+        }
+    }
+  /* invalid multibyte character */
+  *puc = 0xfffd;
+  return 1;
+}
+
+#endif
diff --git a/lib/unistr/u16-mb-prev-uc.c b/lib/unistr/u16-mb-prev-uc.c
new file mode 100644
index 0000000..3511666
--- /dev/null
+++ b/lib/unistr/u16-mb-prev-uc.c
@@ -0,0 +1,62 @@
+/* Look at last character in UTF-16 string.
+   Copyright (C) 1999-2002, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+   Written by Ben Pfaff <address@hidden>, 2011,
+   based on code by Bruno Haible <address@hidden>, 2001.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#if defined IN_LIBUNISTRING
+/* Tell unistr.h to declare u16_mb_prev_uc as 'extern', not 'static inline'. */
+# include "unistring-notinline.h"
+#endif
+
+/* Specification.  */
+#include "unistr.h"
+
+#if !HAVE_INLINE
+
+int
+u16_mb_prev_uc (ucs4_t *puc, const uint16_t *s, size_t n)
+{
+  uint16_t c = s[n - 1];
+
+  if (c < 0xd800 || c >= 0xe000)
+    {
+      *puc = c;
+      return 1;
+    }
+  if (c >= 0xdc00)
+    {
+      if (n >= 2)
+        {
+          if (s[n - 2] >= 0xd800 && s[n - 2] < 0xdc00)
+            {
+              *puc = 0x10000 + ((s[n - 2] - 0xd800) << 10) + (c - 0xdc00);
+              return 2;
+            }
+          /* invalid multibyte character */
+        }
+      else
+        {
+          /* incomplete multibyte character */
+        }
+    }
+  /* invalid multibyte character */
+  *puc = 0xfffd;
+  return 1;
+}
+
+#endif
diff --git a/lib/unistr/u32-mb-prev-uc.c b/lib/unistr/u32-mb-prev-uc.c
new file mode 100644
index 0000000..398827b
--- /dev/null
+++ b/lib/unistr/u32-mb-prev-uc.c
@@ -0,0 +1,43 @@
+/* Look at last character in UTF-32 string.
+   Copyright (C) 2002, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+   Written by Bruno Haible <address@hidden>, 2002.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#if defined IN_LIBUNISTRING
+/* Tell unistr.h to declare u32_mb_prev_uc as 'extern', not 'static inline'. */
+# include "unistring-notinline.h"
+#endif
+
+/* Specification.  */
+#include "unistr.h"
+
+#if !HAVE_INLINE
+
+int
+u32_mb_prev_uc (ucs4_t *puc, const uint32_t *s, size_t n)
+{
+  uint32_t c = s[n - 1];
+
+  if (c < 0xd800 || (c >= 0xe000 && c < 0x110000))
+    *puc = c;
+  else
+    /* invalid multibyte character */
+    *puc = 0xfffd;
+  return 1;
+}
+
+#endif
diff --git a/lib/unistr/u8-mb-prev-uc-aux.c b/lib/unistr/u8-mb-prev-uc-aux.c
new file mode 100644
index 0000000..296a583
--- /dev/null
+++ b/lib/unistr/u8-mb-prev-uc-aux.c
@@ -0,0 +1,128 @@
+/* Look at last character in UTF-8 string.
+   Copyright (C) 2001-2002, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+   Written by Ben Pfaff <address@hidden>, 2010,
+   based on code by Bruno Haible <address@hidden>, 2001.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "unistr.h"
+
+#if defined IN_LIBUNISTRING || HAVE_INLINE
+
+int
+u8_mb_prev_uc_aux (ucs4_t *puc, const uint8_t *s, size_t n)
+{
+  uint8_t c_1 = s[n - 1];
+
+#if CONFIG_UNICODE_SAFETY
+  if (c_1 <= 0xbf)
+#endif
+    {
+      if (n >= 2)
+        {
+          uint8_t c_2 = s[n - 2];
+
+          if ((c_2 ^ 0x80) >= 0x40)
+            {
+#if CONFIG_UNICODE_SAFETY
+              if (c_2 >= 0xc2 && c_2 < 0xe0)
+#endif
+                {
+                  *puc = ((unsigned int) (c_2 & 0x1f) << 6)
+                         | (unsigned int) (c_1 ^ 0x80);
+                  return 2;
+                }
+#if CONFIG_UNICODE_SAFETY
+              if (c_2 >= 0xe0 && c_2 < 0xf8)
+                {
+                  /* incomplete multibyte character */
+                  *puc = 0xfffd;
+                  return 2;
+                }
+#endif
+            }
+          else if (n >= 3)
+            {
+              uint8_t c_3 = s[n - 3];
+
+              if ((c_3 ^ 0x80) >= 0x40)
+                {
+#if CONFIG_UNICODE_SAFETY
+                  if ((c_3 == 0xe0 && c_2 >= 0xa0)
+                      || (c_3 >= 0xe1 && c_3 < 0xed)
+                      || (c_3 == 0xed && c_2 < 0xa0)
+                      || (c_3 >= 0xee && c_3 < 0xf0))
+#endif
+                    {
+                      *puc = ((unsigned int) (c_3 & 0x0f) << 12)
+                             | (unsigned int) ((c_2 ^ 0x80) << 6)
+                             | (unsigned int) (c_1 ^ 0x80);
+                      return 3;
+                    }
+#if CONFIG_UNICODE_SAFETY
+                  if (c_3 >= 0xe0 && c_3 < 0xf8)
+                    {
+                      /* 0xe0: overlong sequence.
+                         0xe1...0xec: not reached.
+                         0xed: UTF-16 surrogate.
+                         0xee...0xef: not reached.
+                         0xf0...0xf7: incomplete multibyte character. */
+                      *puc = 0xfffd;
+                      return 3;
+                    }
+#endif
+                }
+              else if (n >= 4)
+                {
+                  uint8_t c_4 = s[n - 4];
+
+                  if ((c_4 ^ 0x80) >= 0x40)
+                    {
+#if CONFIG_UNICODE_SAFETY
+                      if ((c_4 == 0xf0 && c_3 >= 0x90)
+                          || (c_4 >= 0xf1 && c_4 < 0xf4)
+                          || (c_4 == 0xf4 && c_3 < 0x90))
+#endif
+                        {
+                          *puc = (unsigned int) ((c_4 & 0x07) << 18)
+                                 | (unsigned int) ((c_3 ^ 0x80) << 12)
+                                 | (unsigned int) ((c_2 ^ 0x80) << 6)
+                                 | (unsigned int) (c_1 ^ 0x80);
+                          return 4;
+                        }
+#if CONFIG_UNICODE_SAFETY
+                      if (c_4 >= 0xf0 && c_4 < 0xf8)
+                        {
+                          /* 0xf0: overlong sequence.
+                             0xf1...0xf3: not reached.
+                             0xf4...0xf7: invalid code point above U+10FFFF */
+                          *puc = 0xfffd;
+                          return 4;
+                        }
+#endif
+                    }
+                }
+            }
+        }
+    }
+
+  /* invalid or incomplete multibyte character */
+  *puc = 0xfffd;
+  return 1;
+}
+
+#endif
diff --git a/lib/unistr/u8-mb-prev-uc.c b/lib/unistr/u8-mb-prev-uc.c
new file mode 100644
index 0000000..41eaf2b
--- /dev/null
+++ b/lib/unistr/u8-mb-prev-uc.c
@@ -0,0 +1,139 @@
+/* Look at last character in UTF-8 string.
+   Copyright (C) 2001-2002, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+   Written by Ben Pfaff <address@hidden>, 2010,
+   based on code by Bruno Haible <address@hidden>, 2001.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#if defined IN_LIBUNISTRING
+/* Tell unistr.h to declare u8_mb_prev_uc as 'extern', not 'static inline'.  */
+# include "unistring-notinline.h"
+#endif
+
+/* Specification.  */
+#include "unistr.h"
+
+#if !HAVE_INLINE
+
+int
+u8_mb_prev_uc (ucs4_t *puc, const uint8_t *s, size_t n)
+{
+  uint8_t c_1 = s[n - 1];
+
+  if (c_1 < 0x80)
+    {
+      *puc = c_1;
+      return 1;
+    }
+
+#if CONFIG_UNICODE_SAFETY
+  if (c_1 <= 0xbf)
+#endif
+    {
+      if (n >= 2)
+        {
+          uint8_t c_2 = s[n - 2];
+
+          if ((c_2 ^ 0x80) >= 0x40)
+            {
+#if CONFIG_UNICODE_SAFETY
+              if (c_2 >= 0xc2 && c_2 < 0xe0)
+#endif
+                {
+                  *puc = ((unsigned int) (c_2 & 0x1f) << 6)
+                         | (unsigned int) (c_1 ^ 0x80);
+                  return 2;
+                }
+#if CONFIG_UNICODE_SAFETY
+              if (c_2 >= 0xe0 && c_2 < 0xf8)
+                {
+                  /* incomplete multibyte character */
+                  *puc = 0xfffd;
+                  return 2;
+                }
+#endif
+            }
+          else if (n >= 3)
+            {
+              uint8_t c_3 = s[n - 3];
+
+              if ((c_3 ^ 0x80) >= 0x40)
+                {
+#if CONFIG_UNICODE_SAFETY
+                  if ((c_3 == 0xe0 && c_2 >= 0xa0)
+                      || (c_3 >= 0xe1 && c_3 < 0xed)
+                      || (c_3 == 0xed && c_2 < 0xa0)
+                      || (c_3 >= 0xee && c_3 < 0xf0))
+#endif
+                    {
+                      *puc = ((unsigned int) (c_3 & 0x0f) << 12)
+                             | (unsigned int) ((c_2 ^ 0x80) << 6)
+                             | (unsigned int) (c_1 ^ 0x80);
+                      return 3;
+                    }
+#if CONFIG_UNICODE_SAFETY
+                  if (c_3 >= 0xe0 && c_3 < 0xf8)
+                    {
+                      /* 0xe0: overlong sequence.
+                         0xe1...0xec: not reached.
+                         0xed: UTF-16 surrogate.
+                         0xee...0xef: not reached.
+                         0xf0...0xf7: incomplete multibyte character. */
+                      *puc = 0xfffd;
+                      return 3;
+                    }
+#endif
+                }
+              else if (n >= 4)
+                {
+                  uint8_t c_4 = s[n - 4];
+
+                  if ((c_4 ^ 0x80) >= 0x40)
+                    {
+#if CONFIG_UNICODE_SAFETY
+                      if ((c_4 == 0xf0 && c_3 >= 0x90)
+                          || (c_4 >= 0xf1 && c_4 < 0xf4)
+                          || (c_4 == 0xf4 && c_3 < 0x90))
+#endif
+                        {
+                          *puc = (unsigned int) ((c_4 & 0x07) << 18)
+                                 | (unsigned int) ((c_3 ^ 0x80) << 12)
+                                 | (unsigned int) ((c_2 ^ 0x80) << 6)
+                                 | (unsigned int) (c_1 ^ 0x80);
+                          return 4;
+                        }
+#if CONFIG_UNICODE_SAFETY
+                      if (c_4 >= 0xf0 && c_4 < 0xf8)
+                        {
+                          /* 0xf0: overlong sequence.
+                             0xf1...0xf3: not reached.
+                             0xf4...0xf7: invalid code point above U+10FFFF */
+                          *puc = 0xfffd;
+                          return 4;
+                        }
+#endif
+                    }
+                }
+            }
+        }
+    }
+
+  /* invalid or incomplete multibyte character */
+  *puc = 0xfffd;
+  return 1;
+}
+
+#endif
diff --git a/modules/unistr/u16-mb-prev-uc b/modules/unistr/u16-mb-prev-uc
new file mode 100644
index 0000000..508fc72
--- /dev/null
+++ b/modules/unistr/u16-mb-prev-uc
@@ -0,0 +1,28 @@
+Description:
+Look at last character in UTF-16 string.
+
+Files:
+lib/unistr/u16-mb-prev-uc.c
+lib/unistr/u16-mb-prev-uc-aux.c
+
+Depends-on:
+unistr/base
+
+configure.ac:
+gl_MODULE_INDICATOR([unistr/u16-mb-prev-uc])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u16-mb-prev-uc])
+
+Makefile.am:
+if LIBUNISTRING_COMPILE_UNISTR_U16_MB_PREV_UC
+lib_SOURCES += unistr/u16-mb-prev-uc.c unistr/u16-mb-prev-uc-aux.c
+endif
+
+Include:
+"unistr.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible, Ben Pfaff
+
diff --git a/modules/unistr/u16-mb-prev-uc-tests b/modules/unistr/u16-mb-prev-uc-tests
new file mode 100644
index 0000000..a9f504f
--- /dev/null
+++ b/modules/unistr/u16-mb-prev-uc-tests
@@ -0,0 +1,12 @@
+Files:
+tests/unistr/test-u16-mb-prev-uc.c
+
+Depends-on:
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-u16-mb-prev-uc
+check_PROGRAMS += test-u16-mb-prev-uc
+test_u16_mb_prev_uc_SOURCES = unistr/test-u16-mb-prev-uc.c
+test_u16_mb_prev_uc_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/modules/unistr/u32-mb-prev-uc b/modules/unistr/u32-mb-prev-uc
new file mode 100644
index 0000000..ad7974a
--- /dev/null
+++ b/modules/unistr/u32-mb-prev-uc
@@ -0,0 +1,27 @@
+Description:
+Look at last character in UTF-32 string.
+
+Files:
+lib/unistr/u32-mb-prev-uc.c
+
+Depends-on:
+unistr/base
+
+configure.ac:
+gl_MODULE_INDICATOR([unistr/u32-mb-prev-uc])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u32-mb-prev-uc])
+
+Makefile.am:
+if LIBUNISTRING_COMPILE_UNISTR_U32_MB_PREV_UC
+lib_SOURCES += unistr/u32-mb-prev-uc.c
+endif
+
+Include:
+"unistr.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible, Ben Pfaff
+
diff --git a/modules/unistr/u32-mb-prev-uc-tests b/modules/unistr/u32-mb-prev-uc-tests
new file mode 100644
index 0000000..e1e45c8
--- /dev/null
+++ b/modules/unistr/u32-mb-prev-uc-tests
@@ -0,0 +1,12 @@
+Files:
+tests/unistr/test-u32-mb-prev-uc.c
+
+Depends-on:
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-u32-mb-prev-uc
+check_PROGRAMS += test-u32-mb-prev-uc
+test_u32_mb_prev_uc_SOURCES = unistr/test-u32-mb-prev-uc.c
+test_u32_mb_prev_uc_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/modules/unistr/u8-mb-prev-uc b/modules/unistr/u8-mb-prev-uc
new file mode 100644
index 0000000..2a12805
--- /dev/null
+++ b/modules/unistr/u8-mb-prev-uc
@@ -0,0 +1,28 @@
+Description:
+Look at last character in UTF-8 string.
+
+Files:
+lib/unistr/u8-mb-prev-uc.c
+lib/unistr/u8-mb-prev-uc-aux.c
+
+Depends-on:
+unistr/base
+
+configure.ac:
+gl_MODULE_INDICATOR([unistr/u8-mb-prev-uc])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u8-mb-prev-uc])
+
+Makefile.am:
+if LIBUNISTRING_COMPILE_UNISTR_U8_MB_PREV_UC
+lib_SOURCES += unistr/u8-mb-prev-uc.c unistr/u8-mb-prev-uc-aux.c
+endif
+
+Include:
+"unistr.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible, Ben Pfaff
+
diff --git a/modules/unistr/u8-mb-prev-uc-tests b/modules/unistr/u8-mb-prev-uc-tests
new file mode 100644
index 0000000..66a593a
--- /dev/null
+++ b/modules/unistr/u8-mb-prev-uc-tests
@@ -0,0 +1,14 @@
+Files:
+tests/unistr/test-u8-mb-prev-uc.c
+tests/macros.h
+
+Depends-on:
+unistr/u8-mbtouc
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-u8-mb-prev-uc
+check_PROGRAMS += test-u8-mb-prev-uc
+test_u8_mb_prev_uc_SOURCES = unistr/test-u8-mb-prev-uc.c
+test_u8_mb_prev_uc_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/tests/unistr/test-u16-mb-prev-uc.c b/tests/unistr/test-u16-mb-prev-uc.c
new file mode 100644
index 0000000..7f85e98
--- /dev/null
+++ b/tests/unistr/test-u16-mb-prev-uc.c
@@ -0,0 +1,89 @@
+/* Test of u16_mb_prev_uc() function.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Written by Ben Pfaff, 2011.  */
+
+#include <config.h>
+
+#include "unistr.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Collect the int varargs up to the terminating -1 into a UTF-16 string and
+   abort unless u16_mb_prev_uc returns EXPECT_LEN and EXPECT_UC for it.  */
+static void
+test_u16_mb_prev_uc (int expect_len, ucs4_t expect_uc, ...)
+{
+  uint16_t s[16];
+  va_list args;
+  size_t n;
+  ucs4_t uc;
+  int len;
+
+  va_start (args, expect_uc);
+  n = 0;
+  for (;;)
+    {
+      int unit = va_arg (args, int);
+      if (unit == -1)
+        break;
+      else if (n >= sizeof s / sizeof *s)
+        abort ();
+
+      s[n++] = unit;
+    }
+  va_end (args);
+
+  len = u16_mb_prev_uc (&uc, s, n);
+  if (len != expect_len || uc != expect_uc)
+    {
+      size_t i;
+
+      fprintf (stderr, "u16_mb_prev_uc returned length %d and U+%04x, "
+               "expected length %d and U+%04x:",
+               len, (unsigned int) uc, expect_len, (unsigned int) expect_uc);
+      for (i = 0; i < n; i++)
+        fprintf (stderr, " %04x", (unsigned int) s[i]);
+      putc ('\n', stderr);
+      fflush (stderr);
+      abort ();
+    }
+}
+
+int
+main (void)
+{
+  /* Valid single-unit sequences. */
+  test_u16_mb_prev_uc (1, 'a',     'a', -1);
+  test_u16_mb_prev_uc (1, 0x3042,  0x3042, -1);
+  test_u16_mb_prev_uc (1, 'b',     'a', 'b', -1);
+  test_u16_mb_prev_uc (1, 'x',     0x3042, 'x', -1);
+
+  /* Valid surrogate pairs. */
+  test_u16_mb_prev_uc (2, 0x1f610, 0xd83d, 0xde10, -1);
+  test_u16_mb_prev_uc (2, 0x1f610, 'x', 0xd83d, 0xde10, -1);
+
+  /* Invalid surrogate pairs. */
+  test_u16_mb_prev_uc (1, 0xfffd,  0xd800, -1);
+  test_u16_mb_prev_uc (1, 0xfffd,  'a', 0xd800, -1);
+  test_u16_mb_prev_uc (1, 0xfffd,  0xdeff, -1);
+  test_u16_mb_prev_uc (1, 0xfffd,  'b', 0xdeff, -1);
+
+  return 0;
+}
diff --git a/tests/unistr/test-u32-mb-prev-uc.c b/tests/unistr/test-u32-mb-prev-uc.c
new file mode 100644
index 0000000..6666877
--- /dev/null
+++ b/tests/unistr/test-u32-mb-prev-uc.c
@@ -0,0 +1,89 @@
+/* Test of u32_mb_prev_uc() function.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Written by Ben Pfaff, 2011.  */
+
+#include <config.h>
+
+#include "unistr.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Collect the int varargs up to the terminating -1 into a UTF-32 string and
+   abort unless u32_mb_prev_uc returns EXPECT_LEN and EXPECT_UC for it.  */
+static void
+test_u32_mb_prev_uc (int expect_len, ucs4_t expect_uc, ...)
+{
+  uint32_t s[16];
+  va_list args;
+  size_t n;
+  ucs4_t uc;
+  int len;
+
+  va_start (args, expect_uc);
+  n = 0;
+  for (;;)
+    {
+      int unit = va_arg (args, int);
+      if (unit == -1)
+        break;
+      else if (n >= sizeof s / sizeof *s)
+        abort ();
+
+      s[n++] = unit;
+    }
+  va_end (args);
+
+  len = u32_mb_prev_uc (&uc, s, n);
+  if (len != expect_len || uc != expect_uc)
+    {
+      size_t i;
+
+      fprintf (stderr, "u32_mb_prev_uc returned length %d and U+%04x, "
+               "expected length %d and U+%04x:",
+               len, (unsigned int) uc, expect_len, (unsigned int) expect_uc);
+      for (i = 0; i < n; i++)
+        fprintf (stderr, " %04x", (unsigned int) s[i]);
+      putc ('\n', stderr);
+      fflush (stderr);
+      abort ();
+    }
+}
+
+int
+main (void)
+{
+  /* Valid. */
+  test_u32_mb_prev_uc (1, 'a',     'a', -1);
+  test_u32_mb_prev_uc (1, 0x3042,  0x3042, -1);
+  test_u32_mb_prev_uc (1, 'b',     'a', 'b', -1);
+  test_u32_mb_prev_uc (1, 'x',     0x3042, 'x', -1);
+
+  /* Surrogate pairs are invalid in UTF-32. */
+  test_u32_mb_prev_uc (1, 0xfffd,  0xd83d, 0xde10, -1);
+  test_u32_mb_prev_uc (1, 0xfffd,  'x', 0xd83d, 0xde10, -1);
+
+  /* Malformed surrogate pairs are doubly invalid in UTF-32. */
+  test_u32_mb_prev_uc (1, 0xfffd,  0xd800, -1);
+  test_u32_mb_prev_uc (1, 0xfffd,  'a', 0xd800, -1);
+  test_u32_mb_prev_uc (1, 0xfffd,  0xdeff, -1);
+  test_u32_mb_prev_uc (1, 0xfffd,  'b', 0xdeff, -1);
+
+  return 0;
+}
diff --git a/tests/unistr/test-u8-mb-prev-uc.c b/tests/unistr/test-u8-mb-prev-uc.c
new file mode 100644
index 0000000..fd092ca
--- /dev/null
+++ b/tests/unistr/test-u8-mb-prev-uc.c
@@ -0,0 +1,279 @@
+/* Test of u8_mb_prev_uc() function.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Written by Ben Pfaff, 2010.  */
+
+#include <config.h>
+
+#include "unistr.h"
+
+#include <assert.h>
+
+#include "macros.h"
+
+/* One decoded character: where it was found and what it decoded to. */
+struct uc
+  {
+    /* UTF-8 representation: first unit and number of units. */
+    const uint8_t *s;
+    int n;
+    /* Code point that the units decoded to. */
+    ucs4_t uc;
+  };
+
+/* Prints TITLE followed by the N elements of UC on stderr, all on one line.
+   Each element appears as its UTF-8 units in hex between angle brackets,
+   then its code point in U+XXXX form. */
+static void
+print_ucs (const char *title, const struct uc *uc, size_t n)
+{
+  fprintf (stderr, "%s:", title);
+  for (; n-- > 0; uc++)
+    {
+      int i;   /* Matches the type of uc->n, which bounds it below. */
+      fprintf (stderr, " <");
+      for (i = 0; i < uc->n; i++)
+        {
+          if (i > 0)
+            putc (' ', stderr);
+          fprintf (stderr, "%02x", (unsigned int) uc->s[i]);
+        }
+      fprintf (stderr, "> U+%04X", (unsigned int) uc->uc);
+    }
+  putc ('\n', stderr);
+}
+
+/* Reverses the N elements of UC in place. */
+static void
+reverse_ucs (struct uc *uc, size_t n)
+{
+  size_t lo, hi;
+
+  /* Swap the outermost pair and work inward until the indexes meet. */
+  for (lo = 0, hi = n; lo + 1 < hi; lo++)
+    {
+      struct uc saved = uc[lo];
+      uc[lo] = uc[--hi];
+      uc[hi] = saved;
+    }
+}
+
+/* Returns true if A and B have equal lengths and identical elements. */
+static bool
+equal_ucs (const struct uc *a, size_t n_a, const struct uc *b, size_t n_b)
+{
+  size_t i;
+  for (i = 0; n_a == n_b && i < n_a; i++)
+    if (a[i].s != b[i].s || a[i].n != b[i].n || a[i].uc != b[i].uc)
+      return false;   /* Differ in position, length, or code point. */
+  return n_a == n_b;
+}
+
+/* Checks that the N units in S yield the same code points, at the same
+   positions and with the same lengths, whether decoded forward with
+   u8_mbtouc() or backward with u8_mb_prev_uc().  On a mismatch, prints both
+   decodings on stderr and aborts.  N must not exceed 16. */
+static void
+check_bidirectionally (const uint8_t *s, int n)
+{
+  struct uc ucf[16];
+  struct uc ucr[16];
+  int n_ucf, n_ucr, used;
+
+  /* Each character consumes at least one unit, so N bounds both counts.
+     ASSERT (macros.h) is used so that NDEBUG cannot compile the check out. */
+  ASSERT (n <= SIZEOF (ucf) && n <= SIZEOF (ucr));
+
+  /* Translate units to code points forward. */
+  used = n_ucf = 0;
+  while (used < n)
+    {
+      struct uc *uc = &ucf[n_ucf++];
+      uc->s = &s[used];
+      uc->n = u8_mbtouc (&uc->uc, uc->s, n - used);
+      ASSERT (uc->n >= 1);
+      ASSERT (uc->n <= n - used);
+      used += uc->n;
+    }
+
+  /* Translate units to code points backward. */
+  used = n_ucr = 0;
+  while (used < n)
+    {
+      struct uc *uc = &ucr[n_ucr++];
+      uc->n = u8_mb_prev_uc (&uc->uc, s, n - used);
+      ASSERT (uc->n >= 1);
+      ASSERT (uc->n <= n - used);
+      used += uc->n;
+      uc->s = &s[n - used];
+    }
+  reverse_ucs (ucr, n_ucr);
+
+  /* Check that the results were the same. */
+  if (!equal_ucs (ucf, n_ucf, ucr, n_ucr))
+    {
+      fprintf (stderr, "%s:%d: forward and reverse differ\n",
+               __FILE__, __LINE__);
+      print_ucs ("forward", ucf, n_ucf);
+      print_ucs ("reverse", ucr, n_ucr);
+      fflush (stderr);
+      abort ();
+    }
+}
+
+#if CONFIG_UNICODE_SAFETY
+/* Stores every combination of the units below into S[0..N-1] and, for each
+   combination, passes the string running from START to the end of S to
+   check_bidirectionally().  With N == 0, checks that string as it stands. */
+static void
+do_exhaustive_test (const uint8_t *start, uint8_t *s, int n)
+{
+  static const uint8_t units[] = {
+    /* The smallest value in each class.  (Any other member would do.) */
+    0x00, 0x80, 0x90, 0xa0, 0xc0, 0xc2, 0xe0, 0xe1, 0xed, 0xee, 0xf0, 0xf1,
+    0xf4, 0xf5,
+    /* The UTF-8 units that make up U+FFFD, since that is such a special value
+       for these routines. */
+    0xef, 0xbf, 0xbd
+  };
+  size_t i;
+
+  if (n == 0)
+    check_bidirectionally (start, s - start);
+  else
+    for (i = 0; i < SIZEOF (units); i++)
+      {
+        s[0] = units[i];
+        do_exhaustive_test (start, s + 1, n - 1);
+      }
+}
+
+/* Exhaustively compares how u8_mbtouc() and u8_mb_prev_uc() treat all UTF-8
+   well-formed and ill-formed sequences of MAX_LENGTH or fewer units, where
+   MAX_LENGTH is at most 16.  To finish in a reasonable amount of time, it
+   relies on many UTF-8 unit values falling into classes whose members are
+   all treated the same way, so only one member of each class is tested. */
+static void
+exhaustive_test (int max_length)
+{
+  uint8_t s[16];
+  int length;
+
+  ASSERT (max_length <= SIZEOF (s));   /* Active even under NDEBUG. */
+  for (length = 0; length <= max_length; length++)
+    do_exhaustive_test (s, s, length);
+}
+#endif  /* CONFIG_UNICODE_SAFETY */
+
+/* Recursive helper for well_formed_test().  The string under construction
+   starts at START; S points just past the units stored so far, and N is the
+   number of units that remain to be filled.  When N is 0 the finished string
+   is passed to check_bidirectionally(); otherwise each sample character
+   below that fits in N units is stored at S in turn and the remainder is
+   filled in recursively. */
+static void
+do_well_formed_test (const uint8_t *start, uint8_t *s, int n)
+{
+  /* Longest sample character, in units. */
+  enum { MAX_SAMPLE_UNITS = 4 };
+
+  /* Sample well-formed characters, shortest first.  They are tried in table
+     order, so this order also fixes the order in which strings are checked.
+     Elements of `units' beyond `n_units' are zero-filled by the initializer
+     and are never read. */
+  static const struct
+    {
+      int n_units;
+      uint8_t units[MAX_SAMPLE_UNITS];
+    }
+  samples[] =
+    {
+      /* Single-byte characters. */
+      { 1, { 0x00 } },
+      { 1, { 0x41 } },
+
+      /* 2-byte characters. */
+      { 2, { 0xc2, 0xb0 } },
+
+      /* 3-byte characters. */
+      { 3, { 0xe0, 0xa0, 0xa5 } },
+      { 3, { 0xe5, 0xbf, 0x81 } },
+      { 3, { 0xed, 0x9f, 0x99 } },
+
+      /* 4-byte characters. */
+      { 4, { 0xf0, 0x90, 0xbb, 0x80 } },
+      { 4, { 0xf2, 0x80, 0xbf, 0x80 } },
+      { 4, { 0xf4, 0x8f, 0x80, 0xbf } }
+    };
+  size_t i;
+
+  /* Nothing left to fill: the string is complete, so check it. */
+  if (n == 0)
+    {
+      check_bidirectionally (start, s - start);
+      return;
+    }
+
+  for (i = 0; i < sizeof samples / sizeof samples[0]; i++)
+    {
+      const uint8_t *units = samples[i].units;
+      int len = samples[i].n_units;
+      int j;
+
+      /* Skip characters that do not fit in the remaining N units. */
+      if (len > n)
+        continue;
+
+      /* Store the character at S, then fill whatever room is left. */
+      for (j = 0; j < len; j++)
+        s[j] = units[j];
+      do_well_formed_test (start, s + len, n - len);
+    }
+}
+
+/* Checks every string of exactly 0, 1, ..., MAX_LENGTH units that can be
+   built from the sample characters; MAX_LENGTH must not exceed 16. */
+static void
+well_formed_test (int max_length)
+{
+  uint8_t s[16];
+  int length;
+
+  ASSERT (max_length <= SIZEOF (s));   /* Active even under NDEBUG. */
+  for (length = 0; length <= max_length; length++)
+    do_well_formed_test (s, s, length);
+}
+
+int
+main (void)
+{
+#if CONFIG_UNICODE_SAFETY
+  /* Compare forward and backward decoding of strings of up to 5 units drawn
+     from a set of representative values.  This only passes if Unicode safety
+     was compiled in, because most of those strings are ill-formed UTF-8.
+     Runtime increases exponentially with the argument: 4 runs in a fraction
+     of a second, 5 in a few seconds, 6 in half a minute. */
+  exhaustive_test (5);
+#endif
+
+  /* Do the same for strings of up to 10 units built only from well-formed
+     characters, which should always pass.  (Both arguments are limited to
+     16 by the buffers in the callees.)  Runtime increases exponentially but
+     much more slowly than with exhaustive_test(). */
+  well_formed_test (10);
+
+  return 0;
+}
-- 
1.7.2.3




reply via email to

[Prev in Thread] Current Thread [Next in Thread]