From b93de66735cd6f935ee0970f8cb26908d113e09d Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Thu, 7 Sep 2023 14:51:55 -0700
Subject: [PATCH 1/7] mcel: new module

* lib/mcel.c, lib/mcel.h, modules/mcel: New files.
---
 ChangeLog    |   5 +
 lib/mcel.c   |   3 +
 lib/mcel.h   | 294 +++++++++++++++++++++++++++++++++++++++++++++++++++
 modules/mcel |  34 ++++++
 4 files changed, 336 insertions(+)
 create mode 100644 lib/mcel.c
 create mode 100644 lib/mcel.h
 create mode 100644 modules/mcel

diff --git a/ChangeLog b/ChangeLog
index d5fc6c2130..d477347b91 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2023-09-07  Paul Eggert  <eggert@cs.ucla.edu>
+
+	mcel: new module
+	* lib/mcel.c, lib/mcel.h, modules/mcel: New files.
+
 2023-09-07  Bruno Haible  <bruno@clisp.org>
 
 	Don't use 'throw ()' in C++ 11 or newer.
diff --git a/lib/mcel.c b/lib/mcel.c
new file mode 100644
index 0000000000..3c2ae46290
--- /dev/null
+++ b/lib/mcel.c
@@ -0,0 +1,3 @@
+#include <config.h>
+#define MCEL_INLINE _GL_EXTERN_INLINE
+#include "mcel.h"
diff --git a/lib/mcel.h b/lib/mcel.h
new file mode 100644
index 0000000000..400604f8b2
--- /dev/null
+++ b/lib/mcel.h
@@ -0,0 +1,294 @@
+/* Multi-byte characters, Error encodings, and Lengths (MCELs)
+   Copyright 2023 Free Software Foundation, Inc.
+
+   This file is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 2.1 of the
+   License, or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Written by Paul Eggert.  */
+
+/* The types and inline functions in this file implement multi-byte
+   character representation and forward iteration through a multi-byte string.
+   They are simpler and can be faster than the mbiter family.
+   However, they do not support obsolescent encodings like CP864,
+   EBCDIC, Johab, and Shift JIS that glibc also does not support,
+   and it is up to the caller to coalesce encoding-error bytes if desired.
+
+   The mcel_scan function lets code iterate through an array of bytes,
+   supporting character encodings in practical use
+   more simply than using plain mbrtoc32.
+
+   Instead of this single-byte code:
+
+      char *p = ..., *lim = ...;
+      for (; p < lim; p++)
+        process (*p);
+
+   You can use this multi-byte code:
+
+      char *p = ..., *lim = ...;
+      for (mcel_t g; p < lim; p += g.len)
+        {
+	  g = mcel_scan (p, lim);
+	  process (g);
+	}
+
+   You can select from G using G.ch, G.err, and G.len.
+   G is an encoding error if G.err is nonzero, a character otherwise.
+
+   The mcel_scanz function is similar except it works with a
+   string of unknown but positive length that is terminated with '\0'.
+   Instead of this single-byte code:
+
+      char *p = ...;
+      for (; *p; p++)
+	process (*p);
+
+   You can use this multi-byte code:
+
+      char *p = ...;
+      for (mcel_t g; *p; p += g.len)
+	{
+	  g = mcel_scanz (p);
+	  process (g);
+	}
+
+   mcel_scant (P, TERMINATOR) is like mcel_scanz (P) except the
+   string is terminated by TERMINATOR.  The C standard says that the
+   TERMINATORs '\0', '\r', '\n', '.', '/' are safe, as they cannot be
+   a part (even a trailing byte) of a multi-byte character.
+   In practice TERMINATOR is safe if 0 <= TERMINATOR <= 0x2f (ASCII '/').
+
+   mcel_ch (CH, LEN) and mcel_err (ERR) construct mcel_t values.
+
+   mcel_cmp (G1, G2) compares two mcel_t values lexicographically by
+   character or by encoding byte value, with encoding bytes sorting
+   after characters.
+
+   Calls like c32isalpha (G.ch) test G; they return false for encoding
+   errors since calls like c32isalpha (0) return false.  Calls like
+   mcel_tocmp (c32tolower, G1, G2) are like mcel_cmp (G1, G2),
+   but transliterate first.
+
+   Although ISO C and POSIX allow encodings that have shift states or
+   that can produce multiple characters from an indivisible byte sequence,
+   POSIX does not require support for these encodings,
+   they are not in practical use on GNUish platforms,
+   and omitting support for them simplifies the API.  */
+
+#ifndef _MCEL_H
+#define _MCEL_H 1
+
+#if !_GL_CONFIG_H_INCLUDED
+ #error "Please include config.h first."
+#endif
+
+#include <verify.h>
+
+#include <limits.h>
+#include <stddef.h>
+#include <uchar.h>
+
+/* Pacify GCC re type limits.  */
+#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__)
+# pragma GCC diagnostic ignored "-Wtype-limits"
+#endif
+
+/* Upper bound, across all platforms, on the byte length of a multi-byte
+   character that mcel handles.  Platforms often set MB_LEN_MAX high to
+   leave room for stateful encodings; since mcel does not support those,
+   MCEL_LEN_MAX may be smaller.  It still suffices for UTF-8, EUC,
+   Shift-JIS, GB18030, etc.  In all multi-byte encodings supported by glibc,
+   0 < MB_CUR_MAX <= MCEL_LEN_MAX <= MB_LEN_MAX.  */
+enum { MCEL_LEN_MAX = 4 < MB_LEN_MAX ? 4 : MB_LEN_MAX };
+
+/* Limits on mcel_t members: the largest CH and the smallest nonzero ERR.  */
+enum { MCEL_CHAR_MAX = 0x10FFFF,
+       MCEL_ERR_MIN = 0x80 };
+
+/* An mcel_t value describes either a character CH or a single
+   encoding-error byte ERR, together with LEN, the number of bytes
+   that represent it.
+   When ERR is zero, CH is a valid character and 0 < LEN <= MCEL_LEN_MAX.
+   When ERR is nonzero it is the offending byte, MCEL_ERR_MIN <= ERR,
+   CH == 0, and LEN == 1.  */
+typedef struct
+{
+  char32_t ch;
+  unsigned char err, len;
+} mcel_t;
+
+/* mcel_t's unsigned char LEN can hold every multi-byte character length.  */
+static_assert (MB_LEN_MAX <= UCHAR_MAX);
+
+/* Shifting an encoding error byte left by this many bits yields a value
+   above MCEL_CHAR_MAX, so encoding errors sort after characters.  */
+enum { MCEL_ERR_SHIFT = 14 };
+static_assert (MCEL_CHAR_MAX < MCEL_ERR_MIN << MCEL_ERR_SHIFT);
+
+/* Unsigned char promotes to int (not unsigned int) in arithmetic.  */
+static_assert (UCHAR_MAX <= INT_MAX);
+
+/* A byte is exactly 8 bits wide, as POSIX requires.  */
+static_assert (CHAR_BIT == 8);
+
+#ifndef _GL_LIKELY
+/* Branch hints; __builtin_expect comes from the module 'builtin-expect'.  */
+# define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
+# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
+#endif
+
+_GL_INLINE_HEADER_BEGIN
+#ifndef MCEL_INLINE
+# define MCEL_INLINE _GL_INLINE
+#endif
+
+/* mcel_t constructors: mcel_ch for a character, mcel_err for an error.  */
+MCEL_INLINE mcel_t
+mcel_ch (char32_t ch, size_t len)
+{
+  assume (0 < len && len <= MCEL_LEN_MAX);
+  assume (ch <= MCEL_CHAR_MAX);
+  mcel_t g = {.ch = ch, .len = len};
+  return g;
+}
+MCEL_INLINE mcel_t
+mcel_err (unsigned char err)
+{
+  assume (MCEL_ERR_MIN <= err);
+  return (mcel_t) {.len = 1, .err = err};
+}
+
+/* Order C1 relative to C2; an encoding error sorts after any character.
+   Return <0, 0, >0 for <, =, >.  */
+MCEL_INLINE int
+mcel_cmp (mcel_t c1, mcel_t c2)
+{
+  int err_diff = c1.err - c2.err, ch_diff = (int) c1.ch - (int) c2.ch;
+  return err_diff * (1 << MCEL_ERR_SHIFT) + ch_diff;
+}
+
+/* Like mcel_cmp (C1, C2), except first apply the uchar translator TO to
+   characters that differ; encoding errors still sort after characters.
+   Return <0, 0, >0 for <, =, >.  */
+MCEL_INLINE int
+mcel_tocmp (wint_t (*to) (wint_t), mcel_t c1, mcel_t c2)
+{
+  int order = mcel_cmp (c1, c2);
+  if (_GL_LIKELY (order == 0 || c1.err != c2.err))
+    return order;
+  int lhs = to (c1.ch), rhs = to (c2.ch);
+  return lhs - rhs;
+}
+
+/* Return true if C stands for itself as a Unicode character when it
+   is the first byte of a single- or multi-byte character, i.e., if C
+   is ASCII.  It is safe to assume ASCII these days, so obsolescent
+   encodings like CP864, EBCDIC, Johab, and Shift JIS are not supported.  */
+MCEL_INLINE bool
+mcel_isbasic (char c)
+{
+  return _GL_LIKELY (! (c < 0 || MCEL_ERR_MIN <= c));
+}
+
+/* With mcel there should be no need for the performance overhead of
+   replacing glibc mbrtoc32, as callers shouldn't care whether the
+   C locale treats a byte with the high bit set as an encoding error.  */
+#ifdef __GLIBC__
+# undef mbrtoc32
+#endif
+
+/* Scan bytes from P inclusive to LIM exclusive.  P must be less than LIM.
+   Return the character or encoding error starting at P.  */
+MCEL_INLINE mcel_t
+mcel_scan (char const *p, char const *lim)
+{
+  /* Handle ASCII quickly to avoid the overhead of calling mbrtoc32.
+     In supported encodings, the first byte of a multi-byte character
+     cannot be an ASCII byte.  */
+  if (mcel_isbasic (*p))
+    return mcel_ch (*p, 1);
+
+  /* An initial mbstate_t; initialization optimized for some platforms.
+     For details about these and other platforms, see wchar.in.h.  */
+#if defined __GLIBC__ && 2 < __GLIBC__ + (2 <= __GLIBC_MINOR__)
+  /* Although only a trivial optimization, it's worth it for GNU.  */
+  mbstate_t mbs; mbs.__count = 0;
+#elif (defined __FreeBSD__ || defined __DragonFly__ || defined __OpenBSD__ \
+       || (defined __APPLE__ && defined __MACH__))
+  /* These platforms have 128-byte mbstate_t.  What were they thinking?
+     Initialize just for supported encodings (UTF-8, EUC, etc.).
+     Avoid memset because some compilers generate function call code.  */
+  struct mbhidden { char32_t ch; int utf8_want, euc_want; }
+    _GL_ATTRIBUTE_MAY_ALIAS;
+  union { mbstate_t m; struct mbhidden s; } u;
+  u.s.ch = u.s.utf8_want = u.s.euc_want = 0;
+# define mbs u.m
+#elif defined __NetBSD__
+  /* Experiments on both 32- and 64-bit NetBSD platforms have
+     shown that it doesn't work to clear fewer than 24 bytes.  */
+  struct mbhidden { long long int a, b, c; } _GL_ATTRIBUTE_MAY_ALIAS;
+  union { mbstate_t m; struct mbhidden s; } u;
+  u.s.a = u.s.b = u.s.c = 0;
+# define mbs u.m
+#else
+  /* mbstate_t has unknown structure or is not worth optimizing.  */
+  mbstate_t mbs = {0};
+#endif
+
+  char32_t ch;
+  size_t len = mbrtoc32 (&ch, p, lim - p, &mbs);
+#undef mbs  /* The u.m alias above must not leak into including files.  */
+
+  /* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3
+     is not supported and MB_LEN_MAX is small.  */
+  if (_GL_UNLIKELY ((size_t) -1 / 2 < len))
+    return mcel_err (*p);
+
+  /* A multi-byte character.  LEN must be positive,
+     as *P != '\0' and shift sequences are not supported.  */
+  return mcel_ch (ch, len);
+}
+
+/* Return the character or encoding error that starts at P, where P
+   points into a byte sequence ended by TERMINATOR.
+   When *P == TERMINATOR only that byte is scanned; otherwise the scan
+   covers bytes up to but not including TERMINATOR.
+   TERMINATOR must be ASCII, and should be '\0', '\r', '\n', '.', or '/'.  */
+MCEL_INLINE mcel_t
+mcel_scant (char const *p, char terminator)
+{
+  /* ASCII fast path.  */
+  if (mcel_isbasic (*p))
+    return mcel_ch (*p, 1);
+
+  /* Non-ASCII: defer to mcel_scan.  Find the limit with code that is
+     typically faster than strnlen.  */
+  char const *end = p + 1;
+  for (int left = MCEL_LEN_MAX - 1; 0 < left; left--)
+    end += *end != terminator;
+  return mcel_scan (p, end);
+}
+
+/* Scan from P in a byte sequence that is terminated by '\0'.  If *P == '\0',
+   scan just that byte; else scan bytes up to but not including '\0'.
+   Return the character or encoding error starting at P.  */
+MCEL_INLINE mcel_t
+mcel_scanz (char const *p)
+{
+  char const nul = '\0';
+  return mcel_scant (p, nul);
+}
+
+_GL_INLINE_HEADER_END
+
+#endif /* _MCEL_H */
diff --git a/modules/mcel b/modules/mcel
new file mode 100644
index 0000000000..59ca633641
--- /dev/null
+++ b/modules/mcel
@@ -0,0 +1,34 @@
+Description:
+Multi-byte Characters, Encoding errors, and Lengths
+
+Files:
+lib/mcel.c
+lib/mcel.h
+
+Depends-on:
+assert-h
+extern-inline
+limits-h
+mbrtoc32
+stdbool
+uchar
+verify
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += mcel.c mcel.h
+
+Include:
+"mcel.h"
+
+Link:
+$(LTLIBUNISTRING) when linking with libtool, $(LIBUNISTRING) otherwise
+$(MBRTOWC_LIB)
+$(LTLIBC32CONV) when linking with libtool, $(LIBC32CONV) otherwise
+
+License:
+LGPLv2+
+
+Maintainer:
+all
-- 
2.39.2