diff --git a/doc/strings.texi b/doc/strings.texi index aa0830f1a5..73cb56120f 100644 --- a/doc/strings.texi +++ b/doc/strings.texi @@ -44,7 +44,7 @@ functions, standardized by ISO C and POSIX, that assume this representation of strings. -An @emph{character encoding}, or @emph{encoding} for short, describes +A @emph{character encoding}, or @emph{encoding} for short, describes how the elements of a character set are represented as a sequence of bytes. For example, in the @code{ASCII} encoding, the UNDERSCORE character is represented by a single byte, with value 0x5F. As another diff --git a/lib/exclude.c b/lib/exclude.c index 7bd0ec8c71..af204cd300 100644 --- a/lib/exclude.c +++ b/lib/exclude.c @@ -209,10 +209,10 @@ string_hasher_ci (void const *data, size_t n_buckets) for (mbui_init (iter, p); mbui_avail (iter); mbui_advance (iter)) { mbchar_t m = mbui_cur (iter); - wchar_t wc; + char32_t wc; if (m.wc_valid) - wc = towlower (m.wc); + wc = c32tolower (m.wc); else wc = *m.ptr; diff --git a/lib/mbchar.h b/lib/mbchar.h index a2ff1d8b21..c183772cc6 100644 --- a/lib/mbchar.h +++ b/lib/mbchar.h @@ -17,10 +17,10 @@ /* Written by Bruno Haible . */ /* A multibyte character is a short subsequence of a char* string, - representing a single wide character. + representing a single 32-bit wide character. - We use multibyte characters instead of wide characters because of - the following goals: + We use multibyte characters instead of 32-bit wide characters because + of the following goals: 1) correct multibyte handling, i.e. operate according to the LC_CTYPE locale, 2) ease of maintenance, i.e. the maintainer needs not know all details @@ -28,8 +28,7 @@ 3) don't fail grossly if the input is not in the encoding set by the locale, because often different encodings are in use in the same countries (ISO-8859-1/UTF-8, EUC-JP/Shift_JIS, ...), - 4) fast in the case of ASCII characters, - 5) portability, i.e. don't make unportable assumptions about wchar_t. 
+ 4) fast in the case of ASCII characters. Multibyte characters are only accessed through the mb* macros. @@ -150,8 +149,7 @@ #endif #include <string.h> -#include <wchar.h> -#include <wctype.h> +#include <uchar.h> _GL_INLINE_HEADER_BEGIN #ifndef MBCHAR_INLINE @@ -164,8 +162,8 @@ struct mbchar { const char *ptr; /* pointer to current character */ size_t bytes; /* number of bytes of current character, > 0 */ - bool wc_valid; /* true if wc is a valid wide character */ - wchar_t wc; /* if wc_valid: the current character */ + bool wc_valid; /* true if wc is a valid 32-bit wide character */ + char32_t wc; /* if wc_valid: the current character */ char buf[MBCHAR_BUF_SIZE]; /* room for the bytes, used for file input only */ }; @@ -184,7 +182,7 @@ typedef struct mbchar mbchar_t; #define mb_cmp(mbc1, mbc2) \ ((mbc1).wc_valid \ ? ((mbc2).wc_valid \ - ? (int) (mbc1).wc - (int) (mbc2).wc \ + ? _GL_CMP ((mbc1).wc, (mbc2).wc) \ : -1) \ : ((mbc2).wc_valid \ ? 1 \ @@ -196,7 +194,7 @@ typedef struct mbchar mbchar_t; #define mb_casecmp(mbc1, mbc2) \ ((mbc1).wc_valid \ ? ((mbc2).wc_valid \ - ? (int) towlower ((mbc1).wc) - (int) towlower ((mbc2).wc) \ + ? _GL_CMP (c32tolower ((mbc1).wc), c32tolower ((mbc2).wc)) \ : -1) \ : ((mbc2).wc_valid \ ? 1 \ @@ -212,25 +210,25 @@ typedef struct mbchar mbchar_t; && memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0) #define mb_caseequal(mbc1, mbc2) \ ((mbc1).wc_valid && (mbc2).wc_valid \ - ? towlower ((mbc1).wc) == towlower ((mbc2).wc) \ + ? c32tolower ((mbc1).wc) == c32tolower ((mbc2).wc) \ : (mbc1).bytes == (mbc2).bytes \ && memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0) /* , classification. 
*/ #define mb_isascii(mbc) \ ((mbc).wc_valid && (mbc).wc >= 0 && (mbc).wc <= 127) -#define mb_isalnum(mbc) ((mbc).wc_valid && iswalnum ((mbc).wc)) -#define mb_isalpha(mbc) ((mbc).wc_valid && iswalpha ((mbc).wc)) -#define mb_isblank(mbc) ((mbc).wc_valid && iswblank ((mbc).wc)) -#define mb_iscntrl(mbc) ((mbc).wc_valid && iswcntrl ((mbc).wc)) -#define mb_isdigit(mbc) ((mbc).wc_valid && iswdigit ((mbc).wc)) -#define mb_isgraph(mbc) ((mbc).wc_valid && iswgraph ((mbc).wc)) -#define mb_islower(mbc) ((mbc).wc_valid && iswlower ((mbc).wc)) -#define mb_isprint(mbc) ((mbc).wc_valid && iswprint ((mbc).wc)) -#define mb_ispunct(mbc) ((mbc).wc_valid && iswpunct ((mbc).wc)) -#define mb_isspace(mbc) ((mbc).wc_valid && iswspace ((mbc).wc)) -#define mb_isupper(mbc) ((mbc).wc_valid && iswupper ((mbc).wc)) -#define mb_isxdigit(mbc) ((mbc).wc_valid && iswxdigit ((mbc).wc)) +#define mb_isalnum(mbc) ((mbc).wc_valid && c32isalnum ((mbc).wc)) +#define mb_isalpha(mbc) ((mbc).wc_valid && c32isalpha ((mbc).wc)) +#define mb_isblank(mbc) ((mbc).wc_valid && c32isblank ((mbc).wc)) +#define mb_iscntrl(mbc) ((mbc).wc_valid && c32iscntrl ((mbc).wc)) +#define mb_isdigit(mbc) ((mbc).wc_valid && c32isdigit ((mbc).wc)) +#define mb_isgraph(mbc) ((mbc).wc_valid && c32isgraph ((mbc).wc)) +#define mb_islower(mbc) ((mbc).wc_valid && c32islower ((mbc).wc)) +#define mb_isprint(mbc) ((mbc).wc_valid && c32isprint ((mbc).wc)) +#define mb_ispunct(mbc) ((mbc).wc_valid && c32ispunct ((mbc).wc)) +#define mb_isspace(mbc) ((mbc).wc_valid && c32isspace ((mbc).wc)) +#define mb_isupper(mbc) ((mbc).wc_valid && c32isupper ((mbc).wc)) +#define mb_isxdigit(mbc) ((mbc).wc_valid && c32isxdigit ((mbc).wc)) /* Extra function. 
*/ @@ -238,12 +236,12 @@ typedef struct mbchar mbchar_t; #define MB_UNPRINTABLE_WIDTH 1 MBCHAR_INLINE int -mb_width_aux (wint_t wc) +mb_width_aux (char32_t wc) { - int w = wcwidth (wc); + int w = c32width (wc); /* For unprintable characters, arbitrarily return 0 for control characters and MB_UNPRINTABLE_WIDTH otherwise. */ - return (w >= 0 ? w : iswcntrl (wc) ? 0 : MB_UNPRINTABLE_WIDTH); + return (w >= 0 ? w : c32iscntrl (wc) ? 0 : MB_UNPRINTABLE_WIDTH); } #define mb_width(mbc) \ diff --git a/lib/mbfile.h b/lib/mbfile.h index 3482f394b9..7c6d70fcae 100644 --- a/lib/mbfile.h +++ b/lib/mbfile.h @@ -110,7 +110,7 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) { /* These characters are part of the basic character set. ISO C 99 guarantees that their wide character code is identical to their - char code. */ + char code. The 32-bit wide character code is the same as well. */ mbc->wc = mbc->buf[0] = mbf->buf[0]; mbc->wc_valid = true; mbc->ptr = &mbc->buf[0]; @@ -136,7 +136,7 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) behaviour will clobber it. */ mbstate_t backup_state = mbf->state; - bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state); + bytes = mbrtoc32 (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state); if (bytes == (size_t) -1) { @@ -178,7 +178,7 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) { if (bytes == 0) { - /* A null wide character was encountered. */ + /* A null 32-bit wide character was encountered. 
*/ bytes = 1; assert (mbf->buf[0] == '\0'); assert (mbc->wc == 0); diff --git a/lib/mbiter.h b/lib/mbiter.h index 7b41870b55..93bad990a1 100644 --- a/lib/mbiter.h +++ b/lib/mbiter.h @@ -90,7 +90,7 @@ #include #include #include -#include +#include #include "mbchar.h" @@ -106,11 +106,11 @@ struct mbiter_multi mbstate_t state; /* if in_shift: current shift state */ bool next_done; /* true if mbi_avail has already filled the following */ struct mbchar cur; /* the current character: - const char *cur.ptr pointer to current character + const char *cur.ptr pointer to current character The following are only valid after mbi_avail. - size_t cur.bytes number of bytes of current character - bool cur.wc_valid true if wc is a valid wide character - wchar_t cur.wc if wc_valid: the current character + size_t cur.bytes number of bytes of current character + bool cur.wc_valid true if wc is a valid 32-bit wide character + char32_t cur.wc if wc_valid: the current character */ }; @@ -136,8 +136,8 @@ mbiter_multi_next (struct mbiter_multi *iter) assert (mbsinit (&iter->state)); iter->in_shift = true; with_shift: - iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr, - iter->limit - iter->cur.ptr, &iter->state); + iter->cur.bytes = mbrtoc32 (&iter->cur.wc, iter->cur.ptr, + iter->limit - iter->cur.ptr, &iter->state); if (iter->cur.bytes == (size_t) -1) { /* An invalid multibyte sequence was encountered. 
*/ diff --git a/lib/mbscasestr.c b/lib/mbscasestr.c index d92b847ba7..0753aeb864 100644 --- a/lib/mbscasestr.c +++ b/lib/mbscasestr.c @@ -64,7 +64,7 @@ knuth_morris_pratt_multibyte (const char *haystack, const char *needle, { mb_copy (&needle_mbchars[j], &mbui_cur (iter)); if (needle_mbchars[j].wc_valid) - needle_mbchars[j].wc = towlower (needle_mbchars[j].wc); + needle_mbchars[j].wc = c32tolower (needle_mbchars[j].wc); } } @@ -152,7 +152,7 @@ knuth_morris_pratt_multibyte (const char *haystack, const char *needle, mb_copy (&c, &mbui_cur (phaystack)); if (c.wc_valid) - c.wc = towlower (c.wc); + c.wc = c32tolower (c.wc); if (mb_equal (needle_mbchars[j], c)) { j++; @@ -237,7 +237,7 @@ mbscasestr (const char *haystack, const char *needle) mb_copy (&b, &mbui_cur (iter_needle)); if (b.wc_valid) - b.wc = towlower (b.wc); + b.wc = c32tolower (b.wc); mbui_init (iter_haystack, haystack); for (;; mbui_advance (iter_haystack)) @@ -279,7 +279,7 @@ mbscasestr (const char *haystack, const char *needle) comparison_count++; mb_copy (&c, &mbui_cur (iter_haystack)); if (c.wc_valid) - c.wc = towlower (c.wc); + c.wc = c32tolower (c.wc); if (mb_equal (c, b)) /* The first character matches. */ { diff --git a/lib/mbuiter.h b/lib/mbuiter.h index 7a619f19e1..632def10c5 100644 --- a/lib/mbuiter.h +++ b/lib/mbuiter.h @@ -114,11 +114,11 @@ struct mbuiter_multi mbstate_t state; /* if in_shift: current shift state */ bool next_done; /* true if mbui_avail has already filled the following */ struct mbchar cur; /* the current character: - const char *cur.ptr pointer to current character + const char *cur.ptr pointer to current character The following are only valid after mbui_avail. 
- size_t cur.bytes number of bytes of current character - bool cur.wc_valid true if wc is a valid wide character - wchar_t cur.wc if wc_valid: the current character + size_t cur.bytes number of bytes of current character + bool cur.wc_valid true if wc is a valid 32-bit wide character + char32_t cur.wc if wc_valid: the current character */ }; @@ -144,9 +144,9 @@ mbuiter_multi_next (struct mbuiter_multi *iter) assert (mbsinit (&iter->state)); iter->in_shift = true; with_shift: - iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr, - strnlen1 (iter->cur.ptr, MB_CUR_MAX), - &iter->state); + iter->cur.bytes = mbrtoc32 (&iter->cur.wc, iter->cur.ptr, + strnlen1 (iter->cur.ptr, MB_CUR_MAX), + &iter->state); if (iter->cur.bytes == (size_t) -1) { /* An invalid multibyte sequence was encountered. */ diff --git a/modules/exclude b/modules/exclude index 841dd826cd..93bfdaf4cf 100644 --- a/modules/exclude +++ b/modules/exclude @@ -7,6 +7,7 @@ lib/exclude.c Depends-on: assert-h +c32tolower filename fnmatch fopen-gnu diff --git a/modules/mbchar b/modules/mbchar index b1fa0fa4ac..51a1c8e1b9 100644 --- a/modules/mbchar +++ b/modules/mbchar @@ -10,12 +10,21 @@ Depends-on: extensions extern-inline stdbool -wchar -wctype-h -iswblank -iswdigit -iswxdigit -wcwidth +uchar +c32isalnum +c32isalpha +c32isblank +c32iscntrl +c32isdigit +c32isgraph +c32islower +c32isprint +c32ispunct +c32isspace +c32isupper +c32isxdigit +c32tolower +c32width memcmp configure.ac: diff --git a/modules/mbiter b/modules/mbiter index 42305d62cd..082afd42f2 100644 --- a/modules/mbiter +++ b/modules/mbiter @@ -10,9 +10,9 @@ m4/mbrtowc.m4 Depends-on: extern-inline mbchar -mbrtowc +mbrtoc32 mbsinit -wchar +uchar stdbool configure.ac: diff --git a/modules/mbscasestr b/modules/mbscasestr index 2892c2fc2b..672cac8960 100644 --- a/modules/mbscasestr +++ b/modules/mbscasestr @@ -11,6 +11,7 @@ stdbool string mbslen malloca +c32tolower strnlen configure.ac: diff --git a/modules/mbuiter b/modules/mbuiter index 
b9e41031d5..63a11ff2f5 100644 --- a/modules/mbuiter +++ b/modules/mbuiter @@ -10,9 +10,9 @@ m4/mbrtowc.m4 Depends-on: extern-inline mbchar -mbrtowc +mbrtoc32 mbsinit -wchar +uchar stdbool strnlen1