bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

new modules for Unicode string case mappings


From: Bruno Haible
Subject: new modules for Unicode string case mappings
Date: Sun, 8 Mar 2009 17:33:10 +0100
User-agent: KMail/1.9.9

These commits add modules for case conversion (upper/lower/title case),
string matching that ignores case, and detection of case of a given string.

The code handles intricacies like
  - Turkish upper/lowercase i İ ı I (locale dependent),
  - German sharp s (ß -> ss),
  - Greek final sigma,
  - Lithuanian and soft-dot.

Here's the newly implemented API, and the ChangeLog entry (without mentioning
the tests).

========================== part of lib/unicase.h ==========================

/* String case mappings.  */

/* These functions are locale dependent.  The iso639_language argument
   identifies the language (e.g. "tr" for Turkish).  NULL means to use
   locale independent case mappings.  */

/* Return the ISO 639 language code of the current locale.
   Return "" if it is unknown, or in the "C" locale.  */
extern const char *
       uc_locale_language (void);

/* Conventions:

   All functions prefixed with u8_ operate on UTF-8 encoded strings.
   Their unit is an uint8_t (1 byte).

   All functions prefixed with u16_ operate on UTF-16 encoded strings.
   Their unit is an uint16_t (a 2-byte word).

   All functions prefixed with u32_ operate on UCS-4 encoded strings.
   Their unit is an uint32_t (a 4-byte word).

   All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
   n units.

   Functions returning a string result take a (resultbuf, lengthp) argument
   pair.  If resultbuf is not NULL and the result fits into *lengthp units,
   it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
   allocated string is returned.  In both cases, *lengthp is set to the
   length (number of units) of the returned string.  In case of error,
   NULL is returned and errno is set.  */

/* Return the uppercase mapping of a string.
   The nf argument identifies the normalization form to apply after the
   case-mapping.  It can also be NULL, for no normalization.  */
extern uint8_t *
       u8_toupper (const uint8_t *s, size_t n, const char *iso639_language,
                   uninorm_t nf,
                   uint8_t *resultbuf, size_t *lengthp);
extern uint16_t *
       u16_toupper (const uint16_t *s, size_t n, const char *iso639_language,
                    uninorm_t nf,
                    uint16_t *resultbuf, size_t *lengthp);
extern uint32_t *
       u32_toupper (const uint32_t *s, size_t n, const char *iso639_language,
                    uninorm_t nf,
                    uint32_t *resultbuf, size_t *lengthp);

/* Return the lowercase mapping of a string.
   The nf argument identifies the normalization form to apply after the
   case-mapping.  It can also be NULL, for no normalization.  */
extern uint8_t *
       u8_tolower (const uint8_t *s, size_t n, const char *iso639_language,
                   uninorm_t nf,
                   uint8_t *resultbuf, size_t *lengthp);
extern uint16_t *
       u16_tolower (const uint16_t *s, size_t n, const char *iso639_language,
                    uninorm_t nf,
                    uint16_t *resultbuf, size_t *lengthp);
extern uint32_t *
       u32_tolower (const uint32_t *s, size_t n, const char *iso639_language,
                    uninorm_t nf,
                    uint32_t *resultbuf, size_t *lengthp);

/* Return the titlecase mapping of a string.
   The nf argument identifies the normalization form to apply after the
   case-mapping.  It can also be NULL, for no normalization.  */
extern uint8_t *
       u8_totitle (const uint8_t *s, size_t n, const char *iso639_language,
                   uninorm_t nf,
                   uint8_t *resultbuf, size_t *lengthp);
extern uint16_t *
       u16_totitle (const uint16_t *s, size_t n, const char *iso639_language,
                    uninorm_t nf,
                    uint16_t *resultbuf, size_t *lengthp);
extern uint32_t *
       u32_totitle (const uint32_t *s, size_t n, const char *iso639_language,
                    uninorm_t nf,
                    uint32_t *resultbuf, size_t *lengthp);

/* Return the case folded string.
   The nf argument identifies the normalization form to apply after the
   case-mapping.  It can also be NULL, for no normalization.  */
extern uint8_t *
       u8_casefold (const uint8_t *s, size_t n, const char *iso639_language,
                    uninorm_t nf,
                    uint8_t *resultbuf, size_t *lengthp);
extern uint16_t *
       u16_casefold (const uint16_t *s, size_t n, const char *iso639_language,
                     uninorm_t nf,
                     uint16_t *resultbuf, size_t *lengthp);
extern uint32_t *
       u32_casefold (const uint32_t *s, size_t n, const char *iso639_language,
                     uninorm_t nf,
                     uint32_t *resultbuf, size_t *lengthp);

/* Compare S1 and S2, ignoring differences in case and normalization.
   The nf argument identifies the normalization form to apply after the
   case-mapping.  It can also be NULL, for no normalization.
   If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
   return 0.  Upon failure, return -1 with errno set.  */
extern int
       u8_casecmp (const uint8_t *s1, size_t n1,
                   const uint8_t *s2, size_t n2,
                   const char *iso639_language, uninorm_t nf, int *resultp);
extern int
       u16_casecmp (const uint16_t *s1, size_t n1,
                    const uint16_t *s2, size_t n2,
                    const char *iso639_language, uninorm_t nf, int *resultp);
extern int
       u32_casecmp (const uint32_t *s1, size_t n1,
                    const uint32_t *s2, size_t n2,
                    const char *iso639_language, uninorm_t nf, int *resultp);

/* Convert the string S of length N to a string in locale encoding, in such a
   way that comparing uN_casexfrm (S1) and uN_casexfrm (S2) with memcmp2() is
   equivalent to comparing S1 and S2 with uN_casecoll().
   NF must be either UNINORM_NFC, UNINORM_NFKC, or NULL for no
   normalization.  */
extern char *
       u8_casexfrm (const uint8_t *s, size_t n, const char *iso639_language,
                    uninorm_t nf, char *resultbuf, size_t *lengthp);
extern char *
       u16_casexfrm (const uint16_t *s, size_t n, const char *iso639_language,
                     uninorm_t nf, char *resultbuf, size_t *lengthp);
extern char *
       u32_casexfrm (const uint32_t *s, size_t n, const char *iso639_language,
                     uninorm_t nf, char *resultbuf, size_t *lengthp);

/* Compare S1 and S2, ignoring differences in case and normalization, using the
   collation rules of the current locale.
   The nf argument identifies the normalization form to apply after the
   case-mapping.  It must be either UNINORM_NFC or UNINORM_NFKC.  It can also
   be NULL, for no normalization.
   If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
   return 0.  Upon failure, return -1 with errno set.  */
extern int
       u8_casecoll (const uint8_t *s1, size_t n1,
                    const uint8_t *s2, size_t n2,
                    const char *iso639_language, uninorm_t nf, int *resultp);
extern int
       u16_casecoll (const uint16_t *s1, size_t n1,
                     const uint16_t *s2, size_t n2,
                     const char *iso639_language, uninorm_t nf, int *resultp);
extern int
       u32_casecoll (const uint32_t *s1, size_t n1,
                     const uint32_t *s2, size_t n2,
                     const char *iso639_language, uninorm_t nf, int *resultp);


/* Set *RESULTP to true if mapping NFD(S) to upper case is a no-op, or to false
   otherwise, and return 0.  Upon failure, return -1 with errno set.  */
extern int
       u8_is_uppercase (const uint8_t *s, size_t n,
                        const char *iso639_language,
                        bool *resultp);
extern int
       u16_is_uppercase (const uint16_t *s, size_t n,
                         const char *iso639_language,
                         bool *resultp);
extern int
       u32_is_uppercase (const uint32_t *s, size_t n,
                         const char *iso639_language,
                         bool *resultp);

/* Set *RESULTP to true if mapping NFD(S) to lower case is a no-op, or to false
   otherwise, and return 0.  Upon failure, return -1 with errno set.  */
extern int
       u8_is_lowercase (const uint8_t *s, size_t n,
                        const char *iso639_language,
                        bool *resultp);
extern int
       u16_is_lowercase (const uint16_t *s, size_t n,
                         const char *iso639_language,
                         bool *resultp);
extern int
       u32_is_lowercase (const uint32_t *s, size_t n,
                         const char *iso639_language,
                         bool *resultp);

/* Set *RESULTP to true if mapping NFD(S) to title case is a no-op, or to false
   otherwise, and return 0.  Upon failure, return -1 with errno set.  */
extern int
       u8_is_titlecase (const uint8_t *s, size_t n,
                        const char *iso639_language,
                        bool *resultp);
extern int
       u16_is_titlecase (const uint16_t *s, size_t n,
                         const char *iso639_language,
                         bool *resultp);
extern int
       u32_is_titlecase (const uint32_t *s, size_t n,
                         const char *iso639_language,
                         bool *resultp);

/* Set *RESULTP to true if applying case folding to NFD(S) is a no-op, or to
   false otherwise, and return 0.  Upon failure, return -1 with errno set.  */
extern int
       u8_is_casefolded (const uint8_t *s, size_t n,
                         const char *iso639_language,
                         bool *resultp);
extern int
       u16_is_casefolded (const uint16_t *s, size_t n,
                          const char *iso639_language,
                          bool *resultp);
extern int
       u32_is_casefolded (const uint32_t *s, size_t n,
                          const char *iso639_language,
                          bool *resultp);

/* Set *RESULTP to true if case matters for S, that is, if mapping NFD(S) to
   either upper case or lower case or title case is not a no-op.
   Set *RESULTP to false if NFD(S) maps to itself under the upper case mapping,
   under the lower case mapping, and under the title case mapping; in other
   words, when NFD(S) consists entirely of caseless characters.
   Return 0 if successful.  Upon failure, return -1 with errno set.  */
extern int
       u8_is_cased (const uint8_t *s, size_t n,
                    const char *iso639_language,
                    bool *resultp);
extern int
       u16_is_cased (const uint16_t *s, size_t n,
                     const char *iso639_language,
                     bool *resultp);
extern int
       u32_is_cased (const uint32_t *s, size_t n,
                     const char *iso639_language,
                     bool *resultp);
===========================================================================
2009-03-08  Bruno Haible  <address@hidden>

        New module 'unicase/u32-is-cased'.
        * lib/unicase/u32-is-cased.c: New file.
        * modules/unicase/u32-is-cased: New file.

        New module 'unicase/u16-is-cased'.
        * lib/unicase/u16-is-cased.c: New file.
        * modules/unicase/u16-is-cased: New file.

        New module 'unicase/u8-is-cased'.
        * lib/unicase/u8-is-cased.c: New file.
        * lib/unicase/u-is-cased.h: New file.
        * modules/unicase/u8-is-cased: New file.

        New module 'unicase/u32-is-casefolded'.
        * lib/unicase/u32-is-casefolded.c: New file.
        * modules/unicase/u32-is-casefolded: New file.

        New module 'unicase/u16-is-casefolded'.
        * lib/unicase/u16-is-casefolded.c: New file.
        * modules/unicase/u16-is-casefolded: New file.

        New module 'unicase/u8-is-casefolded'.
        * lib/unicase/u8-is-casefolded.c: New file.
        * modules/unicase/u8-is-casefolded: New file.

        New module 'unicase/u32-is-titlecase'.
        * lib/unicase/u32-is-titlecase.c: New file.
        * modules/unicase/u32-is-titlecase: New file.

        New module 'unicase/u16-is-titlecase'.
        * lib/unicase/u16-is-titlecase.c: New file.
        * modules/unicase/u16-is-titlecase: New file.

        New module 'unicase/u8-is-titlecase'.
        * lib/unicase/u8-is-titlecase.c: New file.
        * modules/unicase/u8-is-titlecase: New file.

        New module 'unicase/u32-is-lowercase'.
        * lib/unicase/u32-is-lowercase.c: New file.
        * modules/unicase/u32-is-lowercase: New file.

        New module 'unicase/u16-is-lowercase'.
        * lib/unicase/u16-is-lowercase.c: New file.
        * modules/unicase/u16-is-lowercase: New file.

        New module 'unicase/u8-is-lowercase'.
        * lib/unicase/u8-is-lowercase.c: New file.
        * modules/unicase/u8-is-lowercase: New file.

        New module 'unicase/u32-is-uppercase'.
        * lib/unicase/u32-is-uppercase.c: New file.
        * modules/unicase/u32-is-uppercase: New file.

        New module 'unicase/u16-is-uppercase'.
        * lib/unicase/u16-is-uppercase.c: New file.
        * modules/unicase/u16-is-uppercase: New file.

        New module 'unicase/u8-is-uppercase'.
        * lib/unicase/u8-is-uppercase.c: New file.
        * modules/unicase/u8-is-uppercase: New file.

        New module 'unicase/u32-is-invariant'.
        * lib/unicase/u32-is-invariant.c: New file.
        * modules/unicase/u32-is-invariant: New file.

        New module 'unicase/u16-is-invariant'.
        * lib/unicase/u16-is-invariant.c: New file.
        * modules/unicase/u16-is-invariant: New file.

        New module 'unicase/u8-is-invariant'.
        * lib/unicase/u8-is-invariant.c: New file.
        * lib/unicase/invariant.h: New file.
        * lib/unicase/u-is-invariant.h: New file.
        * modules/unicase/u8-is-invariant: New file.

        New module 'unicase/u32-casecoll'.
        * lib/unicase/u32-casecoll.c: New file.
        * modules/unicase/u32-casecoll: New file.

        New module 'unicase/u16-casecoll'.
        * lib/unicase/u16-casecoll.c: New file.
        * modules/unicase/u16-casecoll: New file.

        New module 'unicase/u8-casecoll'.
        * lib/unicase/u8-casecoll.c: New file.
        * lib/unicase/u-casecoll.h: New file.
        * modules/unicase/u8-casecoll: New file.

        New module 'unicase/u32-casexfrm'.
        * lib/unicase/u32-casexfrm.c: New file.
        * modules/unicase/u32-casexfrm: New file.

        New module 'unicase/u16-casexfrm'.
        * lib/unicase/u16-casexfrm.c: New file.
        * modules/unicase/u16-casexfrm: New file.

        New module 'unicase/u8-casexfrm'.
        * lib/unicase/u8-casexfrm.c: New file.
        * lib/unicase/u-casexfrm.h: New file.
        * modules/unicase/u8-casexfrm: New file.

        New module 'unicase/u32-casecmp'.
        * lib/unicase/u32-casecmp.c: New file.
        * modules/unicase/u32-casecmp: New file.

        New module 'unicase/u16-casecmp'.
        * lib/unicase/u16-casecmp.c: New file.
        * modules/unicase/u16-casecmp: New file.

        New module 'unicase/u8-casecmp'.
        * lib/unicase/u8-casecmp.c: New file.
        * lib/unicase/u-casecmp.h: New file.
        * modules/unicase/u8-casecmp: New file.

        New module 'unicase/u32-casefold'.
        * lib/unicase/u32-casefold.c: New file.
        * modules/unicase/u32-casefold: New file.

        New module 'unicase/u16-casefold'.
        * lib/unicase/u16-casefold.c: New file.
        * modules/unicase/u16-casefold: New file.

        New module 'unicase/u8-casefold'.
        * lib/unicase/u8-casefold.c: New file.
        * lib/unicase/u-casefold.h: New file.
        * modules/unicase/u8-casefold: New file.

        New module 'unicase/tocasefold'.
        * lib/unicase/casefold.h: New file.
        * lib/unicase/tocasefold.c: New file.
        * lib/unicase/tocasefold.h: New file, generated by gen-uni-tables.c.
        * modules/unicase/tocasefold: New file.

        New module 'unicase/u32-totitle'.
        * lib/unicase/u32-totitle.c: New file.
        * modules/unicase/u32-totitle: New file.

        New module 'unicase/u16-totitle'.
        * lib/unicase/u16-totitle.c: New file.
        * modules/unicase/u16-totitle: New file.

        New module 'unicase/u8-totitle'.
        * lib/unicase/u8-totitle.c: New file.
        * lib/unicase/u-totitle.h: New file.
        * modules/unicase/u8-totitle: New file.

        New module 'unicase/u32-tolower'.
        * lib/unicase/u32-tolower.c: New file.
        * modules/unicase/u32-tolower: New file.

        New module 'unicase/u16-tolower'.
        * lib/unicase/u16-tolower.c: New file.
        * modules/unicase/u16-tolower: New file.

        New module 'unicase/u8-tolower'.
        * lib/unicase/u8-tolower.c: New file.
        * modules/unicase/u8-tolower: New file.

        New module 'unicase/u32-toupper'.
        * lib/unicase/u32-toupper.c: New file.
        * modules/unicase/u32-toupper: New file.

        New module 'unicase/u16-toupper'.
        * lib/unicase/u16-toupper.c: New file.
        * modules/unicase/u16-toupper: New file.

        New module 'unicase/u8-toupper'.
        * lib/unicase/u8-toupper.c: New file.
        * modules/unicase/u8-toupper: New file.

        New module 'unicase/u32-casemap'.
        * lib/unicase/u32-casemap.c: New file.
        * modules/unicase/u32-casemap: New file.

        New module 'unicase/u16-casemap'.
        * lib/unicase/u16-casemap.c: New file.
        * modules/unicase/u16-casemap: New file.

        New module 'unicase/u8-casemap'.
        * lib/unicase/unicasemap.h: New file.
        * lib/unicase/u8-casemap.c: New file.
        * lib/unicase/u-casemap.h: New file.
        * modules/unicase/u8-casemap: New file.

        New module 'unicase/special-casing'.
        * lib/unicase/special-casing.h: New file.
        * lib/unicase/special-casing.c: New file.
        * lib/unicase/special-casing-table.gperf: New file, generated by
        gen-uni-tables.c.
        * modules/unicase/special-casing: New file.

        New module 'unicase/locale-language'.
        * lib/unicase/locale-language.c: New file.
        * lib/unicase/locale-languages.gperf: New file.
        * modules/unicase/locale-language: New file.

        Generate more tables for case conversion and case folding.
        * lib/gen-uni-tables.c (SCC_*): New enum items.
        (struct special_casing_rule): New type.
        (casing_rules, num_casing_rules, allocated_casing_rules): New
        variables.
        (add_casing_rule, fill_casing_rules): New functions.
        (struct casefold_rule): New type.
        (casefolding_rules, num_casefolding_rules,
        allocated_casefolding_rules): New variables.
        (fill_casefolding_rules): New function.
        (unicode_casefold): New variable.
        (to_casefold, redistribute_casefolding_rules, compare_casing_rules,
        sort_casing_rules, output_casing_rules): New functions.
        (main): Accept two more arguments: SpecialCasing.txt and
        CaseFolding.txt. Invoke fill_casing_rules, fill_casefolding_rules,
        redistribute_casefolding_rules, sort_casing_rules, output_casing_rules.
        Output mapping for casefolding.

        * lib/unicase.h: Include stdbool.h, uninorm.h.
        (u8_toupper, u16_toupper, u32_toupper, u8_tolower, u16_tolower,
        u32_tolower, u8_totitle, u16_totitle, u32_totitle): Add nf argument.
        (u8_casefold, u16_casefold, u32_casefold): Add iso639_language and nf
        arguments.
        (u8_casecmp, u16_casecmp, u32_casecmp): Add iso639_language, nf,
        resultp arguments.
        (u8_casexfrm, u16_casexfrm, u32_casexfrm): New declarations.
        (u8_casecoll, u16_casecoll, u32_casecoll): Add iso639_language, nf,
        resultp arguments.
        (u8_is_uppercase, u16_is_uppercase, u32_is_uppercase, u8_is_lowercase,
        u16_is_lowercase, u32_is_lowercase, u8_is_titlecase, u16_is_titlecase,
        u32_is_titlecase, u8_is_casefolded, u16_is_casefolded,
        u32_is_casefolded, u8_is_cased, u16_is_cased, u32_is_cased): New
        declarations.
        * modules/unicase/base (Depends-on): Add uninorm/base, stdbool.





reply via email to

[Prev in Thread] Current Thread [Next in Thread]