bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: new module 'bcp47'


From: Bruno Haible
Subject: Re: new module 'bcp47'
Date: Fri, 04 Oct 2024 23:54:02 +0200

Here's a refactoring of that code, reducing implicit code duplication.


2024-10-04  Bruno Haible  <bruno@clisp.org>

        bcp47: Refactor.
        * lib/bcp47.c (SCRIPT_*): New macros.
        (default_script_in_territory, default_script_for_language2,
        default_script_for_language3): New functions.
        (xpg_to_bcp47, bcp47_to_xpg): Use them.

diff --git a/lib/bcp47.c b/lib/bcp47.c
index 8008ac030d..96f7aad8ce 100644
--- a/lib/bcp47.c
+++ b/lib/bcp47.c
@@ -99,17 +99,94 @@ struct script
    <https://unicode.org/iso15924/iso15924-codes.html>.  */
 static const struct script scripts[] =
 {
+#define SCRIPT_LATIN      0
   { "latin",      "Latn" },
+#define SCRIPT_CYRILLIC   1
   { "cyrillic",   "Cyrl" },
+#define SCRIPT_HEBREW     2
   { "hebrew",     "Hebr" },
+#define SCRIPT_ARABIC     3
   { "arabic",     "Arab" },
+#define SCRIPT_DEVANAGARI 4
   { "devanagari", "Deva" },
+#define SCRIPT_GURMUKHI   5
   { "gurmukhi",   "Guru" },
+#define SCRIPT_MONGOLIAN  6
   { "mongolian",  "Mong" }
 };
 #define NUM_SCRIPTS (sizeof (scripts) / sizeof (scripts[0]))
 
 
+/* For a language (other than Chinese) whose script depends on the territory,
+   return the scripts[] entry of the default script in TERRITORY, or NULL if
+   not listed here.  Only the first 2 bytes of each argument are examined.  */
+static const struct script *
+default_script_in_territory (const char language[2], const char territory[2])
+{
+  if (memcmp (language, "az", 2) == 0)            /* Azerbaijani */
+    {
+      if (memcmp (territory, "AZ", 2) == 0)
+        return &scripts[SCRIPT_LATIN];
+      else if (memcmp (territory, "IR", 2) == 0)
+        return &scripts[SCRIPT_ARABIC];
+    }
+  else if (memcmp (language, "ku", 2) == 0)       /* Kurdish */
+    {
+      if (memcmp (territory, "IQ", 2) == 0
+          || memcmp (territory, "IR", 2) == 0)
+        return &scripts[SCRIPT_ARABIC];
+      else if (memcmp (territory, "SY", 2) == 0
+               || memcmp (territory, "TR", 2) == 0)
+        return &scripts[SCRIPT_LATIN];
+    }
+  else if (memcmp (language, "pa", 2) == 0)       /* Punjabi */
+    {
+      if (memcmp (territory, "PK", 2) == 0)
+        return &scripts[SCRIPT_ARABIC];
+      else if (memcmp (territory, "IN", 2) == 0)
+        return &scripts[SCRIPT_GURMUKHI];
+    }
+  return NULL;    /* known language in an unlisted territory, or other language */
+}
+
+/* For a language that can be written using different scripts, independently of
+   the territory, other than Inuktitut and Min Nan Chinese, these functions
+   return the scripts[] entry of the default (main) script, or NULL.  */
+static const struct script *
+default_script_for_language2 (const char language[2])
+{
+  if (memcmp (language, "be", 2) == 0)            /* Belarusian */
+    return &scripts[SCRIPT_CYRILLIC];
+  else if (memcmp (language, "bs", 2) == 0)       /* Bosnian */
+    return &scripts[SCRIPT_LATIN];
+  else if (memcmp (language, "ha", 2) == 0)       /* Hausa */
+    return &scripts[SCRIPT_LATIN];
+  else if (memcmp (language, "kk", 2) == 0)       /* Kazakh */
+    return &scripts[SCRIPT_CYRILLIC];
+  else if (memcmp (language, "ks", 2) == 0)       /* Kashmiri */
+    return &scripts[SCRIPT_ARABIC];
+  else if (memcmp (language, "mn", 2) == 0)       /* Mongolian */
+    return &scripts[SCRIPT_CYRILLIC];
+  else if (memcmp (language, "sd", 2) == 0)       /* Sindhi */
+    return &scripts[SCRIPT_ARABIC];
+  else if (memcmp (language, "sr", 2) == 0)       /* Serbian */
+    return &scripts[SCRIPT_CYRILLIC];
+  else if (memcmp (language, "uz", 2) == 0)       /* Uzbek */
+    return &scripts[SCRIPT_LATIN];
+  else if (memcmp (language, "yi", 2) == 0)       /* Yiddish */
+    return &scripts[SCRIPT_HEBREW];
+  return NULL;
+}
+
+static const struct script *
+default_script_for_language3 (const char language[3])
+{
+  if (memcmp (language, "ber", 3) == 0)           /* Berber languages */
+    return &scripts[SCRIPT_LATIN];
+  return NULL;
+}
+
+
 void
 xpg_to_bcp47 (char *bcp47, const char *xpg)
 {
@@ -205,85 +282,37 @@ xpg_to_bcp47 (char *bcp47, const char *xpg)
   if (language_len > 0 && script_subtag == NULL)
     {
       /* Languages with a script that depends on the territory.  */
-      if (territory_len > 0)
+      if (language_len == 2 && territory_len == 2)
         {
-          if (language_len == 2)
+          const struct script *sp =
+            default_script_in_territory (language_start, territory_start);
+          if (sp != NULL)
+            script_subtag = sp->code;
+          else if (memcmp (language_start, "zh", 2) == 0)
             {
-              if (memcmp (language_start, "az", 2) == 0)
-                {
-                  if (territory_len == 2)
-                    {
-                      if (memcmp (territory_start, "AZ", 2) == 0)
-                        script_subtag = "Latn";
-                      else if (memcmp (territory_start, "IR", 2) == 0)
-                        script_subtag = "Arab";
-                    }
-                }
-              else if (memcmp (language_start, "ku", 2) == 0)
-                {
-                  if (territory_len == 2)
-                    {
-                      if (memcmp (territory_start, "IQ", 2) == 0
-                          || memcmp (territory_start, "IR", 2) == 0)
-                        script_subtag = "Arab";
-                      else if (memcmp (territory_start, "SY", 2) == 0
-                               || memcmp (territory_start, "TR", 2) == 0)
-                        script_subtag = "Latn";
-                    }
-                }
-              else if (memcmp (language_start, "pa", 2) == 0)
-                {
-                  if (territory_len == 2)
-                    {
-                      if (memcmp (territory_start, "PK", 2) == 0)
-                        script_subtag = "Arab";
-                      else if (memcmp (territory_start, "IN", 2) == 0)
-                        script_subtag = "Guru";
-                    }
-                }
-              else if (memcmp (language_start, "zh", 2) == 0)
-                {
-                  if (territory_len == 2)
-                    {
-                      if (memcmp (territory_start, "CN", 2) == 0
-                          || memcmp (territory_start, "SG", 2) == 0)
-                        script_subtag = "Hans";
-                      else
-                        script_subtag = "Hant";
-                    }
-                }
+              if (memcmp (territory_start, "CN", 2) == 0
+                  || memcmp (territory_start, "SG", 2) == 0)
+                script_subtag = "Hans";
+              else
+                script_subtag = "Hant";
             }
         }
       /* Languages with a main script and one or more alternate scripts.  */
       if (language_len == 2)
         {
-          if (memcmp (language_start, "be", 2) == 0)
-            script_subtag = "Cyrl";
-          else if (memcmp (language_start, "bs", 2) == 0)
-            script_subtag = "Latn";
-          else if (memcmp (language_start, "ha", 2) == 0)
-            script_subtag = "Latn";
+          const struct script *sp =
+            default_script_for_language2 (language_start);
+          if (sp != NULL)
+            script_subtag = sp->code;
           else if (memcmp (language_start, "iu", 2) == 0)
             script_subtag = "Cans";
-          else if (memcmp (language_start, "kk", 2) == 0)
-            script_subtag = "Cyrl";
-          else if (memcmp (language_start, "ks", 2) == 0)
-            script_subtag = "Arab";
-          else if (memcmp (language_start, "mn", 2) == 0)
-            script_subtag = "Cyrl";
-          else if (memcmp (language_start, "sd", 2) == 0)
-            script_subtag = "Arab";
-          else if (memcmp (language_start, "sr", 2) == 0)
-            script_subtag = "Cyrl";
-          else if (memcmp (language_start, "uz", 2) == 0)
-            script_subtag = "Latn";
-          else if (memcmp (language_start, "yi", 2) == 0)
-            script_subtag = "Hebr";
         }
       else if (language_len == 3)
         {
-          if (memcmp (language_start, "ber", 3) == 0)
-            script_subtag = "Latn";
+          const struct script *sp =
+            default_script_for_language3 (language_start);
+          if (sp != NULL)
+            script_subtag = sp->code;
           else if (memcmp (language_start, "nan", 3) == 0)
             script_subtag = "Hant";
         }
@@ -451,83 +480,29 @@ bcp47_to_xpg (char *xpg, const char *bcp47, const char *codeset)
   if (script != NULL)
     {
       /* Languages with a script that depends on the territory.  */
-      if (territory_len > 0)
+      if (language_len == 2 && territory_len == 2)
         {
-          if (language_len == 2)
+          const struct script *sp =
+            default_script_in_territory (xpg, territory);
+          if (sp != NULL)
+            {
+              if (strcmp (script, sp->name) == 0)
+                script = NULL;
+            }
+          else if (memcmp (xpg, "zh", 2) == 0)
             {
-              if (memcmp (xpg, "az", 2) == 0)
-                {
-                  if (territory_len == 2)
-                    {
-                      if (memcmp (territory, "AZ", 2) == 0)
-                        {
-                          if (strcmp (script, "latin") == 0)
-                            script = NULL;
-                        }
-                      else if (memcmp (territory, "IR", 2) == 0)
-                        {
-                          if (strcmp (script, "arabic") == 0)
-                            script = NULL;
-                        }
-                    }
-                }
-              else if (memcmp (xpg, "ku", 2) == 0)
-                {
-                  if (territory_len == 2)
-                    {
-                      if (memcmp (territory, "IQ", 2) == 0
-                          || memcmp (territory, "IR", 2) == 0)
-                        {
-                          if (strcmp (script, "arabic") == 0)
-                            script = NULL;
-                        }
-                      else if (memcmp (territory, "SY", 2) == 0
-                               || memcmp (territory, "TR", 2) == 0)
-                        {
-                          if (strcmp (script, "latin") == 0)
-                            script = NULL;
-                        }
-                    }
-                }
-              else if (memcmp (xpg, "pa", 2) == 0)
-                {
-                  if (territory_len == 2)
-                    {
-                      if (memcmp (territory, "PK", 2) == 0)
-                        {
-                          if (strcmp (script, "arabic") == 0)
-                            script = NULL;
-                        }
-                      else if (memcmp (territory, "IN", 2) == 0)
-                        {
-                          if (strcmp (script, "gurmukhi") == 0)
-                            script = NULL;
-                        }
-                    }
-                }
-              else if (memcmp (xpg, "zh", 2) == 0)
-                {
-                  /* "Hans" and "Hant" are not present in the scripts[] table,
-                     therefore nothing to do here.  */
-                }
+              /* "Hans" and "Hant" are not present in the scripts[] table,
+                 therefore nothing to do here.  */
             }
         }
       /* Languages with a main script and one or more alternate scripts.  */
       if (language_len == 2)
         {
-          if (memcmp (xpg, "be", 2) == 0)
-            {
-              if (strcmp (script, "cyrillic") == 0)
-                script = NULL;
-            }
-          else if (memcmp (xpg, "bs", 2) == 0)
+          const struct script *sp =
+            default_script_for_language2 (xpg);
+          if (sp != NULL)
             {
-              if (strcmp (script, "latin") == 0)
-                script = NULL;
-            }
-          else if (memcmp (xpg, "ha", 2) == 0)
-            {
-              if (strcmp (script, "latin") == 0)
+              if (strcmp (script, sp->name) == 0)
                 script = NULL;
             }
           else if (memcmp (xpg, "iu", 2) == 0)
@@ -535,47 +510,14 @@ bcp47_to_xpg (char *xpg, const char *bcp47, const char *codeset)
               /* "Cans" is not present in the scripts[] table,
                  therefore nothing to do here.  */
             }
-          else if (memcmp (xpg, "kk", 2) == 0)
-            {
-              if (strcmp (script, "cyrillic") == 0)
-                script = NULL;
-            }
-          else if (memcmp (xpg, "ks", 2) == 0)
-            {
-              if (strcmp (script, "arabic") == 0)
-                script = NULL;
-            }
-          else if (memcmp (xpg, "mn", 2) == 0)
-            {
-              if (strcmp (script, "cyrillic") == 0)
-                script = NULL;
-            }
-          else if (memcmp (xpg, "sd", 2) == 0)
-            {
-              if (strcmp (script, "arabic") == 0)
-                script = NULL;
-            }
-          else if (memcmp (xpg, "sr", 2) == 0)
-            {
-              if (strcmp (script, "cyrillic") == 0)
-                script = NULL;
-            }
-          else if (memcmp (xpg, "uz", 2) == 0)
-            {
-              if (strcmp (script, "latin") == 0)
-                script = NULL;
-            }
-          else if (memcmp (xpg, "yi", 2) == 0)
-            {
-              if (strcmp (script, "hebrew") == 0)
-                script = NULL;
-            }
         }
       else if (language_len == 3)
         {
-          if (memcmp (xpg, "ber", 3) == 0)
+          const struct script *sp =
+            default_script_for_language3 (xpg);
+          if (sp != NULL)
             {
-              if (strcmp (script, "latin") == 0)
+              if (strcmp (script, sp->name) == 0)
                 script = NULL;
             }
           else if (memcmp (xpg, "nan", 3) == 0)






reply via email to

[Prev in Thread] Current Thread [Next in Thread]