branch master updated: Revert change on 2023-11-09 "Locale-independent XS paragraph formatting".

texinfo-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
branch master updated: Revert change on 2023-11-09 "Locale-independent XS paragraph formatting".

From:	Gavin D. Smith
Subject:	branch master updated: Revert change on 2023-11-09 "Locale-independent XS paragraph formatting".
Date:	Fri, 10 Nov 2023 11:50:09 -0500
This is an automated email from the git hooks/post-receive script.

gavin pushed a commit to branch master
in repository texinfo.

The following commit(s) were added to refs/heads/master by this push:
     new 56028a44ba Revert change on 2023-11-09 "Locale-independent XS paragraph formatting".
56028a44ba is described below

commit 56028a44baffb7d4b7626fc557b974d66a0c5d98
Author: Gavin Smith <gavinsmith0123@gmail.com>
AuthorDate: Fri Nov 10 16:50:00 2023 +0000

    Revert change on 2023-11-09 "Locale-independent XS paragraph
    formatting".
    
    This gives inconsistent test results between XS and pure Perl.
    Reverting until this can be investigated.
---
 ChangeLog                 |   8 +++
 tp/Texinfo/XS/Makefile.am |   2 +-
 tp/Texinfo/XS/xspara.c    | 158 +++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 145 insertions(+), 23 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 2b22db2409..a5a51cc1bf 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2023-11-10  Gavin Smith <gavinsmith0123@gmail.com>
+
+       Revert change on 2023-11-09 "Locale-independent XS paragraph
+       formatting".
+
+       This gives inconsistent test results between XS and pure Perl.
+       Reverting until this can be investigated.
+
 2023-11-09  Patrice Dumas  <pertusus@free.fr>
 
        * tp/Texinfo/XS/convert/ConvertXS.xs (html_prepare_conversion_units),
diff --git a/tp/Texinfo/XS/Makefile.am b/tp/Texinfo/XS/Makefile.am
index 6fc24a0b3a..6b63bd01f0 100644
--- a/tp/Texinfo/XS/Makefile.am
+++ b/tp/Texinfo/XS/Makefile.am
@@ -97,7 +97,7 @@ XSParagraph_la_SOURCES = XSParagraph.c xspara.c xspara.h \
 XSParagraph_la_CFLAGS = $(XSLIBS_CFLAGS)
 XSParagraph_la_CPPFLAGS = $(AM_CPPFLAGS) $(GNULIB_CPPFLAGS) $(XSLIBS_CPPFLAGS)
 XSParagraph_la_LIBADD = $(builddir)/gnulib/lib/libgnu.la
-XSParagraph_la_LDFLAGS = $(AM_LDFLAGS) $(XSLIBS_LDFLAGS) $(LTLIBINTL) $(LTLIBICONV) $(LTLIBUNISTRING) $(LTLIBC32CONV)
+XSParagraph_la_LDFLAGS = $(AM_LDFLAGS) $(XSLIBS_LDFLAGS) $(LTLIBINTL) $(LTLIBICONV) $(LTLIBUNISTRING)
 
 EXTRA_DIST += XSParagraph.xs MiscXS.xs
 
diff --git a/tp/Texinfo/XS/xspara.c b/tp/Texinfo/XS/xspara.c
index a80f05c683..60e3eba02a 100644
--- a/tp/Texinfo/XS/xspara.c
+++ b/tp/Texinfo/XS/xspara.c
@@ -29,9 +29,8 @@
    perl.h includes ctype.h.  */
 #include <ctype.h>
 #endif
-
-#include <unistr.h>
-#include <uchar.h>
+#include <wchar.h>
+#include <wctype.h>
 
 /* See "How do I use all this in extensions" in 'man perlguts'. */
 #define PERL_NO_GET_CONTEXT
@@ -85,7 +84,7 @@ typedef struct {
     int end_line_count; /* Number of newlines so far in an output unit, i.e.
                            with add_text or add_next. */
 
-    char32_t last_letter; /* Last letter in word, used to decide if we're
+    wint_t last_letter; /* Last letter in word, used to decide if we're
                             at the end of a sentence. */
 
     /* Options set with set_space_protection. */
@@ -270,7 +269,101 @@ xspara__print_escaped_spaces (char *string, size_t len)
 int
 xspara_init (int unused, char *unused2)
 {
-  return 1;
+  char *utf8_locale = 0;
+  int len;
+  char *cur;
+  char *dot;
+
+  dTHX;
+
+#if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8)
+  /* needed due to thread-safe locale handling in newer perls */
+  switch_to_global_locale();
+#endif
+
+  if (setlocale (LC_CTYPE, "en_US.UTF-8")
+      || setlocale (LC_CTYPE, "en_US.utf8"))
+    goto success;
+
+  cur = setlocale (LC_CTYPE, 0); /* Name of current locale. */
+  if (!cur)
+    goto failure;
+  len = strlen (cur);
+  if ((len >= 6 && !memcmp (".UTF-8", cur + len - 6, 6))
+      || (len >= 5 && !memcmp (".utf8", cur + len - 5, 5))
+      || (len >= 6 && !memcmp (".utf-8", cur + len - 6, 6))
+      || (len >= 5 && !memcmp (".UTF8", cur + len - 5, 5)))
+    {
+      setlocale (LC_CTYPE, ""); /* Use the locale from the environment. */
+      goto success;
+    }
+
+  /* Otherwise try altering the current locale name. */
+  dot = strchr (cur, '.');
+  if (!dot)
+    dot = cur + len;
+  utf8_locale = malloc (len + 6 + 1); /* enough to add ".UTF-8" to end */
+  memcpy (utf8_locale, cur, dot - cur);
+  dot = utf8_locale + (dot - cur);
+  memcpy (dot, ".UTF-8", 7);
+  if (setlocale (LC_CTYPE, utf8_locale))
+    goto success;
+
+  memcpy (dot, ".utf8", 6);
+  if (setlocale (LC_CTYPE, utf8_locale))
+    goto success;
+
+  /* Otherwise, look for any UTF-8 locale in the output of "locale -a". */
+  {
+  FILE *p;
+  char *line = 0;
+  size_t n = 0;
+  ssize_t ret;
+  p = popen ("locale -a", "r");
+  if (!p)
+    goto failure;
+  while (1)
+    {
+      ret = getline (&line, &n, p);
+      if (ret == (ssize_t) -1)
+        {
+          free (line);
+          pclose (p);
+          goto failure;
+        }
+      if (strstr (line, "UTF-8") || strstr (line, "utf8"))
+        {
+          line[ret - 1] = '\0';   /* Remove trailing newline. */
+          if (setlocale (LC_CTYPE, line))
+            {
+              free (line);
+              pclose (p);
+              goto success;
+            }
+        }
+    }
+  }
+      
+  if (1)
+    {
+failure:
+      return 0; /* failure */
+    }
+  else
+    {
+success: ;
+      free (utf8_locale);
+#if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8)
+      /* needed due to thread-safe locale handling in newer perls */
+      sync_locale();
+#endif
+      /*
+      fprintf (stderr, "tried to set LC_CTYPE to UTF-8.\n");
+      fprintf (stderr, "character encoding is: %s\n",
+               nl_langinfo (CODESET));
+       */
+      return 1; /* success */
+    }
 }
 
 /* Array for storing paragraph states which aren't in use. */
@@ -330,7 +423,7 @@ xspara_new (HV *conf)
   state.max = 72;
   state.indent_length_next = -1; /* Special value meaning undefined. */
   state.end_sentence = eos_undef;
-  state.last_letter = U'\0';
+  state.last_letter = L'\0';
 
   if (conf)
     xspara_init_state (conf);
@@ -448,7 +541,7 @@ xspara__end_line (void)
   state.lines_counter++;
   state.end_line_count++;
   /* could be set to other values, anything that is not upper case. */
-  state.last_letter = U'\n';
+  state.last_letter = L'\n';
 }
 
 char *
@@ -563,7 +656,7 @@ xspara_end (void)
     fprintf (stderr, "PARA END\n");
 
   /* probably not really useful, but cleaner */
-  state.last_letter = U'\0';
+  state.last_letter = L'\0';
 
   xspara__add_pending_word (&ret, state.add_final_space);
   if (!state.no_final_newline && state.counter != 0)
@@ -630,10 +723,18 @@ xspara__add_next (TEXT *result, char *word, int word_len, int transparent)
           if (!strchr (end_sentence_characters
                        after_punctuation_characters, *p))
             {
-              char32_t wc;
-              u8_mbtouc (&wc, p, len);
-              state.last_letter = wc;
-              break;
+              if (!PRINTABLE_ASCII(*p))
+                {
+                  wchar_t wc = L'\0';
+                  mbrtowc (&wc, p, len, NULL);
+                  state.last_letter = wc;
+                  break;
+                }
+              else
+                {
+                  state.last_letter = btowc (*p);
+                  break;
+                }
             }
         }
     }
@@ -650,7 +751,7 @@ xspara__add_next (TEXT *result, char *word, int word_len, int transparent)
       /* Calculate length of multibyte string in characters. */
       int len = 0;
       int left = word_len;
-      char32_t w;
+      wchar_t w;
       char *p = word;
 
       while (left > 0)
@@ -664,7 +765,7 @@ xspara__add_next (TEXT *result, char *word, int word_len, int transparent)
               continue;
             }
 
-          char_len = u8_mbtouc (&w, p, left);
+          char_len = mbrtowc (&w, p, left, NULL);
           if (char_len == (size_t) -2) {
             /* unfinished multibyte character */
             char_len = left;
@@ -678,7 +779,7 @@ xspara__add_next (TEXT *result, char *word, int word_len, int transparent)
           }
           left -= char_len;
 
-          columns = c32width (w);
+          columns = wcwidth (w);
           if (columns > 0)
             len += columns;
 
@@ -732,7 +833,7 @@ xspara_add_end_sentence (int value)
 void
 xspara_allow_end_sentence (void)
 {
-  state.last_letter = U'a'; /* A lower-case letter. */
+  state.last_letter = L'a'; /* A lower-case letter. */
 }
 
 /* -1 in a parameter means leave that value as it is. */
@@ -781,12 +882,14 @@ enum text_class { type_NULL, type_spaces, type_regular,
                  type_double_width, type_EOS, type_finished,
                  type_unknown };
 
-/* Return string to be added to paragraph contents, wrapping text. */
+/* Return string to be added to paragraph contents, wrapping text. This 
+   function relies on there being a UTF-8 locale in LC_CTYPE for mbrtowc to
+   work correctly. */
 TEXT
 xspara_add_text (char *text, int len)
 {
   char *p = text, *q = 0;
-  char32_t wc, wc_fw;
+  wchar_t wc, wc_fw;
   size_t next_len = 0;
   int width;
   static TEXT result;
@@ -843,7 +946,18 @@ xspara_add_text (char *text, int len)
             }
           else
             {
-              next_len = u8_mbtouc (&wc, q, len);
+              /* Set wc and next_len */
+              if (!PRINTABLE_ASCII(*q))
+                {
+                  next_len = mbrtowc (&wc, q, len, NULL);
+                }
+              else
+                {
+                  /* Functionally the same as mbrtowc but (tested) slightly
+                     quicker. */
+                  next_len = 1;
+                  wc = btowc (*q);
+                }
 
               if ((long) next_len == 0)
                 break; /* Null character. Shouldn't happen. */
@@ -856,7 +970,7 @@ xspara_add_text (char *text, int len)
              /* Note: width == 0 includes accent characters which should not
                 properly increase the column count.  This is not what the pure
                 Perl code does, though. */
-              width = c32width (wc);
+              width = wcwidth (wc);
               if (width == 1 || width == 0)
                 next_type = type_regular;
               else if (width == 2)
@@ -977,7 +1091,7 @@ xspara_add_text (char *text, int len)
               xspara__end_line ();
               text_append (&result, "\n");
             }
-          state.last_letter = U' ';
+          state.last_letter = ' ';
         }
 
       /*************** Double width character. *********************/
@@ -1029,7 +1143,7 @@ xspara_add_text (char *text, int len)
               if (strchr (end_sentence_characters, *q2) && !state.unfilled)
                 {
                   /* Doesn't count if preceded by an upper-case letter. */
-                  if (!c32isupper (state.last_letter))
+                  if (!iswupper (state.last_letter))
                     {
                       if (state.french_spacing)
                         state.end_sentence = eos_present_frenchspacing;
[Prev in Thread]
Current Thread
[Next in Thread]
branch master updated: Revert change on 2023-11-09 "Locale-independent XS paragraph formatting"., Gavin D. Smith <=
Prev by Date: [no subject]
Next by Date: [no subject]
Previous by thread: master updated (e3a28cc9bf -> bd7b8f424b)
Next by thread: master updated (56028a44ba -> a6b77aac3d)
Index(es):
- Date
- Thread