texinfo-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[no subject]


From: Gavin D. Smith
Date: Fri, 20 Oct 2023 16:14:47 -0400 (EDT)

branch: master
commit 49c596d717347c03e1d06df2d5ec05c92c94af11
Author: Gavin Smith <gavinsmith0123@gmail.com>
AuthorDate: Fri Oct 20 20:56:40 2023 +0100

    * tp/Texinfo/XS/xspara.c (xspara_add_text):
    Keep track of both the type of current (block of) characters to
    process and the type of the next (block of) characters.
    Still only operate on one character at a time for now.
---
 ChangeLog              |   7 +++
 tp/Texinfo/XS/xspara.c | 144 ++++++++++++++++++++++++++++++++-----------------
 2 files changed, 101 insertions(+), 50 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 682711c978..753c714283 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2023-10-20  Gavin Smith <gavinsmith0123@gmail.com>
+
+       * tp/Texinfo/XS/xspara.c (xspara_add_text):
+       Keep track of both the type of current (block of) characters to
+       process and the type of the next (block of) characters.
+       Still only operate on one character at a time for now.
+
 2023-10-20  Gavin Smith <gavinsmith0123@gmail.com>
 
        * tp/Texinfo/XS/xspara.c (xspara_add_text): Rearrange code
diff --git a/tp/Texinfo/XS/xspara.c b/tp/Texinfo/XS/xspara.c
index d15d59f546..58901d12a8 100644
--- a/tp/Texinfo/XS/xspara.c
+++ b/tp/Texinfo/XS/xspara.c
@@ -882,7 +882,8 @@ xspara_set_space_protection (int no_break,
 /*****************************************************************/
 
 enum text_class { type_NULL, type_spaces, type_regular,
-                 type_double_width, type_unknown };
+                 type_double_width, type_EOS, type_finished,
+                 type_unknown };
 
 /* Return string to be added to paragraph contents, wrapping text. This 
    function relies on there being a UTF-8 locale in LC_CTYPE for mbrtowc to
@@ -890,12 +891,12 @@ enum text_class { type_NULL, type_spaces, type_regular,
 TEXT
 xspara_add_text (char *text, int len)
 {
-  char *p = text;
-  wchar_t wc;
-  size_t char_len;
+  char *p = text, *q = 0;
+  wchar_t wc, wc_fw;
+  size_t next_len = 0;
   int width;
   static TEXT result;
-  enum text_class type = type_NULL;
+  enum text_class type = type_NULL, next_type = type_NULL;
 
   dTHX;
 
@@ -903,7 +904,7 @@ xspara_add_text (char *text, int len)
 
   state.end_line_count = 0;
 
-  while (len > 0)
+  while (1)
     {
       if (debug)
         {
@@ -915,47 +916,93 @@ xspara_add_text (char *text, int len)
                     state.word.end > 0 ? state.word.text : "UNDEF");
         }
 
-      /* Get the type of the next character.  Set wc and char_len
-         if it is not a space. */
-      if (isspace ((unsigned char) *p))
-        {
-          type = type_spaces;
-        }
-      else
+      /* p is now at the beginning of the text we have left to process.
+         next_type is set to the type of the next block, or is null. */
+
+      type = next_type;
+
+      q = p;
+      q += next_len; len -= next_len; /* Skip over the last character
+                                         processed. */
+
+      /* Set q to the end of the block.  Set next_len to the length of
+         the following character, and next_type to the type of
+         the block after. */
+      while (1)
         {
-          /* Set wc and char_len */
-          if (!PRINTABLE_ASCII(*p))
+          if (len <= 0)
             {
-              char_len = mbrtowc (&wc, p, len, NULL);
+              next_type = type_finished;
             }
-          else
+          else if (isspace ((unsigned char) *q))
             {
-              /* Functonally the same as mbrtowc but (tested) slightly
-                 quicker. */
-              char_len = 1;
-              wc = btowc (*p);
+              next_type = type_spaces;
+              next_len = 1;
             }
-
-          if ((long) char_len == 0)
-            break; /* Null character. Shouldn't happen. */
-          else if ((long) char_len < 0)
+          else if (*q == '\b')
             {
-              p++; len--; /* Invalid.  Just try to keep going. */
-              continue;
+            /* Code to say that a following full stop (or question or
+               exclamation mark) may be an end of sentence. */
+              next_type = type_EOS;
+              next_len = 1;
             }
-
-         /* Note: width == 0 includes accent characters which should not
-            properly increase the column count.  This is not what the pure
-            Perl code does, though. */
-          width = wcwidth (wc);
-          if (width == 1 || width == 0)
-            type = type_regular;
-          else if (width == 2)
-            type = type_double_width;
           else
-            type = type_unknown;
+            {
+              /* Set wc and next_len */
+              if (!PRINTABLE_ASCII(*q))
+                {
+                  next_len = mbrtowc (&wc, q, len, NULL);
+                }
+              else
+                {
+                  /* Functionally the same as mbrtowc but (tested) slightly
+                     quicker. */
+                  next_len = 1;
+                  wc = btowc (*q);
+                }
+
+              if ((long) next_len == 0)
+                break; /* Null character. Shouldn't happen. */
+              else if ((long) next_len < 0)
+                {
+                  q++; len--; /* Invalid.  Just try to keep going. */
+                  continue;
+                }
+
+             /* Note: width == 0 includes accent characters which should not
+                properly increase the column count.  This is not what the pure
+                Perl code does, though. */
+              width = wcwidth (wc);
+              if (width == 1 || width == 0)
+                next_type = type_regular;
+              else if (width == 2)
+                {
+                  next_type = type_double_width;
+                  wc_fw = wc; /* final value is used below */
+                }
+              else
+                next_type = type_unknown;
+            }
+
+          /* TODO: test just one character at a time to start.  then
+             we can gradually work on the various blocks of
+             code to operate on multiple characters. */
+          if (1 || next_type != type || next_type == type_finished)
+            break;
+
+          q += next_len; len -= next_len;
         }
 
+      /* For the very start of the string. */
+      if (type == type_NULL)
+        continue;
+
+      /* Now type is the type of the block we are about to operate on, and
+         next_type the one after it.  p is the beginning of the span and q
+         is the end. */
+
+      if (type == type_finished)
+        break;
       /*************** Whitespace character. *********************/
       if (type == type_spaces)
         {
@@ -1055,23 +1102,21 @@ xspara_add_text (char *text, int len)
               xspara__end_line ();
               text_append (&result, "\n");
             }
-          p++; len--;
           state.last_letter = ' ';
-          continue;
         }
 
       /*************** Double width character. *********************/
-      if (type == type_double_width)
+      else if (type == type_double_width)
         {
           if (debug)
             fprintf (stderr, "FULLWIDTH\n");
 
-          text_append_n (&state.word, p, char_len);
+          text_append_n (&state.word, p, q - p);
           state.word_counter += 2;
 
           /* fullwidth latin letters can be upper case, so it is important to
              use the actual characters here. */
-          state.last_letter = wc;
+          state.last_letter = wc_fw;
 
           /* We allow a line break in between Chinese characters even if
              there was no space between them, unlike single-width
@@ -1090,20 +1135,18 @@ xspara_add_text (char *text, int len)
             }
           state.end_sentence = -2;
         }
-      else if (wc == L'\b')
+      else if (type == type_EOS)
         {
-          /* Code to say that a following full stop (or question or
-             exclamation mark) may be an end of sentence. */
           xspara_allow_end_sentence ();
         }
       /*************** Word character ******************************/
       else if (type == type_regular)
         {
           static char added_word[8]; /* long enough for one UTF-8 character */
-          memcpy (added_word, p, char_len);
-          added_word[char_len] = '\0';
+          memcpy (added_word, p, q - p);
+          added_word[q - p] = '\0';
 
-          xspara__add_next (&result, added_word, char_len, 0);
+          xspara__add_next (&result, added_word, q - p, 0);
 
           /* Now check if it is considered as an end of sentence, and
              set state.end_sentence if it is. */
@@ -1142,9 +1185,10 @@ xspara_add_text (char *text, int len)
           /* Not printable, possibly a tab, or a combining character.
              Add it to the pending word without increasing the column
              count. */
-          text_append_n (&state.word, p, char_len);
+          text_append_n (&state.word, p, q - p);
         }
-      p += char_len; len -= char_len;
+
+      p = q;
     }
 
   return result;



reply via email to

[Prev in Thread] Current Thread [Next in Thread]