[7405] parsetexi convert_to_utf8

texinfo-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[7405] parsetexi convert_to_utf8

From:	gavinsmith0123
Subject:	[7405] parsetexi convert_to_utf8
Date:	Wed, 21 Sep 2016 21:42:03 +0000 (UTC)
Revision: 7405
          http://svn.sv.gnu.org/viewvc/?view=rev&root=texinfo&revision=7405
Author:   gavin
Date:     2016-09-21 21:42:03 +0000 (Wed, 21 Sep 2016)
Log Message:
-----------
parsetexi convert_to_utf8

Modified Paths:
--------------
    trunk/tp/parsetexi/ChangeLog
    trunk/tp/parsetexi/end_line.c
    trunk/tp/parsetexi/input.c
    trunk/tp/parsetexi/text.c
    trunk/tp/parsetexi/text.h

Modified: trunk/tp/parsetexi/ChangeLog
===================================================================
--- trunk/tp/parsetexi/ChangeLog        2016-09-21 15:25:51 UTC (rev 7404)
+++ trunk/tp/parsetexi/ChangeLog        2016-09-21 21:42:03 UTC (rev 7405)
@@ -1,3 +1,8 @@
+2016-09-21  Gavin Smith  <address@hidden>
+
+       * input.c (convert_to_utf8): Start on converting the character 
+       encoding of the input.
+
 2016-07-09  Gavin Smith  <address@hidden>
 
        * convert.c (convert_to_texinfo): New function.

Modified: trunk/tp/parsetexi/end_line.c
===================================================================
--- trunk/tp/parsetexi/end_line.c       2016-09-21 15:25:51 UTC (rev 7404)
+++ trunk/tp/parsetexi/end_line.c       2016-09-21 21:42:03 UTC (rev 7405)
@@ -578,9 +578,14 @@
               {
                 // 5650
                 if (idx->merged_in)
-                  line_warn
-                    ("printing an index `%s' merged in another one, `%s'",
-                     arg, idx->merged_in->name);
+                  {
+                    INDEX *i2;
+                    for (i2 = idx; (i2->merged_in); i2 = i2->merged_in)
+                      ;
+                    line_warn
+                      ("printing an index `%s' merged in another one, `%s'",
+                       arg, i2->name);
+                  }
                 if (!current_node && !current_section && !current_region ())
                   {
                     line_warn ("printindex before document beginning: "
@@ -1480,7 +1485,8 @@
                       char *from; char *to;
                   };
                   static struct encoding_map map[] = {
-                      "utf-8", "utf-8-strict"
+                      "utf-8", "utf-8-strict",
+                      "us-ascii", "ascii"
                   };
                   perl_encoding = texinfo_encoding;
                   for (i = 0; i < sizeof map / sizeof *map; i++)

Modified: trunk/tp/parsetexi/input.c
===================================================================
--- trunk/tp/parsetexi/input.c  2016-09-21 15:25:51 UTC (rev 7404)
+++ trunk/tp/parsetexi/input.c  2016-09-21 21:42:03 UTC (rev 7405)
@@ -19,6 +19,8 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
+#include <iconv.h>
+#include <errno.h>
 
 #include "tree_types.h"
 #include "input.h"
@@ -27,10 +29,16 @@
 
 enum input_type { IN_file, IN_text };
 
+enum character_encoding {
+    ce_latin1,
+    ce_utf8
+};
+
 typedef struct {
     enum input_type type;
 
     FILE *file;
+    enum character_encoding input_encoding;
     LINE_NR line_nr;
 
     char *text;  /* Input text to be parsed as Texinfo. */
@@ -79,6 +87,113 @@
     return 0;
 }
 
+
+/* TODO: integrate with gnulib */
+#define ICONV_CONST
+
+static iconv_t iconv_from_latin1 = (iconv_t) 0;
+
+/* Run iconv using text buffer as output buffer. */
+size_t
+text_buffer_iconv (TEXT *buf, iconv_t iconv_state,
+                   ICONV_CONST char **inbuf, size_t *inbytesleft)
+{
+  size_t out_bytes_left;
+  char *outptr;
+  size_t iconv_ret;
+
+  outptr = buf->text + buf->end;
+  out_bytes_left = buf->space - buf->end;
+  iconv_ret = iconv (iconv_state, inbuf, inbytesleft,
+                     &outptr, &out_bytes_left);
+
+  buf->end = outptr - buf->text;
+
+  return iconv_ret;
+}
+
+
+
+/* Return conversion of S according to ENC.  This function frees S. */
+static char *
+convert_to_utf8 (char *s, enum character_encoding enc)
+{
+  iconv_t our_iconv;
+  static TEXT t;
+  char *inptr; size_t bytes_left;
+  size_t iconv_ret;
+
+  /* Convert from @documentencoding to UTF-8.
+       It might be possible not to convert to UTF-8 and use an 8-bit encoding
+     throughout, but then we'd have to not set the UTF-8 flag on the Perl 
+     strings in api.c.  If multiple character encodings were used in a single 
+     file, then we'd have to keep track of which strings needed the UTF-8 flag
+     and which didn't. */
+
+  /* Could and check for malformed input: see
+     <http://savannah.gnu.org/bugs/?42896>. */
+
+  if (iconv_from_latin1 == (iconv_t) 0)
+    {
+      /* Initialize the conversion for the first time. */
+      iconv_from_latin1 = iconv_open ("UTF-8", "ISO-8859-1");
+      if (iconv_from_latin1 == (iconv_t) -1)
+        {
+          abort ();
+
+          /* big trouble.  if we do return it unconverted, we will have to
+             remember not to set the UTF-8 flags on the Perl strings, otherwise
+             Perl will choke. */
+          return s;
+        }
+    }
+
+  switch (enc)
+    {
+    case ce_latin1:
+      our_iconv = iconv_from_latin1;
+      break;
+    case ce_utf8:
+      return s; /* no conversion required. */
+      break;
+    }
+
+  t.end = 0;
+  inptr = s;
+  bytes_left = strlen (s);
+  text_alloc (&t, 10);
+
+  while (1)
+    {
+      iconv_ret = text_buffer_iconv (&t, our_iconv,
+                                     &inptr, &bytes_left);
+
+      /* Make sure libiconv flushes out the last converted character.
+         This is required when the conversion is stateful, in which
+         case libiconv might not output the last character, waiting to
+         see whether it should be combined with the next one.  */
+      if (iconv_ret != (size_t) -1
+          && text_buffer_iconv (&t, our_iconv, 0, 0) != (size_t) -1)
+        /* Success: all of input converted. */
+        break;
+
+      switch (errno)
+        {
+        case E2BIG:
+          text_alloc (&t, t.space + 20);
+          break;
+        default:
+          abort ();
+          break;
+        }
+    }
+
+  free (s);
+  t.text[t.end] = '\0';
+  //fprintf (stderr, "CONVERTED STRING IS <<%s>>", t.text);
+  return strdup (t.text);
+}
+
 /* Return value to be freed by caller.  Return null if we are out of input. */
 char *
 next_text (void)
@@ -116,7 +231,7 @@
 
           line_nr = i->line_nr;
 
-          return new;
+          return convert_to_utf8 (new, 0); // i->input_encoding);
 
           break;
         case IN_file: // 1911
@@ -139,18 +254,12 @@
               if (comment)
                 *comment = '\0';
 
-              /* TODO: convert from @documentencoding to UTF-8, assuming we 
-                 want to use UTF-8 internally. */
-
-              /* Could and check for malformed input: see
-                 <http://savannah.gnu.org/bugs/?42896>. */
-
               // 1920 CPP_LINE_DIRECTIVES
 
               i->line_nr.line_nr++;
               line_nr = i->line_nr;
 
-              return line;
+              return convert_to_utf8 (line, 0); // i->input_encoding);
             }
           free (line); line = 0;
           break;

Modified: trunk/tp/parsetexi/text.c
===================================================================
--- trunk/tp/parsetexi/text.c   2016-09-21 15:25:51 UTC (rev 7404)
+++ trunk/tp/parsetexi/text.c   2016-09-21 21:42:03 UTC (rev 7405)
@@ -23,7 +23,7 @@
 #include "text.h"
 
 /* Make sure there are LEN free bytes. */
-static void
+void
 text_alloc (TEXT *t, size_t len)
 {
   if (t->end + len > t->space)

Modified: trunk/tp/parsetexi/text.h
===================================================================
--- trunk/tp/parsetexi/text.h   2016-09-21 15:25:51 UTC (rev 7404)
+++ trunk/tp/parsetexi/text.h   2016-09-21 21:42:03 UTC (rev 7405)
@@ -17,5 +17,6 @@
 void text_append (TEXT *t, char *s);
 void text_append_n (TEXT *t, char *s, size_t len);
 void text_printf (TEXT *t, char *format, ...);
+void text_alloc (TEXT *t, size_t len);
 
 #define text_base(t) ((t)->space ? (t)->text : (char *) 0)
[Prev in Thread]
Current Thread
[Next in Thread]
[7405] parsetexi convert_to_utf8, gavinsmith0123 <=
Prev by Date: [7404] @synindex update ->{'merged_indices'}
Next by Date: [7406] use feature 'unicode_strings'
Previous by thread: [7404] @synindex update ->{'merged_indices'}
Next by thread: [7406] use feature 'unicode_strings'
Index(es):
- Date
- Thread