[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[7405] parsetexi convert_to_utf8
From: gavinsmith0123
Subject: [7405] parsetexi convert_to_utf8
Date: Wed, 21 Sep 2016 21:42:03 +0000 (UTC)
Revision: 7405
http://svn.sv.gnu.org/viewvc/?view=rev&root=texinfo&revision=7405
Author: gavin
Date: 2016-09-21 21:42:03 +0000 (Wed, 21 Sep 2016)
Log Message:
-----------
parsetexi convert_to_utf8
Modified Paths:
--------------
trunk/tp/parsetexi/ChangeLog
trunk/tp/parsetexi/end_line.c
trunk/tp/parsetexi/input.c
trunk/tp/parsetexi/text.c
trunk/tp/parsetexi/text.h
Modified: trunk/tp/parsetexi/ChangeLog
===================================================================
--- trunk/tp/parsetexi/ChangeLog 2016-09-21 15:25:51 UTC (rev 7404)
+++ trunk/tp/parsetexi/ChangeLog 2016-09-21 21:42:03 UTC (rev 7405)
@@ -1,3 +1,8 @@
+2016-09-21 Gavin Smith <address@hidden>
+
+ * input.c (convert_to_utf8): Start on converting the character
+ encoding of the input.
+
2016-07-09 Gavin Smith <address@hidden>
* convert.c (convert_to_texinfo): New function.
Modified: trunk/tp/parsetexi/end_line.c
===================================================================
--- trunk/tp/parsetexi/end_line.c 2016-09-21 15:25:51 UTC (rev 7404)
+++ trunk/tp/parsetexi/end_line.c 2016-09-21 21:42:03 UTC (rev 7405)
@@ -578,9 +578,14 @@
{
// 5650
if (idx->merged_in)
- line_warn
- ("printing an index `%s' merged in another one, `%s'",
- arg, idx->merged_in->name);
+ {
+ INDEX *i2;
+ for (i2 = idx; (i2->merged_in); i2 = i2->merged_in)
+ ;
+ line_warn
+ ("printing an index `%s' merged in another one, `%s'",
+ arg, i2->name);
+ }
if (!current_node && !current_section && !current_region ())
{
line_warn ("printindex before document beginning: "
@@ -1480,7 +1485,8 @@
char *from; char *to;
};
static struct encoding_map map[] = {
- "utf-8", "utf-8-strict"
+ "utf-8", "utf-8-strict",
+ "us-ascii", "ascii"
};
perl_encoding = texinfo_encoding;
for (i = 0; i < sizeof map / sizeof *map; i++)
Modified: trunk/tp/parsetexi/input.c
===================================================================
--- trunk/tp/parsetexi/input.c 2016-09-21 15:25:51 UTC (rev 7404)
+++ trunk/tp/parsetexi/input.c 2016-09-21 21:42:03 UTC (rev 7405)
@@ -19,6 +19,8 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
+#include <iconv.h>
+#include <errno.h>
#include "tree_types.h"
#include "input.h"
@@ -27,10 +29,16 @@
enum input_type { IN_file, IN_text };
+enum character_encoding {
+ ce_latin1,
+ ce_utf8
+};
+
typedef struct {
enum input_type type;
FILE *file;
+ enum character_encoding input_encoding;
LINE_NR line_nr;
char *text; /* Input text to be parsed as Texinfo. */
@@ -79,6 +87,113 @@
return 0;
}
+
+/* TODO: integrate with gnulib */
+#define ICONV_CONST
+
+static iconv_t iconv_from_latin1 = (iconv_t) 0;
+
+/* Run iconv using text buffer as output buffer. */
+size_t
+text_buffer_iconv (TEXT *buf, iconv_t iconv_state,
+ ICONV_CONST char **inbuf, size_t *inbytesleft)
+{
+ size_t out_bytes_left;
+ char *outptr;
+ size_t iconv_ret;
+
+ outptr = buf->text + buf->end;
+ out_bytes_left = buf->space - buf->end;
+ iconv_ret = iconv (iconv_state, inbuf, inbytesleft,
+ &outptr, &out_bytes_left);
+
+ buf->end = outptr - buf->text;
+
+ return iconv_ret;
+}
+
+
+
+/* Return conversion of S according to ENC. This function frees S. */
+static char *
+convert_to_utf8 (char *s, enum character_encoding enc)
+{
+ iconv_t our_iconv;
+ static TEXT t;
+ char *inptr; size_t bytes_left;
+ size_t iconv_ret;
+
+ /* Convert from @documentencoding to UTF-8.
+ It might be possible not to convert to UTF-8 and use an 8-bit encoding
+ throughout, but then we'd have to not set the UTF-8 flag on the Perl
+ strings in api.c. If multiple character encodings were used in a single
+ file, then we'd have to keep track of which strings needed the UTF-8 flag
+ and which didn't. */
+
+ /* Could and check for malformed input: see
+ <http://savannah.gnu.org/bugs/?42896>. */
+
+ if (iconv_from_latin1 == (iconv_t) 0)
+ {
+ /* Initialize the conversion for the first time. */
+ iconv_from_latin1 = iconv_open ("UTF-8", "ISO-8859-1");
+ if (iconv_from_latin1 == (iconv_t) -1)
+ {
+ abort ();
+
+ /* big trouble. if we do return it unconverted, we will have to
+ remember not to set the UTF-8 flags on the Perl strings, otherwise
+ Perl will choke. */
+ return s;
+ }
+ }
+
+ switch (enc)
+ {
+ case ce_latin1:
+ our_iconv = iconv_from_latin1;
+ break;
+ case ce_utf8:
+ return s; /* no conversion required. */
+ break;
+ }
+
+ t.end = 0;
+ inptr = s;
+ bytes_left = strlen (s);
+ text_alloc (&t, 10);
+
+ while (1)
+ {
+ iconv_ret = text_buffer_iconv (&t, our_iconv,
+ &inptr, &bytes_left);
+
+ /* Make sure libiconv flushes out the last converted character.
+ This is required when the conversion is stateful, in which
+ case libiconv might not output the last character, waiting to
+ see whether it should be combined with the next one. */
+ if (iconv_ret != (size_t) -1
+ && text_buffer_iconv (&t, our_iconv, 0, 0) != (size_t) -1)
+ /* Success: all of input converted. */
+ break;
+
+ switch (errno)
+ {
+ case E2BIG:
+ text_alloc (&t, t.space + 20);
+ break;
+ default:
+ abort ();
+ break;
+ }
+ }
+
+ free (s);
+ t.text[t.end] = '\0';
+ //fprintf (stderr, "CONVERTED STRING IS <<%s>>", t.text);
+ return strdup (t.text);
+}
+
/* Return value to be freed by caller. Return null if we are out of input. */
char *
next_text (void)
@@ -116,7 +231,7 @@
line_nr = i->line_nr;
- return new;
+ return convert_to_utf8 (new, 0); // i->input_encoding);
break;
case IN_file: // 1911
@@ -139,18 +254,12 @@
if (comment)
*comment = '\0';
- /* TODO: convert from @documentencoding to UTF-8, assuming we
- want to use UTF-8 internally. */
-
- /* Could and check for malformed input: see
- <http://savannah.gnu.org/bugs/?42896>. */
-
// 1920 CPP_LINE_DIRECTIVES
i->line_nr.line_nr++;
line_nr = i->line_nr;
- return line;
+ return convert_to_utf8 (line, 0); // i->input_encoding);
}
free (line); line = 0;
break;
Modified: trunk/tp/parsetexi/text.c
===================================================================
--- trunk/tp/parsetexi/text.c 2016-09-21 15:25:51 UTC (rev 7404)
+++ trunk/tp/parsetexi/text.c 2016-09-21 21:42:03 UTC (rev 7405)
@@ -23,7 +23,7 @@
#include "text.h"
/* Make sure there are LEN free bytes. */
-static void
+void
text_alloc (TEXT *t, size_t len)
{
if (t->end + len > t->space)
Modified: trunk/tp/parsetexi/text.h
===================================================================
--- trunk/tp/parsetexi/text.h 2016-09-21 15:25:51 UTC (rev 7404)
+++ trunk/tp/parsetexi/text.h 2016-09-21 21:42:03 UTC (rev 7405)
@@ -17,5 +17,6 @@
void text_append (TEXT *t, char *s);
void text_append_n (TEXT *t, char *s, size_t len);
void text_printf (TEXT *t, char *format, ...);
+void text_alloc (TEXT *t, size_t len);
#define text_base(t) ((t)->space ? (t)->text : (char *) 0)
[Prev in Thread] Current Thread [Next in Thread]
- [7405] parsetexi convert_to_utf8, gavinsmith0123 <=