[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [striconveh] Error handling and Unicode replacement character
From: Bruno Haible
Subject: Re: [striconveh] Error handling and Unicode replacement character
Date: Sat, 01 Jan 2022 19:55:28 +0100
Marc Nieper-Wißkirchen wrote on 2021-12-30:
> The striconveh module and related modules offer an error handler
> argument. The current possible values are:
>
> iconveh_error
> iconveh_question_mark
> iconveh_escape_sequence
>
> The second option replaces any unconvertible character with a question mark
> "?".
>
> I would like to request to add a fourth option, say,
> iconveh_replacement_character, which is like iconveh_question_mark but
> uses U+FFFD whenever the target codeset is a Unicode codeset.
That's a good suggestion, as nowadays people are frequently converting
to UTF-8 or GB18030. Implemented as follows.
2022-01-01 Bruno Haible <bruno@clisp.org>
striconveh: Support an error handler that produces a Unicode U+FFFD.
Suggested by Marc Nieper-Wißkirchen in
<https://lists.gnu.org/archive/html/bug-gnulib/2021-12/msg00175.html>.
* lib/iconveh.h (iconveh_replacement_character): New enum value.
* lib/striconveh.c (mem_cd_iconveh_internal): When the handler is
iconveh_replacement_character, try to produce U+FFFD when possible,
instead of '?'.
* tests/test-striconveh.c (main): Add GB18030 tests. Test also
iconveh_replacement_character.
diff --git a/lib/iconveh.h b/lib/iconveh.h
index d321d34cb..058f68ca2 100644
--- a/lib/iconveh.h
+++ b/lib/iconveh.h
@@ -29,7 +29,10 @@ enum iconv_ilseq_handler
{
iconveh_error, /* return and set errno = EILSEQ */
iconveh_question_mark, /* use one '?' per unconvertible character */
- iconveh_escape_sequence /* use escape sequence \uxxxx or \Uxxxxxxxx */
+ iconveh_escape_sequence, /* use escape sequence \uxxxx or \Uxxxxxxxx */
+ iconveh_replacement_character /* use one U+FFFD per unconvertible character
+ if that fits in the target encoding,
+ otherwise one '?' */
};
diff --git a/lib/striconveh.c b/lib/striconveh.c
index 4aa8a2f07..612c38c3e 100644
--- a/lib/striconveh.c
+++ b/lib/striconveh.c
@@ -457,13 +457,18 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
if (cd2 == (iconv_t)(-1))
{
/* TO_CODESET is UTF-8. */
- /* Error handling can produce up to 1 byte of output. */
- if (length + 1 + extra_alloc > allocated)
+ /* Error handling can produce up to 1 or 3 bytes of
+ output. */
+ size_t extra_need =
+ (handler == iconveh_replacement_character ? 3 : 1);
+ if (length + extra_need + extra_alloc > allocated)
{
char *memory;
allocated = 2 * allocated;
- if (length + 1 + extra_alloc > allocated)
+ if (length + extra_need + extra_alloc > allocated)
+ allocated = 2 * allocated;
+ if (length + extra_need + extra_alloc > allocated)
abort ();
if (result == initial_result)
memory = (char *) malloc (allocated);
@@ -482,7 +487,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
grow = false;
}
/* The input is invalid in FROM_CODESET. Eat up one byte
- and emit a question mark. */
+ and emit a replacement character or a question mark. */
if (!incremented)
{
if (insize == 0)
@@ -490,8 +495,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
inptr++;
insize--;
}
- result[length] = '?';
- length++;
+ if (handler == iconveh_replacement_character)
+ {
+ /* U+FFFD in UTF-8 encoding. */
+ result[length+0] = '\357';
+ result[length+1] = '\277';
+ result[length+2] = '\275';
+ length += 3;
+ }
+ else
+ {
+ result[length] = '?';
+ length++;
+ }
}
else
goto indirectly;
@@ -594,7 +610,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
{
const bool slowly = (offsets != NULL || handler == iconveh_error);
# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
- char utf8buf[utf8bufsize + 1];
+ char utf8buf[utf8bufsize + 3];
size_t utf8len = 0;
const char *in1ptr = src;
size_t in1size = srclen;
@@ -682,8 +698,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
&& errno == EILSEQ && handler != iconveh_error)
{
/* The input is invalid in FROM_CODESET. Eat up one byte and
- emit a question mark. Room for the question mark was allocated
- at the end of utf8buf. */
+ emit a U+FFFD character or a question mark. Room for this
+ character was allocated at the end of utf8buf. */
if (!incremented1)
{
if (in1size == 0)
@@ -691,7 +707,16 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
in1ptr++;
in1size--;
}
- *out1ptr++ = '?';
+ if (handler == iconveh_replacement_character)
+ {
+ /* U+FFFD in UTF-8 encoding. */
+ out1ptr[0] = '\357';
+ out1ptr[1] = '\277';
+ out1ptr[2] = '\275';
+ out1ptr += 3;
+ }
+ else
+ *out1ptr++ = '?';
res1 = 0;
}
errno1 = errno;
@@ -756,7 +781,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
break;
else if (errno == EILSEQ && handler != iconveh_error)
{
- /* Error handling can produce up to 10 bytes of ASCII
+ /* Error handling can produce up to 10 bytes of UTF-8
output. But TO_CODESET may be UCS-2, UTF-16 or
UCS-4, so use CD2 here as well. */
char scratchbuf[10];
@@ -804,6 +829,14 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
scratchbuf[scratchlen++] = hex[uc & 15];
}
+ else if (handler == iconveh_replacement_character)
+ {
+ /* U+FFFD in UTF-8 encoding. */
+ scratchbuf[0] = '\357';
+ scratchbuf[1] = '\277';
+ scratchbuf[2] = '\275';
+ scratchlen = 3;
+ }
else
{
scratchbuf[0] = '?';
@@ -813,9 +846,24 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
inptr = scratchbuf;
insize = scratchlen;
if (cd2 != (iconv_t)(-1))
- res = iconv (cd2,
- (ICONV_CONST char **) &inptr, &insize,
- &out2ptr, &out2size);
+ {
+ res = iconv (cd2,
+ (ICONV_CONST char **) &inptr, &insize,
+ &out2ptr, &out2size);
+ if (handler == iconveh_replacement_character
+ && res == (size_t)(-1) && errno == EILSEQ)
+ {
+ /* U+FFFD can't be converted to TO_CODESET.
+ Use '?' instead. */
+ scratchbuf[0] = '?';
+ scratchlen = 1;
+ inptr = scratchbuf;
+ insize = scratchlen;
+ res = iconv (cd2,
+ (ICONV_CONST char **) &inptr, &insize,
+ &out2ptr, &out2size);
+ }
+ }
else
{
/* TO_CODESET is UTF-8. */
diff --git a/tests/test-striconveh.c b/tests/test-striconveh.c
index 438b7b087..781aa5254 100644
--- a/tests/test-striconveh.c
+++ b/tests/test-striconveh.c
@@ -46,14 +46,19 @@ main ()
{
#if HAVE_ICONV
static enum iconv_ilseq_handler handlers[] =
- { iconveh_error, iconveh_question_mark, iconveh_escape_sequence };
+ {
+ iconveh_error,
+ iconveh_question_mark,
+ iconveh_replacement_character,
+ iconveh_escape_sequence
+ };
size_t indirect;
size_t h;
size_t o;
size_t i;
/* Assume that iconv() supports at least the encodings ASCII, ISO-8859-1,
- ISO-8859-2, and UTF-8. */
+ ISO-8859-2, UTF-8, and with libiconv or glibc also GB18030. */
iconv_t cd_ascii_to_88591 = iconv_open ("ISO-8859-1", "ASCII");
iconv_t cd_88591_to_88592 = iconv_open ("ISO-8859-2", "ISO-8859-1");
iconv_t cd_88592_to_88591 = iconv_open ("ISO-8859-1", "ISO-8859-2");
@@ -63,6 +68,12 @@ main ()
iconv_t cd_88592_to_utf8 = iconv_open ("UTF-8", "ISO-8859-2");
iconv_t cd_utf8_to_88592 = iconv_open ("ISO-8859-2", "UTF-8");
iconv_t cd_utf7_to_utf8 = iconv_open ("UTF-8", "UTF-7");
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+ iconv_t cd_ascii_to_gb18030 = iconv_open ("GB18030", "ASCII");
+ iconv_t cd_utf8_to_gb18030 = iconv_open ("GB18030", "UTF-8");
+ iconv_t cd_88591_to_gb18030 = iconv_open ("GB18030", "ISO-8859-1");
+ iconv_t cd_utf7_to_gb18030 = iconv_open ("GB18030", "UTF-7");
+# endif
iconveh_t cdeh_ascii_to_88591;
iconveh_t cdeh_ascii_to_88591_indirectly;
iconveh_t cdeh_88592_to_88591;
@@ -71,12 +82,21 @@ main ()
iconveh_t cdeh_88591_to_utf8;
iconveh_t cdeh_utf8_to_88591;
iconveh_t cdeh_utf7_to_utf8;
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+ iconveh_t cdeh_ascii_to_gb18030;
+ iconveh_t cdeh_88591_to_gb18030;
+ iconveh_t cdeh_utf7_to_gb18030;
+# endif
ASSERT (cd_ascii_to_utf8 != (iconv_t)(-1));
ASSERT (cd_88591_to_utf8 != (iconv_t)(-1));
ASSERT (cd_utf8_to_88591 != (iconv_t)(-1));
ASSERT (cd_88592_to_utf8 != (iconv_t)(-1));
ASSERT (cd_utf8_to_88592 != (iconv_t)(-1));
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+ ASSERT (cd_ascii_to_gb18030 != (iconv_t)(-1));
+ ASSERT (cd_utf8_to_gb18030 != (iconv_t)(-1));
+# endif
cdeh_ascii_to_88591.cd = cd_ascii_to_88591;
cdeh_ascii_to_88591.cd1 = cd_ascii_to_utf8;
@@ -110,6 +130,20 @@ main ()
cdeh_utf7_to_utf8.cd1 = cd_utf7_to_utf8;
cdeh_utf7_to_utf8.cd2 = (iconv_t)(-1);
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+ cdeh_ascii_to_gb18030.cd = cd_ascii_to_gb18030;
+ cdeh_ascii_to_gb18030.cd1 = cd_ascii_to_utf8;
+ cdeh_ascii_to_gb18030.cd2 = cd_utf8_to_gb18030;
+
+ cdeh_88591_to_gb18030.cd = cd_88591_to_gb18030;
+ cdeh_88591_to_gb18030.cd1 = cd_88591_to_utf8;
+ cdeh_88591_to_gb18030.cd2 = cd_utf8_to_gb18030;
+
+ cdeh_utf7_to_gb18030.cd = cd_utf7_to_gb18030;
+ cdeh_utf7_to_gb18030.cd1 = cd_utf7_to_utf8;
+ cdeh_utf7_to_gb18030.cd2 = cd_utf8_to_gb18030;
+# endif
+
/* ------------------------ Test mem_cd_iconveh() ------------------------ */
/* Test conversion from ISO-8859-2 to ISO-8859-1 with no errors. */
@@ -175,6 +209,7 @@ main ()
free (offsets);
break;
case iconveh_question_mark:
+ case iconveh_replacement_character:
case iconveh_escape_sequence:
{
static const char expected[] = "Rafa? Maszkowski";
@@ -224,6 +259,7 @@ main ()
free (offsets);
break;
case iconveh_question_mark:
+ case iconveh_replacement_character:
{
static const char expected[] = "Rafa? Maszkowski";
ASSERT (retval == 0);
@@ -294,6 +330,41 @@ main ()
}
}
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+ /* Test conversion from ISO-8859-1 to GB18030 with no errors. */
+ for (h = 0; h < SIZEOF (handlers); h++)
+ {
+ enum iconv_ilseq_handler handler = handlers[h];
+ static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
+ static const char expected[] = "\2010\2072rger mit b\2010\2132sen B\250\271bchen ohne Augenma\2010\2118";
+ for (o = 0; o < 2; o++)
+ {
+ size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
+ char *result = NULL;
+ size_t length = 0;
+ int retval = mem_cd_iconveh (input, strlen (input),
+ &cdeh_88591_to_gb18030,
+ handler,
+ offsets,
+ &result, &length);
+ ASSERT (retval == 0);
+ ASSERT (length == strlen (expected));
+ ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
+ if (o)
+ {
+ for (i = 0; i < 37; i++)
+ ASSERT (offsets[i] == (i < 1 ? i :
+ i < 12 ? i + 3 :
+ i < 18 ? i + 6 :
+ i + 7));
+ ASSERT (offsets[37] == MAGIC);
+ free (offsets);
+ }
+ free (result);
+ }
+ }
+# endif
+
/* Test conversion from UTF-8 to ISO-8859-1 with no errors. */
for (h = 0; h < SIZEOF (handlers); h++)
{
@@ -371,10 +442,88 @@ main ()
free (result);
}
break;
+ case iconveh_replacement_character:
+ {
+ static const char expected[] = "Rafa\357\277\275 Maszkowski";
+ ASSERT (retval == 0);
+ ASSERT (length == strlen (expected));
+ ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
+ if (o)
+ {
+ for (i = 0; i < 16; i++)
+ ASSERT (offsets[i] == (i < 5 ? i : i + 2));
+ ASSERT (offsets[16] == MAGIC);
+ free (offsets);
+ }
+ free (result);
+ }
+ break;
}
}
}
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+ /* Test conversion from ASCII to GB18030 with invalid input (EILSEQ). */
+ for (h = 0; h < SIZEOF (handlers); h++)
+ {
+ enum iconv_ilseq_handler handler = handlers[h];
+ static const char input[] = "Rafa\263 Maszkowski"; /* Rafa? Maszkowski */
+ for (o = 0; o < 2; o++)
+ {
+ size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
+ char *result = NULL;
+ size_t length = 0;
+ int retval = mem_cd_iconveh (input, strlen (input),
+ &cdeh_ascii_to_gb18030,
+ handler,
+ offsets,
+ &result, &length);
+ switch (handler)
+ {
+ case iconveh_error:
+ ASSERT (retval == -1 && errno == EILSEQ);
+ ASSERT (result == NULL);
+ if (o)
+ free (offsets);
+ break;
+ case iconveh_question_mark:
+ case iconveh_escape_sequence:
+ {
+ static const char expected[] = "Rafa? Maszkowski";
+ ASSERT (retval == 0);
+ ASSERT (length == strlen (expected));
+ ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
+ if (o)
+ {
+ for (i = 0; i < 16; i++)
+ ASSERT (offsets[i] == i);
+ ASSERT (offsets[16] == MAGIC);
+ free (offsets);
+ }
+ free (result);
+ }
+ break;
+ case iconveh_replacement_character:
+ {
+ static const char expected[] = "Rafa\2041\2447 Maszkowski";
+ ASSERT (retval == 0);
+ ASSERT (length == strlen (expected));
+ ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
+ if (o)
+ {
+ for (i = 0; i < 16; i++)
+ ASSERT (offsets[i] == (i < 5 ? i : i + 3));
+ ASSERT (offsets[16] == MAGIC);
+ free (offsets);
+ }
+ free (result);
+ }
+ break;
+ }
+ }
+ }
+# endif
+
/* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ. */
for (h = 0; h < SIZEOF (handlers); h++)
{
@@ -399,6 +548,7 @@ main ()
free (offsets);
break;
case iconveh_question_mark:
+ case iconveh_replacement_character:
{
static const char expected[] = "Rafa? Maszkowski";
ASSERT (retval == 0);
@@ -496,6 +646,34 @@ main ()
free (result);
}
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+ /* Test conversion from UTF-7 to GB18030 with EINVAL. */
+ for (h = 0; h < SIZEOF (handlers); h++)
+ {
+ enum iconv_ilseq_handler handler = handlers[h];
+ /* This is base64 encoded 0x54 0x32 0xD8 0x3F 0xD8 0x40. It would
+ convert to U+5432 U+D83F U+D840 but these are Unicode surrogates. */
+ static const char input[] = "+VDLYP9hA";
+ static const char expected1[] = "\337\305"; /* 吲 glibc */
+ static const char expected2[] = ""; /* libiconv */
+ char *result = NULL;
+ size_t length = 0;
+ int retval = mem_cd_iconveh (input, 7,
+ &cdeh_utf7_to_gb18030,
+ handler,
+ NULL,
+ &result, &length);
+ ASSERT (retval == 0);
+ ASSERT (length == strlen (expected1) || length == strlen (expected2));
+ ASSERT (result != NULL);
+ if (length == strlen (expected1))
+ ASSERT (memcmp (result, expected1, strlen (expected1)) == 0);
+ else
+ ASSERT (memcmp (result, expected2, strlen (expected2)) == 0);
+ free (result);
+ }
+# endif
+
/* Disabled on NetBSD, because NetBSD 5.0 iconv() is buggy: it converts
the input "+2D/YQNhB" to U+1FED8 U+3FD8 U+40D8. */
# if !(defined __NetBSD__ && !defined _LIBICONV_VERSION)
@@ -544,8 +722,98 @@ main ()
free (result);
}
break;
+ case iconveh_replacement_character:
+ {
+ /* glibc result */
+ static const char expected1[] = "\357\277\275\357\277\275\357\277\275\357\277\275\357\277\275";
+ /* libiconv <= 1.12 result */
+ static const char expected2[] = "\357\277\2752D/YQNhB";
+ /* libiconv >= 1.13 result */
+ static const char expected3[] = "\357\277\275\340\277\266\341\200\266";
+ ASSERT (retval == 0);
+ ASSERT (length == strlen (expected1)
+ || length == strlen (expected2)
+ || length == strlen (expected3));
+ ASSERT (result != NULL);
+ if (length == strlen (expected1))
+ ASSERT (memcmp (result, expected1, strlen (expected1)) == 0);
+ else if (length == strlen (expected2))
+ ASSERT (memcmp (result, expected2, strlen (expected2)) == 0);
+ else
+ ASSERT (memcmp (result, expected3, strlen (expected3)) == 0);
+ free (result);
+ }
+ }
+ }
+
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+ /* Test conversion from UTF-7 to GB18030 with EILSEQ. */
+ for (h = 0; h < SIZEOF (handlers); h++)
+ {
+ enum iconv_ilseq_handler handler = handlers[h];
+ /* This is base64 encoded 0xD8 0x3F 0xD8 0x40 0xD8 0x41. It would
+ convert to U+D83F U+D840 U+D841 but these are Unicode surrogates. */
+ static const char input[] = "+2D/YQNhB";
+ char *result = NULL;
+ size_t length = 0;
+ int retval = mem_cd_iconveh (input, strlen (input),
+ &cdeh_utf7_to_gb18030,
+ handler,
+ NULL,
+ &result, &length);
+ switch (handler)
+ {
+ case iconveh_error:
+ ASSERT (retval == -1 && errno == EILSEQ);
+ ASSERT (result == NULL);
+ break;
+ case iconveh_question_mark:
+ case iconveh_escape_sequence:
+ {
+ /* glibc result */
+ static const char expected1[] = "?????";
+ /* libiconv <= 1.12 result */
+ static const char expected2[] = "?2D/YQNhB";
+ /* libiconv behaviour changed in version 1.13: the result is
+ '?' U+0FF6 U+1036; this is U+D83F U+D840 U+D841 shifted left
+ by 6 bits. */
+ static const char expected3[] = "?\2013\2030\2013\2114";
+ ASSERT (retval == 0);
+ ASSERT (length == strlen (expected1)
+ || length == strlen (expected2)
+ || length == strlen (expected3));
+ ASSERT (result != NULL);
+ if (length == strlen (expected1))
+ ASSERT (memcmp (result, expected1, strlen (expected1)) == 0);
+ else if (length == strlen (expected2))
+ ASSERT (memcmp (result, expected2, strlen (expected2)) == 0
+ || memcmp (result, expected3, strlen (expected3)) == 0);
+ free (result);
+ }
+ break;
+ case iconveh_replacement_character:
+ {
+ /* glibc result */
+ static const char expected1[] = "\2041\2447\2041\2447\2041\2447\2041\2447\2041\2447";
+ /* libiconv <= 1.12 result */
+ static const char expected2[] = "\2041\24472D/YQNhB";
+ /* libiconv >= 1.13 result */
+ static const char expected3[] = "\2041\2447\2013\2030\2013\2114";
+ ASSERT (retval == 0);
+ ASSERT (length == strlen (expected1)
+ || length == strlen (expected2)
+ || length == strlen (expected3));
+ ASSERT (result != NULL);
+ if (length == strlen (expected1))
+ ASSERT (memcmp (result, expected1, strlen (expected1)) == 0);
+ else if (length == strlen (expected2))
+ ASSERT (memcmp (result, expected2, strlen (expected2)) == 0
+ || memcmp (result, expected3, strlen (expected3)) == 0);
+ free (result);
+ }
}
}
+# endif
# endif
# endif
}
@@ -589,6 +857,7 @@ main ()
ASSERT (result == NULL && errno == EILSEQ);
break;
case iconveh_question_mark:
+ case iconveh_replacement_character:
case iconveh_escape_sequence:
{
static const char expected[] = "Rafa? Maszkowski";
@@ -619,6 +888,7 @@ main ()
ASSERT (result == NULL && errno == EILSEQ);
break;
case iconveh_question_mark:
+ case iconveh_replacement_character:
{
static const char expected[] = "Rafa? Maszkowski";
ASSERT (result != NULL);
@@ -652,6 +922,22 @@ main ()
free (result);
}
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+ /* Test conversion from ISO-8859-1 to GB18030 with no errors. */
+ for (h = 0; h < SIZEOF (handlers); h++)
+ {
+ enum iconv_ilseq_handler handler = handlers[h];
+ static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
+ static const char expected[] = "\2010\2072rger mit b\2010\2132sen B\250\271bchen ohne Augenma\2010\2118";
+ char *result = str_cd_iconveh (input,
+ &cdeh_88591_to_gb18030,
+ handler);
+ ASSERT (result != NULL);
+ ASSERT (strcmp (result, expected) == 0);
+ free (result);
+ }
+# endif
+
/* Test conversion from UTF-8 to ISO-8859-1 with no errors. */
for (h = 0; h < SIZEOF (handlers); h++)
{
@@ -688,8 +974,51 @@ main ()
free (result);
}
break;
+ case iconveh_replacement_character:
+ {
+ static const char expected[] = "Rafa\357\277\275 Maszkowski";
+ ASSERT (result != NULL);
+ ASSERT (strcmp (result, expected) == 0);
+ free (result);
+ }
+ break;
+ }
+ }
+
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+ /* Test conversion from ASCII to GB18030 with invalid input (EILSEQ). */
+ for (h = 0; h < SIZEOF (handlers); h++)
+ {
+ enum iconv_ilseq_handler handler = handlers[h];
+ static const char input[] = "Rafa\263 Maszkowski"; /* Rafa? Maszkowski */
+ char *result = str_cd_iconveh (input,
+ &cdeh_ascii_to_gb18030,
+ handler);
+ switch (handler)
+ {
+ case iconveh_error:
+ ASSERT (result == NULL && errno == EILSEQ);
+ break;
+ case iconveh_question_mark:
+ case iconveh_escape_sequence:
+ {
+ static const char expected[] = "Rafa? Maszkowski";
+ ASSERT (result != NULL);
+ ASSERT (strcmp (result, expected) == 0);
+ free (result);
+ }
+ break;
+ case iconveh_replacement_character:
+ {
+ static const char expected[] = "Rafa\2041\2447 Maszkowski";
+ ASSERT (result != NULL);
+ ASSERT (strcmp (result, expected) == 0);
+ free (result);
+ }
+ break;
}
}
+# endif
/* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ. */
for (h = 0; h < SIZEOF (handlers); h++)
@@ -705,6 +1034,7 @@ main ()
ASSERT (result == NULL && errno == EILSEQ);
break;
case iconveh_question_mark:
+ case iconveh_replacement_character:
{
static const char expected[] = "Costs: 27 ?";
ASSERT (result != NULL);
@@ -801,6 +1131,7 @@ main ()
free (offsets);
break;
case iconveh_question_mark:
+ case iconveh_replacement_character:
{
static const char expected[] = "Rafa? Maszkowski";
ASSERT (retval == 0);
@@ -870,6 +1201,41 @@ main ()
}
}
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+ /* Test conversion from ISO-8859-1 to GB18030 with no errors. */
+ for (h = 0; h < SIZEOF (handlers); h++)
+ {
+ enum iconv_ilseq_handler handler = handlers[h];
+ static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
+ static const char expected[] = "\2010\2072rger mit b\2010\2132sen B\250\271bchen ohne Augenma\2010\2118";
+ for (o = 0; o < 2; o++)
+ {
+ size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
+ char *result = NULL;
+ size_t length = 0;
+ int retval = mem_iconveh (input, strlen (input),
+ "ISO-8859-1", "GB18030",
+ handler,
+ offsets,
+ &result, &length);
+ ASSERT (retval == 0);
+ ASSERT (length == strlen (expected));
+ ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
+ if (o)
+ {
+ for (i = 0; i < 37; i++)
+ ASSERT (offsets[i] == (i < 1 ? i :
+ i < 12 ? i + 3 :
+ i < 18 ? i + 6 :
+ i + 7));
+ ASSERT (offsets[37] == MAGIC);
+ free (offsets);
+ }
+ free (result);
+ }
+ }
+# endif
+
/* Test conversion from UTF-8 to ISO-8859-1 with no errors. */
for (h = 0; h < SIZEOF (handlers); h++)
{
@@ -931,6 +1297,7 @@ main ()
free (offsets);
break;
case iconveh_question_mark:
+ case iconveh_replacement_character:
{
static const char expected[] = "Rafa? Maszkowski";
ASSERT (retval == 0);
@@ -1023,6 +1390,7 @@ main ()
ASSERT (result == NULL && errno == EILSEQ);
break;
case iconveh_question_mark:
+ case iconveh_replacement_character:
{
static const char expected[] = "Rafa? Maszkowski";
ASSERT (result != NULL);
@@ -1053,6 +1421,20 @@ main ()
free (result);
}
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+ /* Test conversion from ISO-8859-1 to GB18030 with no errors. */
+ for (h = 0; h < SIZEOF (handlers); h++)
+ {
+ enum iconv_ilseq_handler handler = handlers[h];
+ static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
+ static const char expected[] = "\2010\2072rger mit b\2010\2132sen B\250\271bchen ohne Augenma\2010\2118";
+ char *result = str_iconveh (input, "ISO-8859-1", "GB18030", handler);
+ ASSERT (result != NULL);
+ ASSERT (strcmp (result, expected) == 0);
+ free (result);
+ }
+# endif
+
/* Test conversion from UTF-8 to ISO-8859-1 with no errors. */
for (h = 0; h < SIZEOF (handlers); h++)
{
@@ -1077,6 +1459,7 @@ main ()
ASSERT (result == NULL && errno == EILSEQ);
break;
case iconveh_question_mark:
+ case iconveh_replacement_character:
{
static const char expected[] = "Costs: 27 ?";
ASSERT (result != NULL);
- Re: [striconveh] Error handling and Unicode replacement character,
Bruno Haible <=