[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [striconveh] Error handling and Unicode replacement character
From: Marc Nieper-Wißkirchen
Subject: Re: [striconveh] Error handling and Unicode replacement character
Date: Wed, 5 Jan 2022 11:06:04 +0100
Dear Bruno,
Thank you for responding so quickly and for this addition!
Marc
Am Sa., 1. Jan. 2022 um 19:55 Uhr schrieb Bruno Haible <bruno@clisp.org>:
>
> Marc Nieper-Wißkirchen wrote on 2021-12-30:
> > The striconveh module and related modules offer an error handler
> > argument. The current possible values are:
> >
> > iconveh_error
> > iconveh_question_mark
> > iconveh_escape_sequence
> >
> > The second option replaces any unconvertible character with a question mark
> > "?".
> >
> > I would like to request the addition of a fourth option, say,
> > iconveh_replacement_character, which is like iconveh_question_mark but
> > uses U+FFFD whenever the target codeset is a Unicode codeset.
>
> That's a good suggestion, as nowadays people are frequently converting
> to UTF-8 or GB18030. Implemented as follows.
>
>
> 2022-01-01 Bruno Haible <bruno@clisp.org>
>
> striconveh: Support an error handler that produces a Unicode U+FFFD.
> Suggested by Marc Nieper-Wißkirchen in
> <https://lists.gnu.org/archive/html/bug-gnulib/2021-12/msg00175.html>.
> * lib/iconveh.h (iconveh_replacement_character): New enum value.
> * lib/striconveh.c (mem_cd_iconveh_internal): When the handler is
> iconveh_replacement_character, try to produce U+FFFD when possible,
> instead of '?'.
> * tests/test-striconveh.c (main): Add GB18030 tests. Test also
> iconveh_replacement_character.
>
> diff --git a/lib/iconveh.h b/lib/iconveh.h
> index d321d34cb..058f68ca2 100644
> --- a/lib/iconveh.h
> +++ b/lib/iconveh.h
> @@ -29,7 +29,10 @@ enum iconv_ilseq_handler
> {
> iconveh_error, /* return and set errno = EILSEQ */
> iconveh_question_mark, /* use one '?' per unconvertible character */
> - iconveh_escape_sequence /* use escape sequence \uxxxx or \Uxxxxxxxx
> */
> + iconveh_escape_sequence, /* use escape sequence \uxxxx or \Uxxxxxxxx
> */
> + iconveh_replacement_character /* use one U+FFFD per unconvertible character
> + if that fits in the target encoding,
> + otherwise one '?' */
> };
>
>
> diff --git a/lib/striconveh.c b/lib/striconveh.c
> index 4aa8a2f07..612c38c3e 100644
> --- a/lib/striconveh.c
> +++ b/lib/striconveh.c
> @@ -457,13 +457,18 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
> if (cd2 == (iconv_t)(-1))
> {
> /* TO_CODESET is UTF-8. */
> - /* Error handling can produce up to 1 byte of output. */
> - if (length + 1 + extra_alloc > allocated)
> + /* Error handling can produce up to 1 or 3 bytes of
> + output. */
> + size_t extra_need =
> + (handler == iconveh_replacement_character ? 3 : 1);
> + if (length + extra_need + extra_alloc > allocated)
> {
> char *memory;
>
> allocated = 2 * allocated;
> - if (length + 1 + extra_alloc > allocated)
> + if (length + extra_need + extra_alloc > allocated)
> + allocated = 2 * allocated;
> + if (length + extra_need + extra_alloc > allocated)
> abort ();
> if (result == initial_result)
> memory = (char *) malloc (allocated);
> @@ -482,7 +487,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
> grow = false;
> }
> /* The input is invalid in FROM_CODESET. Eat up one byte
> - and emit a question mark. */
> + and emit a replacement character or a question mark.
> */
> if (!incremented)
> {
> if (insize == 0)
> @@ -490,8 +495,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
> inptr++;
> insize--;
> }
> - result[length] = '?';
> - length++;
> + if (handler == iconveh_replacement_character)
> + {
> + /* U+FFFD in UTF-8 encoding. */
> + result[length+0] = '\357';
> + result[length+1] = '\277';
> + result[length+2] = '\275';
> + length += 3;
> + }
> + else
> + {
> + result[length] = '?';
> + length++;
> + }
> }
> else
> goto indirectly;
> @@ -594,7 +610,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
> {
> const bool slowly = (offsets != NULL || handler == iconveh_error);
> # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
> - char utf8buf[utf8bufsize + 1];
> + char utf8buf[utf8bufsize + 3];
> size_t utf8len = 0;
> const char *in1ptr = src;
> size_t in1size = srclen;
> @@ -682,8 +698,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
> && errno == EILSEQ && handler != iconveh_error)
> {
> /* The input is invalid in FROM_CODESET. Eat up one byte and
> - emit a question mark. Room for the question mark was
> allocated
> - at the end of utf8buf. */
> + emit a U+FFFD character or a question mark. Room for this
> + character was allocated at the end of utf8buf. */
> if (!incremented1)
> {
> if (in1size == 0)
> @@ -691,7 +707,16 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
> in1ptr++;
> in1size--;
> }
> - *out1ptr++ = '?';
> + if (handler == iconveh_replacement_character)
> + {
> + /* U+FFFD in UTF-8 encoding. */
> + out1ptr[0] = '\357';
> + out1ptr[1] = '\277';
> + out1ptr[2] = '\275';
> + out1ptr += 3;
> + }
> + else
> + *out1ptr++ = '?';
> res1 = 0;
> }
> errno1 = errno;
> @@ -756,7 +781,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
> break;
> else if (errno == EILSEQ && handler != iconveh_error)
> {
> - /* Error handling can produce up to 10 bytes of ASCII
> + /* Error handling can produce up to 10 bytes of UTF-8
> output. But TO_CODESET may be UCS-2, UTF-16 or
> UCS-4, so use CD2 here as well. */
> char scratchbuf[10];
> @@ -804,6 +829,14 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
> scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
> scratchbuf[scratchlen++] = hex[uc & 15];
> }
> + else if (handler == iconveh_replacement_character)
> + {
> + /* U+FFFD in UTF-8 encoding. */
> + scratchbuf[0] = '\357';
> + scratchbuf[1] = '\277';
> + scratchbuf[2] = '\275';
> + scratchlen = 3;
> + }
> else
> {
> scratchbuf[0] = '?';
> @@ -813,9 +846,24 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
> inptr = scratchbuf;
> insize = scratchlen;
> if (cd2 != (iconv_t)(-1))
> - res = iconv (cd2,
> - (ICONV_CONST char **) &inptr, &insize,
> - &out2ptr, &out2size);
> + {
> + res = iconv (cd2,
> + (ICONV_CONST char **) &inptr,
> &insize,
> + &out2ptr, &out2size);
> + if (handler == iconveh_replacement_character
> + && res == (size_t)(-1) && errno == EILSEQ)
> + {
> + /* U+FFFD can't be converted to TO_CODESET.
> + Use '?' instead. */
> + scratchbuf[0] = '?';
> + scratchlen = 1;
> + inptr = scratchbuf;
> + insize = scratchlen;
> + res = iconv (cd2,
> + (ICONV_CONST char **) &inptr,
> &insize,
> + &out2ptr, &out2size);
> + }
> + }
> else
> {
> /* TO_CODESET is UTF-8. */
> diff --git a/tests/test-striconveh.c b/tests/test-striconveh.c
> index 438b7b087..781aa5254 100644
> --- a/tests/test-striconveh.c
> +++ b/tests/test-striconveh.c
> @@ -46,14 +46,19 @@ main ()
> {
> #if HAVE_ICONV
> static enum iconv_ilseq_handler handlers[] =
> - { iconveh_error, iconveh_question_mark, iconveh_escape_sequence };
> + {
> + iconveh_error,
> + iconveh_question_mark,
> + iconveh_replacement_character,
> + iconveh_escape_sequence
> + };
> size_t indirect;
> size_t h;
> size_t o;
> size_t i;
>
> /* Assume that iconv() supports at least the encodings ASCII, ISO-8859-1,
> - ISO-8859-2, and UTF-8. */
> + ISO-8859-2, UTF-8, and with libiconv or glibc also GB18030. */
> iconv_t cd_ascii_to_88591 = iconv_open ("ISO-8859-1", "ASCII");
> iconv_t cd_88591_to_88592 = iconv_open ("ISO-8859-2", "ISO-8859-1");
> iconv_t cd_88592_to_88591 = iconv_open ("ISO-8859-1", "ISO-8859-2");
> @@ -63,6 +68,12 @@ main ()
> iconv_t cd_88592_to_utf8 = iconv_open ("UTF-8", "ISO-8859-2");
> iconv_t cd_utf8_to_88592 = iconv_open ("ISO-8859-2", "UTF-8");
> iconv_t cd_utf7_to_utf8 = iconv_open ("UTF-8", "UTF-7");
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> + iconv_t cd_ascii_to_gb18030 = iconv_open ("GB18030", "ASCII");
> + iconv_t cd_utf8_to_gb18030 = iconv_open ("GB18030", "UTF-8");
> + iconv_t cd_88591_to_gb18030 = iconv_open ("GB18030", "ISO-8859-1");
> + iconv_t cd_utf7_to_gb18030 = iconv_open ("GB18030", "UTF-7");
> +# endif
> iconveh_t cdeh_ascii_to_88591;
> iconveh_t cdeh_ascii_to_88591_indirectly;
> iconveh_t cdeh_88592_to_88591;
> @@ -71,12 +82,21 @@ main ()
> iconveh_t cdeh_88591_to_utf8;
> iconveh_t cdeh_utf8_to_88591;
> iconveh_t cdeh_utf7_to_utf8;
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> + iconveh_t cdeh_ascii_to_gb18030;
> + iconveh_t cdeh_88591_to_gb18030;
> + iconveh_t cdeh_utf7_to_gb18030;
> +# endif
>
> ASSERT (cd_ascii_to_utf8 != (iconv_t)(-1));
> ASSERT (cd_88591_to_utf8 != (iconv_t)(-1));
> ASSERT (cd_utf8_to_88591 != (iconv_t)(-1));
> ASSERT (cd_88592_to_utf8 != (iconv_t)(-1));
> ASSERT (cd_utf8_to_88592 != (iconv_t)(-1));
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> + ASSERT (cd_ascii_to_gb18030 != (iconv_t)(-1));
> + ASSERT (cd_utf8_to_gb18030 != (iconv_t)(-1));
> +# endif
>
> cdeh_ascii_to_88591.cd = cd_ascii_to_88591;
> cdeh_ascii_to_88591.cd1 = cd_ascii_to_utf8;
> @@ -110,6 +130,20 @@ main ()
> cdeh_utf7_to_utf8.cd1 = cd_utf7_to_utf8;
> cdeh_utf7_to_utf8.cd2 = (iconv_t)(-1);
>
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> + cdeh_ascii_to_gb18030.cd = cd_ascii_to_gb18030;
> + cdeh_ascii_to_gb18030.cd1 = cd_ascii_to_utf8;
> + cdeh_ascii_to_gb18030.cd2 = cd_utf8_to_gb18030;
> +
> + cdeh_88591_to_gb18030.cd = cd_88591_to_gb18030;
> + cdeh_88591_to_gb18030.cd1 = cd_88591_to_utf8;
> + cdeh_88591_to_gb18030.cd2 = cd_utf8_to_gb18030;
> +
> + cdeh_utf7_to_gb18030.cd = cd_utf7_to_gb18030;
> + cdeh_utf7_to_gb18030.cd1 = cd_utf7_to_utf8;
> + cdeh_utf7_to_gb18030.cd2 = cd_utf8_to_gb18030;
> +# endif
> +
> /* ------------------------ Test mem_cd_iconveh() ------------------------
> */
>
> /* Test conversion from ISO-8859-2 to ISO-8859-1 with no errors. */
> @@ -175,6 +209,7 @@ main ()
> free (offsets);
> break;
> case iconveh_question_mark:
> + case iconveh_replacement_character:
> case iconveh_escape_sequence:
> {
> static const char expected[] = "Rafa? Maszkowski";
> @@ -224,6 +259,7 @@ main ()
> free (offsets);
> break;
> case iconveh_question_mark:
> + case iconveh_replacement_character:
> {
> static const char expected[] = "Rafa? Maszkowski";
> ASSERT (retval == 0);
> @@ -294,6 +330,41 @@ main ()
> }
> }
>
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> + /* Test conversion from ISO-8859-1 to GB18030 with no errors. */
> + for (h = 0; h < SIZEOF (handlers); h++)
> + {
> + enum iconv_ilseq_handler handler = handlers[h];
> + static const char input[] = "\304rger mit b\366sen B\374bchen ohne
> Augenma\337";
> + static const char expected[] = "\2010\2072rger mit b\2010\2132sen
> B\250\271bchen ohne Augenma\2010\2118";
> + for (o = 0; o < 2; o++)
> + {
> + size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
> + char *result = NULL;
> + size_t length = 0;
> + int retval = mem_cd_iconveh (input, strlen (input),
> + &cdeh_88591_to_gb18030,
> + handler,
> + offsets,
> + &result, &length);
> + ASSERT (retval == 0);
> + ASSERT (length == strlen (expected));
> + ASSERT (result != NULL && memcmp (result, expected, strlen
> (expected)) == 0);
> + if (o)
> + {
> + for (i = 0; i < 37; i++)
> + ASSERT (offsets[i] == (i < 1 ? i :
> + i < 12 ? i + 3 :
> + i < 18 ? i + 6 :
> + i + 7));
> + ASSERT (offsets[37] == MAGIC);
> + free (offsets);
> + }
> + free (result);
> + }
> + }
> +# endif
> +
> /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */
> for (h = 0; h < SIZEOF (handlers); h++)
> {
> @@ -371,10 +442,88 @@ main ()
> free (result);
> }
> break;
> + case iconveh_replacement_character:
> + {
> + static const char expected[] = "Rafa\357\277\275 Maszkowski";
> + ASSERT (retval == 0);
> + ASSERT (length == strlen (expected));
> + ASSERT (result != NULL && memcmp (result, expected, strlen
> (expected)) == 0);
> + if (o)
> + {
> + for (i = 0; i < 16; i++)
> + ASSERT (offsets[i] == (i < 5 ? i : i + 2));
> + ASSERT (offsets[16] == MAGIC);
> + free (offsets);
> + }
> + free (result);
> + }
> + break;
> }
> }
> }
>
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> + /* Test conversion from ASCII to GB18030 with invalid input (EILSEQ). */
> + for (h = 0; h < SIZEOF (handlers); h++)
> + {
> + enum iconv_ilseq_handler handler = handlers[h];
> + static const char input[] = "Rafa\263 Maszkowski"; /* Rafa? Maszkowski
> */
> + for (o = 0; o < 2; o++)
> + {
> + size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
> + char *result = NULL;
> + size_t length = 0;
> + int retval = mem_cd_iconveh (input, strlen (input),
> + &cdeh_ascii_to_gb18030,
> + handler,
> + offsets,
> + &result, &length);
> + switch (handler)
> + {
> + case iconveh_error:
> + ASSERT (retval == -1 && errno == EILSEQ);
> + ASSERT (result == NULL);
> + if (o)
> + free (offsets);
> + break;
> + case iconveh_question_mark:
> + case iconveh_escape_sequence:
> + {
> + static const char expected[] = "Rafa? Maszkowski";
> + ASSERT (retval == 0);
> + ASSERT (length == strlen (expected));
> + ASSERT (result != NULL && memcmp (result, expected, strlen
> (expected)) == 0);
> + if (o)
> + {
> + for (i = 0; i < 16; i++)
> + ASSERT (offsets[i] == i);
> + ASSERT (offsets[16] == MAGIC);
> + free (offsets);
> + }
> + free (result);
> + }
> + break;
> + case iconveh_replacement_character:
> + {
> + static const char expected[] = "Rafa\2041\2447 Maszkowski";
> + ASSERT (retval == 0);
> + ASSERT (length == strlen (expected));
> + ASSERT (result != NULL && memcmp (result, expected, strlen
> (expected)) == 0);
> + if (o)
> + {
> + for (i = 0; i < 16; i++)
> + ASSERT (offsets[i] == (i < 5 ? i : i + 3));
> + ASSERT (offsets[16] == MAGIC);
> + free (offsets);
> + }
> + free (result);
> + }
> + break;
> + }
> + }
> + }
> +# endif
> +
> /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ. */
> for (h = 0; h < SIZEOF (handlers); h++)
> {
> @@ -399,6 +548,7 @@ main ()
> free (offsets);
> break;
> case iconveh_question_mark:
> + case iconveh_replacement_character:
> {
> static const char expected[] = "Rafa? Maszkowski";
> ASSERT (retval == 0);
> @@ -496,6 +646,34 @@ main ()
> free (result);
> }
>
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> + /* Test conversion from UTF-7 to GB18030 with EINVAL. */
> + for (h = 0; h < SIZEOF (handlers); h++)
> + {
> + enum iconv_ilseq_handler handler = handlers[h];
> + /* This is base64 encoded 0x54 0x32 0xD8 0x3F 0xD8 0x40. It would
> + convert to U+5432 U+D83F U+D840 but these are Unicode
> surrogates. */
> + static const char input[] = "+VDLYP9hA";
> + static const char expected1[] = "\337\305"; /* 吲 glibc */
> + static const char expected2[] = ""; /* libiconv */
> + char *result = NULL;
> + size_t length = 0;
> + int retval = mem_cd_iconveh (input, 7,
> + &cdeh_utf7_to_gb18030,
> + handler,
> + NULL,
> + &result, &length);
> + ASSERT (retval == 0);
> + ASSERT (length == strlen (expected1) || length == strlen
> (expected2));
> + ASSERT (result != NULL);
> + if (length == strlen (expected1))
> + ASSERT (memcmp (result, expected1, strlen (expected1)) == 0);
> + else
> + ASSERT (memcmp (result, expected2, strlen (expected2)) == 0);
> + free (result);
> + }
> +# endif
> +
> /* Disabled on NetBSD, because NetBSD 5.0 iconv() is buggy: it converts
> the input "+2D/YQNhB" to U+1FED8 U+3FD8 U+40D8. */
> # if !(defined __NetBSD__ && !defined _LIBICONV_VERSION)
> @@ -544,8 +722,98 @@ main ()
> free (result);
> }
> break;
> + case iconveh_replacement_character:
> + {
> + /* glibc result */
> + static const char expected1[] =
> "\357\277\275\357\277\275\357\277\275\357\277\275\357\277\275";
> + /* libiconv <= 1.12 result */
> + static const char expected2[] = "\357\277\2752D/YQNhB";
> + /* libiconv >= 1.13 result */
> + static const char expected3[] =
> "\357\277\275\340\277\266\341\200\266";
> + ASSERT (retval == 0);
> + ASSERT (length == strlen (expected1)
> + || length == strlen (expected2)
> + || length == strlen (expected3));
> + ASSERT (result != NULL);
> + if (length == strlen (expected1))
> + ASSERT (memcmp (result, expected1, strlen (expected1)) ==
> 0);
> + else if (length == strlen (expected2))
> + ASSERT (memcmp (result, expected2, strlen (expected2)) ==
> 0);
> + else
> + ASSERT (memcmp (result, expected3, strlen (expected3)) ==
> 0);
> + free (result);
> + }
> + }
> + }
> +
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined
> __UCLIBC__)
> + /* Test conversion from UTF-7 to GB18030 with EILSEQ. */
> + for (h = 0; h < SIZEOF (handlers); h++)
> + {
> + enum iconv_ilseq_handler handler = handlers[h];
> + /* This is base64 encoded 0xD8 0x3F 0xD8 0x40 0xD8 0x41. It would
> + convert to U+D83F U+D840 U+D841 but these are Unicode
> surrogates. */
> + static const char input[] = "+2D/YQNhB";
> + char *result = NULL;
> + size_t length = 0;
> + int retval = mem_cd_iconveh (input, strlen (input),
> + &cdeh_utf7_to_gb18030,
> + handler,
> + NULL,
> + &result, &length);
> + switch (handler)
> + {
> + case iconveh_error:
> + ASSERT (retval == -1 && errno == EILSEQ);
> + ASSERT (result == NULL);
> + break;
> + case iconveh_question_mark:
> + case iconveh_escape_sequence:
> + {
> + /* glibc result */
> + static const char expected1[] = "?????";
> + /* libiconv <= 1.12 result */
> + static const char expected2[] = "?2D/YQNhB";
> + /* libiconv behaviour changed in version 1.13: the result is
> + '?' U+0FF6 U+1036; this is U+D83F U+D840 U+D841 shifted
> left
> + by 6 bits. */
> + static const char expected3[] = "?\2013\2030\2013\2114";
> + ASSERT (retval == 0);
> + ASSERT (length == strlen (expected1)
> + || length == strlen (expected2)
> + || length == strlen (expected3));
> + ASSERT (result != NULL);
> + if (length == strlen (expected1))
> + ASSERT (memcmp (result, expected1, strlen (expected1)) ==
> 0);
> + else if (length == strlen (expected2))
> + ASSERT (memcmp (result, expected2, strlen (expected2)) == 0
> + || memcmp (result, expected3, strlen (expected3))
> == 0);
> + free (result);
> + }
> + break;
> + case iconveh_replacement_character:
> + {
> + /* glibc result */
> + static const char expected1[] =
> "\2041\2447\2041\2447\2041\2447\2041\2447\2041\2447";
> + /* libiconv <= 1.12 result */
> + static const char expected2[] = "\2041\24472D/YQNhB";
> + /* libiconv >= 1.13 result */
> + static const char expected3[] =
> "\2041\2447\2013\2030\2013\2114";
> + ASSERT (retval == 0);
> + ASSERT (length == strlen (expected1)
> + || length == strlen (expected2)
> + || length == strlen (expected3));
> + ASSERT (result != NULL);
> + if (length == strlen (expected1))
> + ASSERT (memcmp (result, expected1, strlen (expected1)) ==
> 0);
> + else if (length == strlen (expected2))
> + ASSERT (memcmp (result, expected2, strlen (expected2)) == 0
> + || memcmp (result, expected3, strlen (expected3))
> == 0);
> + free (result);
> + }
> }
> }
> +# endif
> # endif
> # endif
> }
> @@ -589,6 +857,7 @@ main ()
> ASSERT (result == NULL && errno == EILSEQ);
> break;
> case iconveh_question_mark:
> + case iconveh_replacement_character:
> case iconveh_escape_sequence:
> {
> static const char expected[] = "Rafa? Maszkowski";
> @@ -619,6 +888,7 @@ main ()
> ASSERT (result == NULL && errno == EILSEQ);
> break;
> case iconveh_question_mark:
> + case iconveh_replacement_character:
> {
> static const char expected[] = "Rafa? Maszkowski";
> ASSERT (result != NULL);
> @@ -652,6 +922,22 @@ main ()
> free (result);
> }
>
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> + /* Test conversion from ISO-8859-1 to GB18030 with no errors. */
> + for (h = 0; h < SIZEOF (handlers); h++)
> + {
> + enum iconv_ilseq_handler handler = handlers[h];
> + static const char input[] = "\304rger mit b\366sen B\374bchen ohne
> Augenma\337";
> + static const char expected[] = "\2010\2072rger mit b\2010\2132sen
> B\250\271bchen ohne Augenma\2010\2118";
> + char *result = str_cd_iconveh (input,
> + &cdeh_88591_to_gb18030,
> + handler);
> + ASSERT (result != NULL);
> + ASSERT (strcmp (result, expected) == 0);
> + free (result);
> + }
> +# endif
> +
> /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */
> for (h = 0; h < SIZEOF (handlers); h++)
> {
> @@ -688,8 +974,51 @@ main ()
> free (result);
> }
> break;
> + case iconveh_replacement_character:
> + {
> + static const char expected[] = "Rafa\357\277\275 Maszkowski";
> + ASSERT (result != NULL);
> + ASSERT (strcmp (result, expected) == 0);
> + free (result);
> + }
> + break;
> + }
> + }
> +
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> + /* Test conversion from ASCII to GB18030 with invalid input (EILSEQ). */
> + for (h = 0; h < SIZEOF (handlers); h++)
> + {
> + enum iconv_ilseq_handler handler = handlers[h];
> + static const char input[] = "Rafa\263 Maszkowski"; /* Rafa? Maszkowski
> */
> + char *result = str_cd_iconveh (input,
> + &cdeh_ascii_to_gb18030,
> + handler);
> + switch (handler)
> + {
> + case iconveh_error:
> + ASSERT (result == NULL && errno == EILSEQ);
> + break;
> + case iconveh_question_mark:
> + case iconveh_escape_sequence:
> + {
> + static const char expected[] = "Rafa? Maszkowski";
> + ASSERT (result != NULL);
> + ASSERT (strcmp (result, expected) == 0);
> + free (result);
> + }
> + break;
> + case iconveh_replacement_character:
> + {
> + static const char expected[] = "Rafa\2041\2447 Maszkowski";
> + ASSERT (result != NULL);
> + ASSERT (strcmp (result, expected) == 0);
> + free (result);
> + }
> + break;
> }
> }
> +# endif
>
> /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ. */
> for (h = 0; h < SIZEOF (handlers); h++)
> @@ -705,6 +1034,7 @@ main ()
> ASSERT (result == NULL && errno == EILSEQ);
> break;
> case iconveh_question_mark:
> + case iconveh_replacement_character:
> {
> static const char expected[] = "Costs: 27 ?";
> ASSERT (result != NULL);
> @@ -801,6 +1131,7 @@ main ()
> free (offsets);
> break;
> case iconveh_question_mark:
> + case iconveh_replacement_character:
> {
> static const char expected[] = "Rafa? Maszkowski";
> ASSERT (retval == 0);
> @@ -870,6 +1201,41 @@ main ()
> }
> }
>
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> + /* Test conversion from ISO-8859-1 to GB18030 with no errors. */
> + for (h = 0; h < SIZEOF (handlers); h++)
> + {
> + enum iconv_ilseq_handler handler = handlers[h];
> + static const char input[] = "\304rger mit b\366sen B\374bchen ohne
> Augenma\337";
> + static const char expected[] = "\2010\2072rger mit b\2010\2132sen
> B\250\271bchen ohne Augenma\2010\2118";
> + for (o = 0; o < 2; o++)
> + {
> + size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
> + char *result = NULL;
> + size_t length = 0;
> + int retval = mem_iconveh (input, strlen (input),
> + "ISO-8859-1", "GB18030",
> + handler,
> + offsets,
> + &result, &length);
> + ASSERT (retval == 0);
> + ASSERT (length == strlen (expected));
> + ASSERT (result != NULL && memcmp (result, expected, strlen
> (expected)) == 0);
> + if (o)
> + {
> + for (i = 0; i < 37; i++)
> + ASSERT (offsets[i] == (i < 1 ? i :
> + i < 12 ? i + 3 :
> + i < 18 ? i + 6 :
> + i + 7));
> + ASSERT (offsets[37] == MAGIC);
> + free (offsets);
> + }
> + free (result);
> + }
> + }
> +# endif
> +
> /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */
> for (h = 0; h < SIZEOF (handlers); h++)
> {
> @@ -931,6 +1297,7 @@ main ()
> free (offsets);
> break;
> case iconveh_question_mark:
> + case iconveh_replacement_character:
> {
> static const char expected[] = "Rafa? Maszkowski";
> ASSERT (retval == 0);
> @@ -1023,6 +1390,7 @@ main ()
> ASSERT (result == NULL && errno == EILSEQ);
> break;
> case iconveh_question_mark:
> + case iconveh_replacement_character:
> {
> static const char expected[] = "Rafa? Maszkowski";
> ASSERT (result != NULL);
> @@ -1053,6 +1421,20 @@ main ()
> free (result);
> }
>
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> + /* Test conversion from ISO-8859-1 to GB18030 with no errors. */
> + for (h = 0; h < SIZEOF (handlers); h++)
> + {
> + enum iconv_ilseq_handler handler = handlers[h];
> + static const char input[] = "\304rger mit b\366sen B\374bchen ohne
> Augenma\337";
> + static const char expected[] = "\2010\2072rger mit b\2010\2132sen
> B\250\271bchen ohne Augenma\2010\2118";
> + char *result = str_iconveh (input, "ISO-8859-1", "GB18030", handler);
> + ASSERT (result != NULL);
> + ASSERT (strcmp (result, expected) == 0);
> + free (result);
> + }
> +# endif
> +
> /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */
> for (h = 0; h < SIZEOF (handlers); h++)
> {
> @@ -1077,6 +1459,7 @@ main ()
> ASSERT (result == NULL && errno == EILSEQ);
> break;
> case iconveh_question_mark:
> + case iconveh_replacement_character:
> {
> static const char expected[] = "Costs: 27 ?";
> ASSERT (result != NULL);
>
>
>