#include #include #include #include #include #include // Valid locales for setlocale: // German_Germany.1252 // Chinese_Taiwan // Chinese_China // etc. // but none supports an encoding that goes further than the BMP // (at least not in Windows XP). // So use the Win32 functions instead of the C89 functions. #define codepage 65001 // UTF-8 //#define codepage 54936 // GB18030 // unsupported in Windows XP, // despite what says #define mbrtowc my_mbrtowc static size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) { unsigned char c; size_t i; if (n == 0) return (size_t)-2; /* MultiByteToWideChar fails with ERROR_NO_UNICODE_TRANSLATION if the input is too short. We need to distinguish this case from an invalid input sequence. */ c = (unsigned char) s[0]; if (c < 0xc0) i = 1; else if (c < 0xe0) i = 2; else if (c < 0xf0) i = 3; else if (c < 0xf8) i = 4; else { errno = EILSEQ; return (size_t)-1; } if (MultiByteToWideChar (codepage, MB_ERR_INVALID_CHARS /* | MB_PRECOMPOSED does not work */, s, i, pwc, pwc != NULL)) return i; switch (GetLastError ()) { case ERROR_INSUFFICIENT_BUFFER: break; case ERROR_NO_UNICODE_TRANSLATION: errno = EILSEQ; return (size_t)-1; case ERROR_INVALID_FLAGS: case ERROR_INVALID_PARAMETER: default: fprintf (stderr, "last error: %x\n", GetLastError ()); fflush (stderr); abort (); } return (size_t)-2; } #define wcrtomb my_wcrtomb static size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps) { BOOL invalid_conversion; int bytes; if (s == NULL) { static char buf[6]; s = buf; wc = 0; } bytes = WideCharToMultiByte (codepage, 0, &wc, 1, s, 6, NULL, codepage == 65001 ? NULL : &invalid_conversion); if (bytes) { if (codepage == 65001 || !invalid_conversion) return bytes; else { errno = EILSEQ; return (size_t)-1; } } else switch (GetLastError ()) { case ERROR_NO_UNICODE_TRANSLATION: errno = EILSEQ; return (size_t)-1; case ERROR_INVALID_PARAMETER: errno = EINVAL; return (size_t)-1; case ERROR_INVALID_FLAGS: case ERROR_INSUFFICIENT_BUFFER: default: fprintf (stderr, "last error: 0x%x\n", GetLastError ()); fflush (stderr); abort (); } } int main (int argc, char*argv[]) { // U+21234; in CJK Ideograph Extension B // in UTF-16 form: 0xD844 0xDE34 #if codepage == 65001 const char buf[4] = { 0xF0, 0xA1, 0x88, 0xB4 }; // fails //const char buf[4] = { 0xE3, 0xBF, 0xA3, 0x66 }; // OK //const char buf[4] = { 0xC4, 0xB4, 0x55, 0x66 }; // OK //const char buf[4] = { 0x22, 0x33, 0x55, 0x66 }; // OK #endif #if codepage == 54936 //const char buf[4] = { 0x95, 0x35, 0xDA, 0x36 }; //const char buf[4] = { 0x82, 0x32, 0xA3, 0x38 }; //const char buf[4] = { 0x81, 0x30, 0x90, 0x36 }; const char buf[4] = { 0x22, 0x33, 0x55, 0x66 }; #endif wchar_t wbuf[4] = { 0xFFFE, 0xFFFE, 0xFFFE, 0xFFFE }; const char *inptr = buf; char rbuf[6]; size_t ret, ret1, ret2; BOOL invalid_conversion; size_t i; /* Test mbrtowc */ ret = mbrtowc (&wbuf[0], inptr, buf + sizeof (buf) - inptr, NULL); printf ("ret = %d, wbuf[0] = %x\n", (int) ret, (unsigned int) wbuf[0]); /* Test MultiByteToWideChar, converting 2 wchar_t units at once. */ if (MultiByteToWideChar (codepage, MB_ERR_INVALID_CHARS /* | MB_PRECOMPOSED does not work */, inptr, buf + sizeof (buf) - inptr, wbuf, 2)) { printf ("wbuf[0] = %x, wbuf[1] = %x\n", (unsigned int) wbuf[0], (unsigned int) wbuf[1]); /* Test wcrtomb */ ret1 = wcrtomb (rbuf, wbuf[0], NULL); printf ("ret1 = %d\n", (int) ret1); if (ret1 > 0) { printf ("rbuf ="); for (i = 0; i < ret1; i++) printf (" %02X", (unsigned char) rbuf[i]); printf ("\n"); } /* Test WideCharToMultiByte, converting 2 wchar_t units at once. */ ret2 = WideCharToMultiByte (codepage, 0, wbuf, 2, rbuf, sizeof (rbuf), NULL, codepage == 65001 ? NULL : &invalid_conversion); printf ("ret2 = %d\n", ret2); if (ret2 > 0) { printf ("rbuf ="); for (i = 0; i < ret2; i++) printf (" %02X", (unsigned char) rbuf[i]); printf ("\n"); } } return 0; } /* Result on Windows XP: ret = -1, wbuf[0] = 0 wbuf[0] = d844, wbuf[1] = de34 ret1 = 3 rbuf = ED A1 84 ret2 = 4 rbuf = F0 A1 88 B4 */