[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Eliot-dev] eliot/dic compdic.cpp encoding.cpp encoding.h h... [cppdic]
From: eliot-dev
Subject: [Eliot-dev] eliot/dic compdic.cpp encoding.cpp encoding.h h... [cppdic]
Date: Mon, 10 Dec 2007 11:56:39 +0000
CVSROOT: /cvsroot/eliot
Module name: eliot
Branch: cppdic
Changes by: Olivier Teulière <ipkiss> 07/12/10 11:56:39
Modified files:
dic : compdic.cpp encoding.cpp encoding.h header.cpp
Log message:
- Win32 version of the various conversion functions. It should fix
most problems on Windows when using a non-ASCII dictionary (to be confirmed)
- Simplified the prototype of one of the readFromUTF8 functions
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/compdic.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.14&r2=1.1.2.15
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/encoding.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.9&r2=1.1.2.10
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/encoding.h?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.4&r2=1.1.2.5
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/header.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.15&r2=1.1.2.16
Patches:
Index: compdic.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/compdic.cpp,v
retrieving revision 1.1.2.14
retrieving revision 1.1.2.15
diff -u -b -r1.1.2.14 -r1.1.2.15
--- compdic.cpp 9 Dec 2007 16:29:54 -0000 1.1.2.14
+++ compdic.cpp 10 Dec 2007 11:56:38 -0000 1.1.2.15
@@ -151,8 +151,7 @@
char buff[MAX_SIZE];
strncpy(buff, tokens[0].c_str(), MAX_SIZE);
- wstring letter;
- readFromUTF8(letter, buff, tokens[0].size(), "readLetters");
+ wstring letter = readFromUTF8(buff, tokens[0].size(), "readLetters");
if (letter.size() != 1)
{
Index: encoding.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/encoding.cpp,v
retrieving revision 1.1.2.9
retrieving revision 1.1.2.10
diff -u -b -r1.1.2.9 -r1.1.2.10
--- encoding.cpp 9 Dec 2007 16:29:55 -0000 1.1.2.9
+++ encoding.cpp 10 Dec 2007 11:56:39 -0000 1.1.2.10
@@ -36,12 +36,37 @@
#include <string.h>
#include <iconv.h>
+#ifdef WIN32
+#include <windows.h>
+#endif
+
#include "encoding.h"
#include "dic_exception.h"
using namespace std;
+#ifdef WIN32
+// Utility function to get the last system error as a string
+static string GetWin32Error()
+{
+ char *lpMsgBuf;
+ DWORD dw = GetLastError();
+ cerr << dw << endl;
+ FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+ FORMAT_MESSAGE_FROM_SYSTEM |
+ FORMAT_MESSAGE_IGNORE_INSERTS,
+ NULL, dw,
+ MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+ (LPTSTR) &lpMsgBuf,
+ 0, NULL);
+ string msg = lpMsgBuf;
+ LocalFree(lpMsgBuf);
+ return msg;
+}
+#endif
+
+
#if !HAVE_WCWIDTH
// wcwidth replacement (for win32 in particular)
// Inspired from the gnulib package, without some of the refinements
@@ -89,6 +114,10 @@
#define _MAX_SIZE_FOR_STACK_ 30
wstring convertToWc(const string& iStr)
{
+#ifdef WIN32
+ // XXX: Assume the input is in UTF-8
+ return readFromUTF8(iStr.c_str(), iStr.size(), "convertToWc");
+#else
// Get the needed length (we _can't_ use string::size())
size_t len = mbstowcs(NULL, iStr.c_str(), 0);
if (len == (size_t)-1)
@@ -110,11 +139,19 @@
delete[] tmp;
return res;
}
+#endif
}
string convertToMb(const wstring& iWStr)
{
+#ifdef WIN32
+ const unsigned int size = iWStr.size() * 4;
+ char buf[size];
+ // XXX: Assume the output is in UTF-8
+ int nb = writeInUTF8(iWStr, buf, size, "convertToMb");
+ return string(buf, nb);
+#else
// Get the needed length (we _can't_ use wstring::size())
size_t len = wcstombs(NULL, iWStr.c_str(), 0);
if (len == (size_t)-1)
@@ -136,12 +173,16 @@
delete[] tmp;
return res;
}
+#endif
}
#undef _MAX_SIZE_FOR_STACK_
string convertToMb(wchar_t iWChar)
{
+#ifdef WIN32
+ return convertToMb(wstring(1, iWChar));
+#else
char res[MB_CUR_MAX + 1];
int len = wctomb(res, iWChar);
if (len == -1)
@@ -149,6 +190,7 @@
res[len] = '\0';
return res;
+#endif
}
@@ -189,6 +231,17 @@
const char *iBuffer, unsigned int iBufSize,
const string &iContext)
{
+#ifdef WIN32
+ int res = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, iBuffer,
+ iBufSize, oString, iWideSize);
+ if (res == 0)
+ {
+ // Retrieve the system error message for the last-error code
+ throw DicException("readFromUTF8: MultiByteToWideChar failed (" +
+ iContext + "): " + GetWin32Error());
+ }
+ return res;
+#else
iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
if (handle == (iconv_t)(-1))
throw DicException("readFromUTF8: iconv_open failed");
@@ -207,11 +260,12 @@
iContext + "): " + string(strerror(errno)));
}
return iWideSize - outChars / sizeof(wchar_t);
+#endif
}
-void readFromUTF8(wstring &oString, const char *iBuffer,
- unsigned int iBufSize, const string &iContext)
+wstring readFromUTF8(const char *iBuffer, unsigned int iBufSize,
+ const string &iContext)
{
// Temporary buffer for output
// We will have at most as many characters as in the UTF-8 string
@@ -228,25 +282,39 @@
throw;
}
// Copy the string
- oString = wstring(wideBuf, number);
+ wstring res(wideBuf, number);
delete[] wideBuf;
+ return res;
}
-unsigned int writeInUTF8(const wstring &iString, char *oBuffer,
+unsigned int writeInUTF8(const wstring &iWString, char *oBuffer,
unsigned int iBufSize, const string &iContext)
{
+#ifdef WIN32
+ int res = WideCharToMultiByte(CP_UTF8, 0, iWString.c_str(), iWString.size(),
+ oBuffer, iBufSize, NULL, NULL);
+ if (res == 0)
+ {
+ DWORD dw = GetLastError();
+ cerr << dw << endl;
+ // Retrieve the system error message for the last-error code
+ throw DicException("writeInUTF8: WideCharToMultiByte failed (" +
+ iContext + "): " + GetWin32Error());
+ }
+ return res;
+#else
iconv_t handle = iconv_open("UTF-8", "WCHAR_T");
if (handle == (iconv_t)(-1))
throw DicException("writeInUTF8: iconv_open failed");
- size_t length = iString.size();
+ size_t length = iWString.size();
size_t inChars = sizeof(wchar_t) * length;
size_t outChars = iBufSize;
// Use the ICONV_CONST trick because the declaration of iconv()
// differs depending on the implementations...
// FIXME: bonus ugliness for doing 2 casts at once, and accessing string
// internals...
- ICONV_CONST char *in = (ICONV_CONST char*)(&iString[0]);
+ ICONV_CONST char *in = (ICONV_CONST char*)(&iWString[0]);
char *out = oBuffer;
size_t res = iconv(handle, &in, &inChars, &out, &outChars);
iconv_close(handle);
@@ -258,5 +326,6 @@
}
// Return the number of written bytes
return iBufSize - outChars;
+#endif
}
Index: encoding.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/encoding.h,v
retrieving revision 1.1.2.4
retrieving revision 1.1.2.5
diff -u -b -r1.1.2.4 -r1.1.2.5
--- encoding.h 9 Dec 2007 16:29:55 -0000 1.1.2.4
+++ encoding.h 10 Dec 2007 11:56:39 -0000 1.1.2.5
@@ -77,23 +77,23 @@
* Same as the other readFromUTF8 function, dealing with a wstring
* instead of a wchar_t*. Note that it performs an additional copy
* of the output string...
- * @param oString: where to write the converted string
* @param iBuffer: UTF-8 string to convert
* @param iBufSize: available size in iBuffer
* @param iContext: free text used in case of exception
+ * @return: the converted wide string
*/
-void readFromUTF8(wstring &oString, const char *iBuffer,
- unsigned int iBufSize, const string &iContext);
+wstring readFromUTF8(const char *iBuffer, unsigned int iBufSize,
+ const string &iContext);
/**
* Utility function to convert a wstring into an UTF-8 char* buffer
- * @param iString: the wide string to encode
+ * @param iWString: the wide string to encode
* @param oBuffer: where to write the encoded string
* @param iBufSize: available size in oBuffer
* @param iContext: free text used in case of exception
* @return: number of bytes actually written
*/
-unsigned int writeInUTF8(const wstring &iString, char *oBuffer,
+unsigned int writeInUTF8(const wstring &iWString, char *oBuffer,
unsigned int iBufSize, const string &iContext);
#endif
Index: header.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/header.cpp,v
retrieving revision 1.1.2.15
retrieving revision 1.1.2.16
diff -u -b -r1.1.2.15 -r1.1.2.16
--- header.cpp 9 Dec 2007 16:29:55 -0000 1.1.2.15
+++ header.cpp 10 Dec 2007 11:56:39 -0000 1.1.2.16
@@ -381,16 +381,16 @@
else
throw DicException("Header::read: unrecognized algorithm type");
- readFromUTF8(m_userHost, aHeaderExt.userHost,
- aHeaderExt.userHostSize, "user and host information");
+ m_userHost = readFromUTF8(aHeaderExt.userHost, aHeaderExt.userHostSize,
+ "user and host information");
// Convert the dictionary letters from UTF-8 to wchar_t*
- readFromUTF8(m_dicName, aHeaderExt.dicName,
- aHeaderExt.dicNameSize, "dictionary name");
+ m_dicName = readFromUTF8(aHeaderExt.dicName, aHeaderExt.dicNameSize,
+ "dictionary name");
// Convert the dictionary letters from UTF-8 to wchar_t*
- readFromUTF8(m_letters, aHeaderExt.letters,
- aHeaderExt.lettersSize, "dictionary letters");
+ m_letters = readFromUTF8(aHeaderExt.letters, aHeaderExt.lettersSize,
+ "dictionary letters");
// Safety check: correct number of letters?
if (m_letters.size() != aHeaderExt.nbLetters)
{
@@ -500,9 +500,9 @@
{
printf(_("compressed on: Unknown date (old format)\n"));
}
- printf(_("compressed using a binary compiled by: %ls\n"), m_userHost.c_str());
+ printf(_("compressed using a binary compiled by: %s\n"), convertToMb(m_userHost).c_str());
printf(_("dictionary type: %s\n"), m_type == kDAWG ? "DAWG" : "GADDAG");
- printf(_("letters: %ls\n"), m_letters.c_str());
+ printf(_("letters: %s\n"), convertToMb(m_letters).c_str());
printf(_("number of letters: %d\n"), m_letters.size());
printf(_("number of words: %d\n"), m_nbWords);
printf(_("header size: %u bytes\n"), sizeof(Dict_header_old) +
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Eliot-dev] eliot/dic compdic.cpp encoding.cpp encoding.h h... [cppdic],
eliot-dev <=