[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Eliot-dev] eliot dic/compdic.cpp dic/encoding.cpp dic/enco... [cppdic]
From: eliot-dev
Subject: [Eliot-dev] eliot dic/compdic.cpp dic/encoding.cpp dic/enco... [cppdic]
Date: Sun, 09 Dec 2007 16:29:55 +0000
CVSROOT: /cvsroot/eliot
Module name: eliot
Branch: cppdic
Changes by: Olivier Teulière <ipkiss> 07/12/09 16:29:55
Modified files:
dic : compdic.cpp encoding.cpp encoding.h header.cpp
header.h
po : POTFILES.in
Log message:
- Moved the functions converting between wstring and UTF-8 to
encoding.cpp, and used them in compdic
- POTFILES.in: removed duplicated lines
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/compdic.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.13&r2=1.1.2.14
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/encoding.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.8&r2=1.1.2.9
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/encoding.h?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.3&r2=1.1.2.4
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/header.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.14&r2=1.1.2.15
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/header.h?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.8&r2=1.1.2.9
http://cvs.savannah.gnu.org/viewcvs/eliot/po/POTFILES.in?cvsroot=eliot&only_with_tag=cppdic&r1=1.3.2.2&r2=1.3.2.3
Patches:
Index: dic/compdic.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/compdic.cpp,v
retrieving revision 1.1.2.13
retrieving revision 1.1.2.14
diff -u -b -r1.1.2.13 -r1.1.2.14
--- dic/compdic.cpp 6 Dec 2007 13:24:48 -0000 1.1.2.13
+++ dic/compdic.cpp 9 Dec 2007 16:29:54 -0000 1.1.2.14
@@ -32,7 +32,6 @@
#include <vector>
#include <map>
#include <boost/tokenizer.hpp>
-#include <iconv.h>
#include <getopt.h>
#include <time.h>
#include <sys/types.h>
@@ -77,7 +76,7 @@
#define CHECK_RECURSION
-wchar_t* load_uncompressed(const string &iFileName, unsigned int &ioDicSize)
+const wchar_t* load_uncompressed(const string &iFileName, unsigned int &ioDicSize)
{
ifstream file(iFileName.c_str());
if (!file.is_open())
@@ -89,34 +88,25 @@
file.read(&buffer.front(), ioDicSize);
file.close();
- // The words are supposed to be in utf-8, so convert everything to
- // wide characters
- iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
- if (handle == (iconv_t)(-1))
- throw DicException("load_uncompressed: Error in iconv_open");
-
// Buffer for the wide characters (it will use at most as many characters
// as the utf-8 version)
- // FIXME: not exception safe
- wchar_t *wide_buf = new wchar_t[ioDicSize];
+ wchar_t *wideBuf = new wchar_t[ioDicSize];
+ unsigned int number;
- size_t inChars = ioDicSize;
- size_t outChars = sizeof(wchar_t) * ioDicSize;
- ICONV_CONST char *in = &buffer.front();
- char *out = (char*)wide_buf;
- size_t res = iconv(handle, &in, &inChars, &out, &outChars);
- iconv_close(handle);
- // Problem during encoding conversion?
- if (res == (size_t)(-1))
+ try
{
- delete[] wide_buf;
- throw DicException("load_uncompressed: " + string(strerror(errno)));
+ number = readFromUTF8(wideBuf, ioDicSize, &buffer.front(),
+ ioDicSize, "load_uncompressed");
}
+ catch (...)
+ {
+ // Avoid leaks, and propagate the exception
+ delete[] wideBuf;
+ throw;
+ }
+ ioDicSize = number;
- // Update ioDicSize with the actual length of the wchar_t array
- ioDicSize -= outChars / sizeof(wchar_t);
-
- return wide_buf;
+ return wideBuf;
}
@@ -158,25 +148,13 @@
}
#define MAX_SIZE 4
- wchar_t letter[MAX_SIZE];
char buff[MAX_SIZE];
strncpy(buff, tokens[0].c_str(), MAX_SIZE);
- // The letter is supposed to be in utf-8, so convert it to
- // wide characters
- iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
- if (handle == (iconv_t)(-1))
- throw DicException("readLetters: Error in iconv_open");
- size_t inChars = tokens[0].size();
- size_t outChars = sizeof(wchar_t) * MAX_SIZE;
- ICONV_CONST char *in = buff;
- char *out = (char*)letter;
- size_t res = iconv(handle, &in, &inChars, &out, &outChars);
- iconv_close(handle);
- // Problem during encoding conversion?
- if (res == (size_t)(-1))
- throw DicException("readLetters: " + string(strerror(errno)));
- if (outChars != sizeof(wchar_t) * (MAX_SIZE - 1))
+ wstring letter;
+ readFromUTF8(letter, buff, tokens[0].size(), "readLetters");
+
+ if (letter.size() != 1)
{
ostringstream ss;
ss << "readLetters: Invalid letter at line " << lineNb;
@@ -298,8 +276,8 @@
wchar_t global_stringbuf[MAX_STRING_LENGTH]; /* Space for current string */
wchar_t* global_endstring; /* Marks END of current string */
-wchar_t* global_input;
-wchar_t* global_endofinput;
+const wchar_t* global_input;
+const wchar_t* global_endofinput;
#ifdef CHECK_RECURSION
map<int, vector<Dawg_edge> > global_mapfordepth;
#endif
@@ -521,7 +499,7 @@
clock_t startLoadTime = clock();
// FIXME: not exception safe
- wchar_t *uncompressed = load_uncompressed(inFileName, dicsize);
+ const wchar_t *uncompressed = load_uncompressed(inFileName, dicsize);
clock_t endLoadTime = clock();
global_input = uncompressed;
Index: dic/encoding.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/encoding.cpp,v
retrieving revision 1.1.2.8
retrieving revision 1.1.2.9
diff -u -b -r1.1.2.8 -r1.1.2.9
--- dic/encoding.cpp 6 Dec 2007 15:09:52 -0000 1.1.2.8
+++ dic/encoding.cpp 9 Dec 2007 16:29:55 -0000 1.1.2.9
@@ -32,6 +32,9 @@
#include <stdarg.h>
#include <wchar.h>
#include <wctype.h>
+#include <errno.h>
+#include <string.h>
+#include <iconv.h>
#include "encoding.h"
#include "dic_exception.h"
@@ -181,3 +184,79 @@
}
}
+
+unsigned int readFromUTF8(wchar_t *oString, unsigned int iWideSize,
+ const char *iBuffer, unsigned int iBufSize,
+ const string &iContext)
+{
+ iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
+ if (handle == (iconv_t)(-1))
+ throw DicException("readFromUTF8: iconv_open failed");
+ size_t inChars = iBufSize;
+ size_t outChars = iWideSize * sizeof(wchar_t);
+ // Use the ICONV_CONST trick because the declaration of iconv()
+ // differs depending on the implementations...
+ ICONV_CONST char *in = const_cast<ICONV_CONST char*>(iBuffer);
+ char *out = (char*)oString;
+ size_t res = iconv(handle, &in, &inChars, &out, &outChars);
+ iconv_close(handle);
+ // Problem during encoding conversion?
+ if (res == (size_t)(-1))
+ {
+ throw DicException("readFromUTF8: iconv failed (" +
+ iContext + "): " + string(strerror(errno)));
+ }
+ return iWideSize - outChars / sizeof(wchar_t);
+}
+
+
+void readFromUTF8(wstring &oString, const char *iBuffer,
+ unsigned int iBufSize, const string &iContext)
+{
+ // Temporary buffer for output
+ // We will have at most as many characters as in the UTF-8 string
+ wchar_t *wideBuf = new wchar_t[iBufSize];
+ unsigned int number;
+ try
+ {
+ number = readFromUTF8(wideBuf, iBufSize, iBuffer, iBufSize, iContext);
+ }
+ catch (...)
+ {
+ // Make sure not to leak
+ delete[] wideBuf;
+ throw;
+ }
+ // Copy the string
+ oString = wstring(wideBuf, number);
+ delete[] wideBuf;
+}
+
+
+unsigned int writeInUTF8(const wstring &iString, char *oBuffer,
+ unsigned int iBufSize, const string &iContext)
+{
+ iconv_t handle = iconv_open("UTF-8", "WCHAR_T");
+ if (handle == (iconv_t)(-1))
+ throw DicException("writeInUTF8: iconv_open failed");
+ size_t length = iString.size();
+ size_t inChars = sizeof(wchar_t) * length;
+ size_t outChars = iBufSize;
+ // Use the ICONV_CONST trick because the declaration of iconv()
+ // differs depending on the implementations...
+ // FIXME: bonus ugliness for doing 2 casts at once, and accessing string
+ // internals...
+ ICONV_CONST char *in = (ICONV_CONST char*)(&iString[0]);
+ char *out = oBuffer;
+ size_t res = iconv(handle, &in, &inChars, &out, &outChars);
+ iconv_close(handle);
+ // Problem during encoding conversion?
+ if (res == (size_t)(-1))
+ {
+ throw DicException("writeInUTF8: iconv failed (" +
+ iContext + ")" + string(strerror(errno)));
+ }
+ // Return the number of written bytes
+ return iBufSize - outChars;
+}
+
Index: dic/encoding.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/encoding.h,v
retrieving revision 1.1.2.3
retrieving revision 1.1.2.4
diff -u -b -r1.1.2.3 -r1.1.2.4
--- dic/encoding.h 4 Dec 2007 11:09:59 -0000 1.1.2.3
+++ dic/encoding.h 9 Dec 2007 16:29:55 -0000 1.1.2.4
@@ -59,5 +59,42 @@
string padAndConvert(const wstring &iWstr, unsigned int iLength,
bool iLeftPad = true, char c = ' ');
+/**
+ * Utility function to convert a char* buffer encoded in UTF-8 into a
+ * wchar_t* string
+ * @param oString: where to write the converted string
+ * @param iWideSize: size available in oString (number of wchar_t)
+ * @param iBuffer: UTF-8 string to convert
+ * @param iBufSize: available size in iBuffer
+ * @param iContext: free text used in case of exception
+ * @return: number of wide chars actually written
+ */
+unsigned int readFromUTF8(wchar_t *oString, unsigned int iWideSize,
+ const char *iBuffer, unsigned int iBufSize,
+ const string &iContext);
+
+/**
+ * Same as the other readFromUTF8 function, dealing with a wstring
+ * instead of a wchar_t*. Note that it performs an additional copy
+ * of the output string...
+ * @param oString: where to write the converted string
+ * @param iBuffer: UTF-8 string to convert
+ * @param iBufSize: available size in iBuffer
+ * @param iContext: free text used in case of exception
+ */
+void readFromUTF8(wstring &oString, const char *iBuffer,
+ unsigned int iBufSize, const string &iContext);
+
+/**
+ * Utility function to convert a wstring into an UTF-8 char* buffer
+ * @param iString: the wide string to encode
+ * @param oBuffer: where to write the encoded string
+ * @param iBufSize: available size in oBuffer
+ * @param iContext: free text used in case of exception
+ * @return: number of bytes actually written
+ */
+unsigned int writeInUTF8(const wstring &iString, char *oBuffer,
+ unsigned int iBufSize, const string &iContext);
+
#endif
Index: dic/header.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/header.cpp,v
retrieving revision 1.1.2.14
retrieving revision 1.1.2.15
diff -u -b -r1.1.2.14 -r1.1.2.15
--- dic/header.cpp 6 Dec 2007 13:24:48 -0000 1.1.2.14
+++ dic/header.cpp 9 Dec 2007 16:29:55 -0000 1.1.2.15
@@ -22,9 +22,6 @@
#include <string>
#include <sstream>
#include <iostream>
-#include <iconv.h>
-#include <errno.h>
-#include <string.h>
// For ntohl & Co.
#ifdef WIN32
@@ -490,62 +487,6 @@
}
-void Header::readFromUTF8(wstring &oString, const char *iBuffer,
- unsigned int iBufSize, const string &iContext) const
-{
- iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
- if (handle == (iconv_t)(-1))
- throw DicException("Header::readFromUTF8: iconv_open failed");
- // Temporary buffer for output
- // We will have at most as many characters as in the UTF-8 string
- wchar_t outbuf[iBufSize];
- size_t inChars = iBufSize;
- size_t outChars = sizeof(wchar_t) * iBufSize;
- // Use the ICONV_CONST trick because the declaration of iconv()
- // differs depending on the implementations...
- ICONV_CONST char *in = const_cast<ICONV_CONST char*>(iBuffer);
- char *out = (char*)outbuf;
- size_t res = iconv(handle, &in, &inChars, &out, &outChars);
- iconv_close(handle);
- // Problem during encoding conversion?
- if (res == (size_t)(-1))
- {
- throw DicException("Header::readFromUTF8: iconv failed (" +
- iContext + "): " + string(strerror(errno)));
- }
- // We finally have the letters as a wstring!
- oString = wstring(outbuf, iBufSize - outChars / sizeof(wchar_t));
-}
-
-
-unsigned int Header::writeInUTF8(const wstring &iString, char *oBuffer,
- unsigned int iBufSize, const string &iContext) const
-{
- iconv_t handle = iconv_open("UTF-8", "WCHAR_T");
- if (handle == (iconv_t)(-1))
- throw DicException("Header::writeInUTF8: iconv_open failed");
- size_t length = iString.size();
- size_t inChars = sizeof(wchar_t) * length;
- size_t outChars = iBufSize;
- // Use the ICONV_CONST trick because the declaration of iconv()
- // differs depending on the implementations...
- // FIXME: bonus ugliness for doing 2 casts at once, and accessing string
- // internals...
- ICONV_CONST char *in = (ICONV_CONST char*)(&iString[0]);
- char *out = oBuffer;
- size_t res = iconv(handle, &in, &inChars, &out, &outChars);
- iconv_close(handle);
- // Problem during encoding conversion?
- if (res == (size_t)(-1))
- {
- throw DicException("Header::writeInUTF8: iconv failed (" +
- iContext + ")");
- }
- // Return the number of written bytes
- return iBufSize - outChars;
-}
-
-
void Header::print() const
{
printf(_("dictionary name: %s\n"), convertToMb(m_dicName).c_str());
Index: dic/header.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/header.h,v
retrieving revision 1.1.2.8
retrieving revision 1.1.2.9
diff -u -b -r1.1.2.8 -r1.1.2.9
--- dic/header.h 4 Dec 2007 11:09:59 -0000 1.1.2.8
+++ dic/header.h 9 Dec 2007 16:29:55 -0000 1.1.2.9
@@ -177,28 +177,6 @@
/** Build m_mapCodeFromChar */
void buildMapCodeFromChar();
-
- /**
- * Utility function to convert a char* buffer encoded in UTF-8 into a
- * wide wstring
- * @param oString: where to write the converted string
- * @param iBuffer: UTF-8 string to convert
- * @param iBufSize: available size in iBuffer
- * @param iContext: free text used in case of exception
- */
- void readFromUTF8(wstring &oString, const char *iBuffer,
- unsigned int iBufSize, const string &iContext) const;
-
- /**
- * Utility function to convert a wstring into an UTF-8 char* buffer
- * @param iString: the wide string to encode
- * @param oBuffer: where to write the encoded string
- * @param iBufSize: available size in oBuffer
- * @param iContext: free text used in case of exception
- * @return: number of bytes actually written
- */
- unsigned int writeInUTF8(const wstring &iString, char *oBuffer,
- unsigned int iBufSize, const string &iContext) const;
};
Index: po/POTFILES.in
===================================================================
RCS file: /cvsroot/eliot/eliot/po/POTFILES.in,v
retrieving revision 1.3.2.2
retrieving revision 1.3.2.3
diff -u -b -r1.3.2.2 -r1.3.2.3
--- po/POTFILES.in 8 Dec 2007 13:56:10 -0000 1.3.2.2
+++ po/POTFILES.in 9 Dec 2007 16:29:55 -0000 1.3.2.3
@@ -1,6 +1,5 @@
./dic/automaton.cpp
./dic/automaton.h
-./dic/compdic.cpp
./dic/header.cpp
./dic/header.h
./dic/dic.cpp
@@ -9,7 +8,6 @@
./dic/dic_search.cpp
./dic/hashtable.cpp
./dic/hashtable.h
-./dic/listdic.cpp
./dic/regexp.cpp
./dic/regexp.h
./dic/tile.cpp
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Eliot-dev] eliot dic/compdic.cpp dic/encoding.cpp dic/enco... [cppdic],
eliot-dev <=