eliot-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Eliot-dev] eliot dic/compdic.cpp dic/encoding.cpp dic/enco... [cppdic]


From: eliot-dev
Subject: [Eliot-dev] eliot dic/compdic.cpp dic/encoding.cpp dic/enco... [cppdic]
Date: Sun, 09 Dec 2007 16:29:55 +0000

CVSROOT:        /cvsroot/eliot
Module name:    eliot
Branch:         cppdic
Changes by:     Olivier Teulière <ipkiss>      07/12/09 16:29:55

Modified files:
        dic            : compdic.cpp encoding.cpp encoding.h header.cpp 
                         header.h 
        po             : POTFILES.in 

Log message:
         - Moved the functions converting between wstring and UTF-8 to
           encoding.cpp, and used them in compdic
         - POTFILES.in: removed duplicated lines

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/compdic.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.13&r2=1.1.2.14
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/encoding.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.8&r2=1.1.2.9
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/encoding.h?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.3&r2=1.1.2.4
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/header.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.14&r2=1.1.2.15
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/header.h?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.8&r2=1.1.2.9
http://cvs.savannah.gnu.org/viewcvs/eliot/po/POTFILES.in?cvsroot=eliot&only_with_tag=cppdic&r1=1.3.2.2&r2=1.3.2.3

Patches:
Index: dic/compdic.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/compdic.cpp,v
retrieving revision 1.1.2.13
retrieving revision 1.1.2.14
diff -u -b -r1.1.2.13 -r1.1.2.14
--- dic/compdic.cpp     6 Dec 2007 13:24:48 -0000       1.1.2.13
+++ dic/compdic.cpp     9 Dec 2007 16:29:54 -0000       1.1.2.14
@@ -32,7 +32,6 @@
 #include <vector>
 #include <map>
 #include <boost/tokenizer.hpp>
-#include <iconv.h>
 #include <getopt.h>
 #include <time.h>
 #include <sys/types.h>
@@ -77,7 +76,7 @@
 #define CHECK_RECURSION
 
 
-wchar_t* load_uncompressed(const string &iFileName, unsigned int &ioDicSize)
+const wchar_t* load_uncompressed(const string &iFileName, unsigned int &ioDicSize)
 {
     ifstream file(iFileName.c_str());
     if (!file.is_open())
@@ -89,34 +88,25 @@
     file.read(&buffer.front(), ioDicSize);
     file.close();
 
-    // The words are supposed to be in utf-8, so convert everything to
-    // wide characters
-    iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
-    if (handle == (iconv_t)(-1))
-        throw DicException("load_uncompressed: Error in iconv_open");
-
     // Buffer for the wide characters (it will use at most as many characters
     // as the utf-8 version)
-    // FIXME: not exception safe
-    wchar_t *wide_buf = new wchar_t[ioDicSize];
+    wchar_t *wideBuf = new wchar_t[ioDicSize];
+    unsigned int number;
 
-    size_t inChars = ioDicSize;
-    size_t outChars = sizeof(wchar_t) * ioDicSize;
-    ICONV_CONST char *in = &buffer.front();
-    char *out = (char*)wide_buf;
-    size_t res = iconv(handle, &in, &inChars, &out, &outChars);
-    iconv_close(handle);
-    // Problem during encoding conversion?
-    if (res == (size_t)(-1))
+    try
     {
-        delete[] wide_buf;
-        throw DicException("load_uncompressed: " + string(strerror(errno)));
+        number = readFromUTF8(wideBuf, ioDicSize, &buffer.front(),
+                              ioDicSize, "load_uncompressed");
     }
+    catch (...)
+    {
+        // Avoid leaks, and propagate the exception
+        delete[] wideBuf;
+        throw;
+    }
+    ioDicSize = number;
 
-    // Update ioDicSize with the actual length of the wchar_t array
-    ioDicSize -= outChars / sizeof(wchar_t);
-
-    return wide_buf;
+    return wideBuf;
 }
 
 
@@ -158,25 +148,13 @@
         }
 
 #define MAX_SIZE 4
-        wchar_t letter[MAX_SIZE];
         char buff[MAX_SIZE];
         strncpy(buff, tokens[0].c_str(), MAX_SIZE);
 
-        // The letter is supposed to be in utf-8, so convert it to
-        // wide characters
-        iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
-        if (handle == (iconv_t)(-1))
-            throw DicException("readLetters: Error in iconv_open");
-        size_t inChars = tokens[0].size();
-        size_t outChars = sizeof(wchar_t) * MAX_SIZE;
-        ICONV_CONST char *in = buff;
-        char *out = (char*)letter;
-        size_t res = iconv(handle, &in, &inChars, &out, &outChars);
-        iconv_close(handle);
-        // Problem during encoding conversion?
-        if (res == (size_t)(-1))
-            throw DicException("readLetters: " + string(strerror(errno)));
-        if (outChars != sizeof(wchar_t) * (MAX_SIZE - 1))
+        wstring letter;
+        readFromUTF8(letter, buff, tokens[0].size(), "readLetters");
+
+        if (letter.size() != 1)
         {
             ostringstream ss;
             ss << "readLetters: Invalid letter at line " << lineNb;
@@ -298,8 +276,8 @@
 
 wchar_t  global_stringbuf[MAX_STRING_LENGTH]; /* Space for current string */
 wchar_t* global_endstring;                    /* Marks END of current string */
-wchar_t* global_input;
-wchar_t* global_endofinput;
+const wchar_t* global_input;
+const wchar_t* global_endofinput;
 #ifdef CHECK_RECURSION
 map<int, vector<Dawg_edge> > global_mapfordepth;
 #endif
@@ -521,7 +499,7 @@
 
         clock_t startLoadTime = clock();
         // FIXME: not exception safe
-        wchar_t *uncompressed = load_uncompressed(inFileName, dicsize);
+        const wchar_t *uncompressed = load_uncompressed(inFileName, dicsize);
         clock_t endLoadTime = clock();
 
         global_input = uncompressed;

Index: dic/encoding.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/encoding.cpp,v
retrieving revision 1.1.2.8
retrieving revision 1.1.2.9
diff -u -b -r1.1.2.8 -r1.1.2.9
--- dic/encoding.cpp    6 Dec 2007 15:09:52 -0000       1.1.2.8
+++ dic/encoding.cpp    9 Dec 2007 16:29:55 -0000       1.1.2.9
@@ -32,6 +32,9 @@
 #include <stdarg.h>
 #include <wchar.h>
 #include <wctype.h>
+#include <errno.h>
+#include <string.h>
+#include <iconv.h>
 
 #include "encoding.h"
 #include "dic_exception.h"
@@ -181,3 +184,79 @@
     }
 }
 
+
+unsigned int readFromUTF8(wchar_t *oString, unsigned int iWideSize,
+                          const char *iBuffer, unsigned int iBufSize,
+                          const string &iContext)
+{
+    iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
+    if (handle == (iconv_t)(-1))
+        throw DicException("readFromUTF8: iconv_open failed");
+    size_t inChars = iBufSize;
+    size_t outChars = iWideSize * sizeof(wchar_t);
+    // Use the ICONV_CONST trick because the declaration of iconv()
+    // differs depending on the implementations...
+    ICONV_CONST char *in = const_cast<ICONV_CONST char*>(iBuffer);
+    char *out = (char*)oString;
+    size_t res = iconv(handle, &in, &inChars, &out, &outChars);
+    iconv_close(handle);
+    // Problem during encoding conversion?
+    if (res == (size_t)(-1))
+    {
+        throw DicException("readFromUTF8: iconv failed (" +
+                           iContext + "): " + string(strerror(errno)));
+    }
+    return iWideSize - outChars / sizeof(wchar_t);
+}
+
+
+void readFromUTF8(wstring &oString, const char *iBuffer,
+                  unsigned int iBufSize, const string &iContext)
+{
+    // Temporary buffer for output
+    // We will have at most as many characters as in the UTF-8 string
+    wchar_t *wideBuf = new wchar_t[iBufSize];
+    unsigned int number;
+    try
+    {
+        number = readFromUTF8(wideBuf, iBufSize, iBuffer, iBufSize, iContext);
+    }
+    catch (...)
+    {
+        // Make sure not to leak
+        delete[] wideBuf;
+        throw;
+    }
+    // Copy the string
+    oString = wstring(wideBuf, number);
+    delete[] wideBuf;
+}
+
+
+unsigned int writeInUTF8(const wstring &iString, char *oBuffer,
+                         unsigned int iBufSize, const string &iContext)
+{
+    iconv_t handle = iconv_open("UTF-8", "WCHAR_T");
+    if (handle == (iconv_t)(-1))
+        throw DicException("writeInUTF8: iconv_open failed");
+    size_t length = iString.size();
+    size_t inChars = sizeof(wchar_t) * length;
+    size_t outChars = iBufSize;
+    // Use the ICONV_CONST trick because the declaration of iconv()
+    // differs depending on the implementations...
+    // FIXME: bonus ugliness for doing 2 casts at once, and accessing string
+    // internals...
+    ICONV_CONST char *in = (ICONV_CONST char*)(&iString[0]);
+    char *out = oBuffer;
+    size_t res = iconv(handle, &in, &inChars, &out, &outChars);
+    iconv_close(handle);
+    // Problem during encoding conversion?
+    if (res == (size_t)(-1))
+    {
+        throw DicException("writeInUTF8: iconv failed (" +
+                           iContext + ")" + string(strerror(errno)));
+    }
+    // Return the number of written bytes
+    return iBufSize - outChars;
+}
+

Index: dic/encoding.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/encoding.h,v
retrieving revision 1.1.2.3
retrieving revision 1.1.2.4
diff -u -b -r1.1.2.3 -r1.1.2.4
--- dic/encoding.h      4 Dec 2007 11:09:59 -0000       1.1.2.3
+++ dic/encoding.h      9 Dec 2007 16:29:55 -0000       1.1.2.4
@@ -59,5 +59,42 @@
 string padAndConvert(const wstring &iWstr, unsigned int iLength,
                      bool iLeftPad = true, char c = ' ');
 
+/**
+ * Utility function to convert a char* buffer encoded in UTF-8 into a
+ * wchar_t* string
+ * @param oString: where to write the converted string
+ * @param iWideSize: size available in oString (number of wchar_t)
+ * @param iBuffer: UTF-8 string to convert
+ * @param iBufSize: available size in iBuffer
+ * @param iContext: free text used in case of exception
+ * @return: number of wide chars actually written
+ */
+unsigned int readFromUTF8(wchar_t *oString, unsigned int iWideSize,
+                          const char *iBuffer, unsigned int iBufSize,
+                          const string &iContext);
+
+/**
+ * Same as the other readFromUTF8 function, dealing with a wstring
+ * instead of a wchar_t*. Note that it performs an additional copy
+ * of the output string...
+ * @param oString: where to write the converted string
+ * @param iBuffer: UTF-8 string to convert
+ * @param iBufSize: available size in iBuffer
+ * @param iContext: free text used in case of exception
+ */
+void readFromUTF8(wstring &oString, const char *iBuffer,
+                  unsigned int iBufSize, const string &iContext);
+
+/**
+ * Utility function to convert a wstring into an UTF-8 char* buffer
+ * @param iString: the wide string to encode
+ * @param oBuffer: where to write the encoded string
+ * @param iBufSize: available size in oBuffer
+ * @param iContext: free text used in case of exception
+ * @return: number of bytes actually written
+ */
+unsigned int writeInUTF8(const wstring &iString, char *oBuffer,
+                         unsigned int iBufSize, const string &iContext);
+
 #endif
 

Index: dic/header.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/header.cpp,v
retrieving revision 1.1.2.14
retrieving revision 1.1.2.15
diff -u -b -r1.1.2.14 -r1.1.2.15
--- dic/header.cpp      6 Dec 2007 13:24:48 -0000       1.1.2.14
+++ dic/header.cpp      9 Dec 2007 16:29:55 -0000       1.1.2.15
@@ -22,9 +22,6 @@
 #include <string>
 #include <sstream>
 #include <iostream>
-#include <iconv.h>
-#include <errno.h>
-#include <string.h>
 
 // For ntohl & Co.
 #ifdef WIN32
@@ -490,62 +487,6 @@
 }
 
 
-void Header::readFromUTF8(wstring &oString, const char *iBuffer,
-                          unsigned int iBufSize, const string &iContext) const
-{
-    iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
-    if (handle == (iconv_t)(-1))
-        throw DicException("Header::readFromUTF8: iconv_open failed");
-    // Temporary buffer for output
-    // We will have at most as many characters as in the UTF-8 string
-    wchar_t outbuf[iBufSize];
-    size_t inChars = iBufSize;
-    size_t outChars = sizeof(wchar_t) * iBufSize;
-    // Use the ICONV_CONST trick because the declaration of iconv()
-    // differs depending on the implementations...
-    ICONV_CONST char *in = const_cast<ICONV_CONST char*>(iBuffer);
-    char *out = (char*)outbuf;
-    size_t res = iconv(handle, &in, &inChars, &out, &outChars);
-    iconv_close(handle);
-    // Problem during encoding conversion?
-    if (res == (size_t)(-1))
-    {
-        throw DicException("Header::readFromUTF8: iconv failed (" +
-                           iContext + "): " + string(strerror(errno)));
-    }
-    // We finally have the letters as a wstring!
-    oString = wstring(outbuf, iBufSize - outChars / sizeof(wchar_t));
-}
-
-
-unsigned int Header::writeInUTF8(const wstring &iString, char *oBuffer,
-                                 unsigned int iBufSize, const string &iContext) const
-{
-    iconv_t handle = iconv_open("UTF-8", "WCHAR_T");
-    if (handle == (iconv_t)(-1))
-        throw DicException("Header::writeInUTF8: iconv_open failed");
-    size_t length = iString.size();
-    size_t inChars = sizeof(wchar_t) * length;
-    size_t outChars = iBufSize;
-    // Use the ICONV_CONST trick because the declaration of iconv()
-    // differs depending on the implementations...
-    // FIXME: bonus ugliness for doing 2 casts at once, and accessing string
-    // internals...
-    ICONV_CONST char *in = (ICONV_CONST char*)(&iString[0]);
-    char *out = oBuffer;
-    size_t res = iconv(handle, &in, &inChars, &out, &outChars);
-    iconv_close(handle);
-    // Problem during encoding conversion?
-    if (res == (size_t)(-1))
-    {
-        throw DicException("Header::writeInUTF8: iconv failed (" +
-                           iContext + ")");
-    }
-    // Return the number of written bytes
-    return iBufSize - outChars;
-}
-
-
 void Header::print() const
 {
     printf(_("dictionary name: %s\n"), convertToMb(m_dicName).c_str());

Index: dic/header.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/header.h,v
retrieving revision 1.1.2.8
retrieving revision 1.1.2.9
diff -u -b -r1.1.2.8 -r1.1.2.9
--- dic/header.h        4 Dec 2007 11:09:59 -0000       1.1.2.8
+++ dic/header.h        9 Dec 2007 16:29:55 -0000       1.1.2.9
@@ -177,28 +177,6 @@
 
     /** Build m_mapCodeFromChar */
     void buildMapCodeFromChar();
-
-    /**
-     * Utility function to convert a char* buffer encoded in UTF-8 into a
-     * wide wstring
-     * @param oString: where to write the converted string
-     * @param iBuffer: UTF-8 string to convert
-     * @param iBufSize: available size in iBuffer
-     * @param iContext: free text used in case of exception
-     */
-    void readFromUTF8(wstring &oString, const char *iBuffer,
-                      unsigned int iBufSize, const string &iContext) const;
-
-    /**
-     * Utility function to convert a wstring into an UTF-8 char* buffer
-     * @param iString: the wide string to encode
-     * @param oBuffer: where to write the encoded string
-     * @param iBufSize: available size in oBuffer
-     * @param iContext: free text used in case of exception
-     * @return: number of bytes actually written
-     */
-    unsigned int writeInUTF8(const wstring &iString, char *oBuffer,
-                             unsigned int iBufSize, const string &iContext) const;
 };
 
 

Index: po/POTFILES.in
===================================================================
RCS file: /cvsroot/eliot/eliot/po/POTFILES.in,v
retrieving revision 1.3.2.2
retrieving revision 1.3.2.3
diff -u -b -r1.3.2.2 -r1.3.2.3
--- po/POTFILES.in      8 Dec 2007 13:56:10 -0000       1.3.2.2
+++ po/POTFILES.in      9 Dec 2007 16:29:55 -0000       1.3.2.3
@@ -1,6 +1,5 @@
 ./dic/automaton.cpp
 ./dic/automaton.h
-./dic/compdic.cpp
 ./dic/header.cpp
 ./dic/header.h
 ./dic/dic.cpp
@@ -9,7 +8,6 @@
 ./dic/dic_search.cpp
 ./dic/hashtable.cpp
 ./dic/hashtable.h
-./dic/listdic.cpp
 ./dic/regexp.cpp
 ./dic/regexp.h
 ./dic/tile.cpp




reply via email to

[Prev in Thread] Current Thread [Next in Thread]