[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[GNUnet-SVN] r9810 - Extractor/src/plugins
From: |
gnunet |
Subject: |
[GNUnet-SVN] r9810 - Extractor/src/plugins |
Date: |
Sat, 19 Dec 2009 13:58:35 +0100 |
Author: grothoff
Date: 2009-12-19 13:58:35 +0100 (Sat, 19 Dec 2009)
New Revision: 9810
Modified:
Extractor/src/plugins/translitextractor.c
Log:
dos2unix
Modified: Extractor/src/plugins/translitextractor.c
===================================================================
--- Extractor/src/plugins/translitextractor.c 2009-12-19 12:58:07 UTC (rev
9809)
+++ Extractor/src/plugins/translitextractor.c 2009-12-19 12:58:35 UTC (rev
9810)
@@ -1,129 +1,129 @@
-/*
- This file is part of libextractor.
- (C) 2002 - 2005 Vidyut Samanta and Christian Grothoff
-
- libextractor is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 2, or (at your
- option) any later version.
-
- libextractor is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with libextractor; see the file COPYING. If not, write to the
- Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- Boston, MA 02111-1307, USA.
- */
-
-/**
- * @brief Transliterate keywords that contain international characters
- * @author Nils Durner
- */
-
-#include "platform.h"
-#include "extractor.h"
-#include "convert.h"
-
-/* Language independent chars were taken from glibc's locale/C-translit.h.in
- *
- * This extractor uses two tables: one contains the Unicode
- * characters and the other one contains the transliterations (since
- * transliterations are often used more than once: ä -> ae, æ -> ae).
- * The first table points to an appropriate transliteration stored in the
- * second table.
- *
- * To generate the two tables, a relational database was prepared:
- * create table TBL(UNI varchar(20), TRANSL varchar(10), TRANSLID integer);
- * create table TRANSL (TRANSL varchar(20) primary key, TRANSLID integer);
- *
- * After that, the data from glibc was converted to a SQL script using
- * "awk -F '\t'":
- * {
- * transl = $2;
- * gsub(/'/, "''", transl);
- * print "insert into TBL(UNI, TRANSL) values ('0x" substr($3, 6,
index($3, ">") - 6) "', '" transl "');";
- * print "insert into TRANSL(TRANSL, TRANSLID) values ('" transl "',
(Select count(*) from TRANSL));";
- * }
- *
- * Then the SQL script was executed, "commit"ted and the relation between the
- * two tables established using:
- * update TBL Set TRANSLID = (Select TRANSLID from TRANSL where
TRANSL.TRANSL = TBL.TRANSL);
- * commit;
- *
- * The C arrays were then created with:
- * Select '{' || UNI || ', ' || TRANSLID || '},' from TBL order by UNI;
- * Select TRANSL || ', ' from TRANSL order by TRANSLID;
- * and reformatted with:
- * {
- * a = $0;
- * getline;
- * b = $0;
- * getline;
- * c = $0;
- * getline;
- * printf("%s %s %s %s\n", a, b, c, $0);
- * }
- *
- * The unicode values for the other characters were taken from
- * http://bigfield.ddo.jp/unicode/unicode0.html
- */
+/*
+ This file is part of libextractor.
+ (C) 2002 - 2005 Vidyut Samanta and Christian Grothoff
+
+ libextractor is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ libextractor is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with libextractor; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+ */
+
+/**
+ * @brief Transliterate keywords that contain international characters
+ * @author Nils Durner
+ */
+
+#include "platform.h"
+#include "extractor.h"
+#include "convert.h"
+
+/* Language independent chars were taken from glibc's locale/C-translit.h.in
+ *
+ * This extractor uses two tables: one contains the Unicode
+ * characters and the other one contains the transliterations (since
+ * transliterations are often used more than once: ä -> ae, æ -> ae).
+ * The first table points to an appropriate transliteration stored in the
+ * second table.
+ *
+ * To generate the two tables, a relational database was prepared:
+ * create table TBL(UNI varchar(20), TRANSL varchar(10), TRANSLID integer);
+ * create table TRANSL (TRANSL varchar(20) primary key, TRANSLID integer);
+ *
+ * After that, the data from glibc was converted to a SQL script using
+ * "awk -F '\t'":
+ * {
+ * transl = $2;
+ * gsub(/'/, "''", transl);
+ * print "insert into TBL(UNI, TRANSL) values ('0x" substr($3, 6,
index($3, ">") - 6) "', '" transl "');";
+ * print "insert into TRANSL(TRANSL, TRANSLID) values ('" transl "',
(Select count(*) from TRANSL));";
+ * }
+ *
+ * Then the SQL script was executed, "commit"ted and the relation between the
+ * two tables established using:
+ * update TBL Set TRANSLID = (Select TRANSLID from TRANSL where
TRANSL.TRANSL = TBL.TRANSL);
+ * commit;
+ *
+ * The C arrays were then created with:
+ * Select '{' || UNI || ', ' || TRANSLID || '},' from TBL order by UNI;
+ * Select TRANSL || ', ' from TRANSL order by TRANSLID;
+ * and reformatted with:
+ * {
+ * a = $0;
+ * getline;
+ * b = $0;
+ * getline;
+ * c = $0;
+ * getline;
+ * printf("%s %s %s %s\n", a, b, c, $0);
+ * }
+ *
+ * The unicode values for the other characters were taken from
+ * http://bigfield.ddo.jp/unicode/unicode0.html
+ */
-unsigned int chars[][2] = {
+unsigned int chars[][2] = {
{0x00C4, 444}, {0x00D6, 445}, {0x00DC, 446}, {0x00DF, 13},
- /* Ä, Ö, Ü, ß */
-{0x00E4, 14}, {0x00F6, 19}, {0x00FC, 447}, {0x00C5, 448}, /* ä, ö, ü, Å */
-{0x00E5, 449}, {0x00C6, 444}, {0x00E6, 14}, {0x00D8, 445}, /* å, Æ, æ, Ø */
-{0x00F8, 19}, {0x00C0, 419}, {0x00C8, 77}, {0x00D9, 426}, /* ø, À, È, Ù */
-{0x00E0, 431}, {0x00E8, 76}, {0x00F9, 5}, {0x00C9, 77}, /* à, è, ù, É */
-{0x00E9, 76}, {0x00C2, 419}, {0x00CA, 77}, {0x00CE, 63}, /* é, Â, Ê, Î */
-{0x00D4, 423}, {0x00DB, 426}, {0x00E2, 431}, {0x00EA, 76}, /* Ô, Û, â, ê */
-{0x00EE, 80}, {0x00F4, 41}, {0x00FB, 5}, {0x00CB, 77}, /* î, ô, û, Ë */
-{0x00CF, 63}, {0x00EB, 76}, {0x00EF, 80}, {0x00C7, 57}, /* Ï, ë, ï, Ç */
-{0x00E7, 118}, {0x0152, 445}, {0x0053, 19}, {0x0080, 66}, /* �, �, �, � */
-
- /* Language independent */
-{0xFB00, 391}, {0xFB01, 392}, {0xFB02, 393}, {0xFB03, 394},
- {0xFB04, 395}, {0xFB06, 396}, {0xFB29, 40}, {0xFEFF, 36},
- {0xFE4D, 33}, {0xFE4E, 33}, {0xFE4F, 33}, {0xFE5A, 401},
- {0xFE5B, 402}, {0xFE5C, 403}, {0xFE5F, 404}, {0xFE50, 6},
- {0xFE52, 42}, {0xFE54, 397}, {0xFE55, 34}, {0xFE56, 398},
- {0xFE57, 399}, {0xFE59, 400}, {0xFE6A, 407}, {0xFE6B, 408},
- {0xFE60, 405}, {0xFE61, 128}, {0xFE62, 40}, {0xFE63, 3},
- {0xFE64, 47}, {0xFE65, 48}, {0xFE66, 262}, {0xFE68, 127},
- {0xFE69, 406}, {0xFF0A, 128}, {0xFF0B, 40}, {0xFF0C, 6},
- {0xFF0D, 3}, {0xFF0E, 42}, {0xFF0F, 126}, {0xFF01, 399},
- {0xFF02, 38}, {0xFF03, 404}, {0xFF04, 406}, {0xFF05, 407},
- {0xFF06, 405}, {0xFF07, 30}, {0xFF08, 400}, {0xFF09, 401},
- {0xFF1A, 34}, {0xFF1B, 397}, {0xFF1C, 47}, {0xFF1D, 262},
- {0xFF1E, 48}, {0xFF1F, 398}, {0xFF10, 409}, {0xFF11, 410},
- {0xFF12, 411}, {0xFF13, 412}, {0xFF14, 413}, {0xFF15, 414},
- {0xFF16, 415}, {0xFF17, 416}, {0xFF18, 417}, {0xFF19, 418},
- {0xFF2A, 421}, {0xFF2B, 422}, {0xFF2C, 64}, {0xFF2D, 79},
- {0xFF2E, 66}, {0xFF2F, 423}, {0xFF20, 408}, {0xFF21, 419},
- {0xFF22, 75}, {0xFF23, 57}, {0xFF24, 81}, {0xFF25, 77},
- {0xFF26, 78}, {0xFF27, 420}, {0xFF28, 61}, {0xFF29, 63},
- {0xFF3A, 73}, {0xFF3B, 429}, {0xFF3C, 127}, {0xFF3D, 430},
- {0xFF3E, 31}, {0xFF3F, 33}, {0xFF30, 68}, {0xFF31, 69},
- {0xFF32, 70}, {0xFF33, 424}, {0xFF34, 425}, {0xFF35, 426},
- {0xFF36, 100}, {0xFF37, 427}, {0xFF38, 105}, {0xFF39, 428},
- {0xFF4A, 83}, {0xFF4B, 434}, {0xFF4C, 65}, {0xFF4D, 119},
- {0xFF4E, 435}, {0xFF4F, 41}, {0xFF40, 32}, {0xFF41, 431},
- {0xFF42, 432}, {0xFF43, 118}, {0xFF44, 82}, {0xFF45, 76},
- {0xFF46, 433}, {0xFF47, 60}, {0xFF48, 62}, {0xFF49, 80},
- {0xFF5A, 442}, {0xFF5B, 402}, {0xFF5C, 129}, {0xFF5D, 403},
- {0xFF5E, 35}, {0xFF50, 436}, {0xFF51, 437}, {0xFF52, 438},
- {0xFF53, 20}, {0xFF54, 439}, {0xFF55, 5}, {0xFF56, 111},
- {0xFF57, 440}, {0xFF58, 12}, {0xFF59, 441}, {0x00AB, 2},
- {0x00AD, 3}, {0x00AE, 4}, {0x00A0, 0}, {0x00A9, 1},
- {0x00BB, 7}, {0x00BC, 8}, {0x00BD, 9}, {0x00BE, 10},
- {0x00B5, 5}, {0x00B8, 6}, {0x00C6, 11}, {0x00DF, 13},
- {0x00D7, 12}, {0x00E6, 14}, {0x0001D4AA, 423}, {0x0001D4AB, 68},
- {0x0001D4AC, 69}, {0x0001D4AE, 424}, {0x0001D4AF, 425}, {0x0001D4A2, 420},
- {0x0001D4A5, 421}, {0x0001D4A6, 422}, {0x0001D4A9, 66}, {0x0001D4BB, 433},
- {0x0001D4BD, 62}, {0x0001D4BE, 80}, {0x0001D4BF, 83}, {0x0001D4B0, 426},
+ /* Ä, Ö, Ü, ß */
+{0x00E4, 14}, {0x00F6, 19}, {0x00FC, 447}, {0x00C5, 448}, /* ä, ö, ü, Å */
+{0x00E5, 449}, {0x00C6, 444}, {0x00E6, 14}, {0x00D8, 445}, /* å, Æ, æ, Ø */
+{0x00F8, 19}, {0x00C0, 419}, {0x00C8, 77}, {0x00D9, 426}, /* ø, À, È, Ù */
+{0x00E0, 431}, {0x00E8, 76}, {0x00F9, 5}, {0x00C9, 77}, /* à, è, ù, É */
+{0x00E9, 76}, {0x00C2, 419}, {0x00CA, 77}, {0x00CE, 63}, /* é, Â, Ê, Î */
+{0x00D4, 423}, {0x00DB, 426}, {0x00E2, 431}, {0x00EA, 76}, /* Ô, Û, â, ê */
+{0x00EE, 80}, {0x00F4, 41}, {0x00FB, 5}, {0x00CB, 77}, /* î, ô, û, Ë */
+{0x00CF, 63}, {0x00EB, 76}, {0x00EF, 80}, {0x00C7, 57}, /* Ï, ë, ï, Ç */
+{0x00E7, 118}, {0x0152, 445}, {0x0053, 19}, {0x0080, 66}, /* �, �, �, � */
+
+ /* Language independent */
+{0xFB00, 391}, {0xFB01, 392}, {0xFB02, 393}, {0xFB03, 394},
+ {0xFB04, 395}, {0xFB06, 396}, {0xFB29, 40}, {0xFEFF, 36},
+ {0xFE4D, 33}, {0xFE4E, 33}, {0xFE4F, 33}, {0xFE5A, 401},
+ {0xFE5B, 402}, {0xFE5C, 403}, {0xFE5F, 404}, {0xFE50, 6},
+ {0xFE52, 42}, {0xFE54, 397}, {0xFE55, 34}, {0xFE56, 398},
+ {0xFE57, 399}, {0xFE59, 400}, {0xFE6A, 407}, {0xFE6B, 408},
+ {0xFE60, 405}, {0xFE61, 128}, {0xFE62, 40}, {0xFE63, 3},
+ {0xFE64, 47}, {0xFE65, 48}, {0xFE66, 262}, {0xFE68, 127},
+ {0xFE69, 406}, {0xFF0A, 128}, {0xFF0B, 40}, {0xFF0C, 6},
+ {0xFF0D, 3}, {0xFF0E, 42}, {0xFF0F, 126}, {0xFF01, 399},
+ {0xFF02, 38}, {0xFF03, 404}, {0xFF04, 406}, {0xFF05, 407},
+ {0xFF06, 405}, {0xFF07, 30}, {0xFF08, 400}, {0xFF09, 401},
+ {0xFF1A, 34}, {0xFF1B, 397}, {0xFF1C, 47}, {0xFF1D, 262},
+ {0xFF1E, 48}, {0xFF1F, 398}, {0xFF10, 409}, {0xFF11, 410},
+ {0xFF12, 411}, {0xFF13, 412}, {0xFF14, 413}, {0xFF15, 414},
+ {0xFF16, 415}, {0xFF17, 416}, {0xFF18, 417}, {0xFF19, 418},
+ {0xFF2A, 421}, {0xFF2B, 422}, {0xFF2C, 64}, {0xFF2D, 79},
+ {0xFF2E, 66}, {0xFF2F, 423}, {0xFF20, 408}, {0xFF21, 419},
+ {0xFF22, 75}, {0xFF23, 57}, {0xFF24, 81}, {0xFF25, 77},
+ {0xFF26, 78}, {0xFF27, 420}, {0xFF28, 61}, {0xFF29, 63},
+ {0xFF3A, 73}, {0xFF3B, 429}, {0xFF3C, 127}, {0xFF3D, 430},
+ {0xFF3E, 31}, {0xFF3F, 33}, {0xFF30, 68}, {0xFF31, 69},
+ {0xFF32, 70}, {0xFF33, 424}, {0xFF34, 425}, {0xFF35, 426},
+ {0xFF36, 100}, {0xFF37, 427}, {0xFF38, 105}, {0xFF39, 428},
+ {0xFF4A, 83}, {0xFF4B, 434}, {0xFF4C, 65}, {0xFF4D, 119},
+ {0xFF4E, 435}, {0xFF4F, 41}, {0xFF40, 32}, {0xFF41, 431},
+ {0xFF42, 432}, {0xFF43, 118}, {0xFF44, 82}, {0xFF45, 76},
+ {0xFF46, 433}, {0xFF47, 60}, {0xFF48, 62}, {0xFF49, 80},
+ {0xFF5A, 442}, {0xFF5B, 402}, {0xFF5C, 129}, {0xFF5D, 403},
+ {0xFF5E, 35}, {0xFF50, 436}, {0xFF51, 437}, {0xFF52, 438},
+ {0xFF53, 20}, {0xFF54, 439}, {0xFF55, 5}, {0xFF56, 111},
+ {0xFF57, 440}, {0xFF58, 12}, {0xFF59, 441}, {0x00AB, 2},
+ {0x00AD, 3}, {0x00AE, 4}, {0x00A0, 0}, {0x00A9, 1},
+ {0x00BB, 7}, {0x00BC, 8}, {0x00BD, 9}, {0x00BE, 10},
+ {0x00B5, 5}, {0x00B8, 6}, {0x00C6, 11}, {0x00DF, 13},
+ {0x00D7, 12}, {0x00E6, 14}, {0x0001D4AA, 423}, {0x0001D4AB, 68},
+ {0x0001D4AC, 69}, {0x0001D4AE, 424}, {0x0001D4AF, 425}, {0x0001D4A2, 420},
+ {0x0001D4A5, 421}, {0x0001D4A6, 422}, {0x0001D4A9, 66}, {0x0001D4BB, 433},
+ {0x0001D4BD, 62}, {0x0001D4BE, 80}, {0x0001D4BF, 83}, {0x0001D4B0, 426},
{0x0001D4B1, 100}, {0x0001D4B2, 427}, {0x0001D4B3, 105}, {0x0001D4B4, 428},
{0x0001D4B5, 73}, {0x0001D4B6, 431}, {0x0001D4B7, 432}, {0x0001D4B8, 118},
@@ -839,9 +839,9 @@
"Y", "[", "]", "a",
"b", "f", "k", "n",
"p", "q", "r", "t",
-"w", "y", "z", "z",
- /* German */ "Ae", "Oe", "Ue", "ue",
- /* Scandinavian */ "Aa", "aa"
+"w", "y", "z", "z",
+ /* German */ "Ae", "Oe", "Ue", "ue",
+ /* Scandinavian */ "Aa", "aa"
};
@@ -867,7 +867,7 @@
}
-struct EXTRACTOR_Keywords *
+struct EXTRACTOR_Keywords *
libextractor_translit_extract (const char *filename,
const char *data,
@@ -892,7 +892,7 @@
while (pos != NULL)
-
+
{
int charlen = 0;
@@ -916,28 +916,27 @@
char *tr;
-
- /* Get length of character */
+ /* Get length of character */
c = srcdata[src];
if ((c & 0xC0) == 0xC0)
-
- /* UTF-8 char */
+
+ /* UTF-8 char */
if ((c & 0xE0) == 0xE0)
if ((c & 0xF0) == 0xF0)
charlen = 4;
-
+
else
charlen = 3;
-
+
else
charlen = 2;
-
+
else
charlen = 1;
@@ -945,16 +944,15 @@
if (src + charlen - 1 > len)
{
-
- /* incomplete UTF-8 */
+
+ /* incomplete UTF-8 */
src = len;
continue;
}
-
-
- /* Copy character to destination */
+
+ /* Copy character to destination */
if (charlen > 1)
{
@@ -963,39 +961,38 @@
if (charlen == 2)
{
-
- /* 5 bits from the first byte and 6 bits from the second.
- 64 = 2^6 */
+
+ /* 5 bits from the first byte and 6 bits from the second.
+ 64 = 2^6 */
unicode =
((srcdata[src] & 0x1F) * 64) | (srcdata[src + 1] & 0x3F);
}
-
+
else if (charlen == 3)
{
-
- /* 4 bits from the first byte and 6 bits from the second
and third
- byte. 4096 = 2^12 */
- unicode = ((srcdata[src] & 0xF) * 4096) |
+
+ /* 4 bits from the first byte and 6 bits from the second
and third
+ byte. 4096 = 2^12 */
+ unicode = ((srcdata[src] & 0xF) * 4096) |
((srcdata[src + 1] & 0x3F) *
64) | (srcdata[src + 2] & 0x3F);
}
-
+
else if (charlen == 4)
{
-
- /* 3 bits from the first byte and 6 bits from the second,
third
- and fourth byte. 262144 = 2^18 */
- unicode = ((srcdata[src] & 7) * 262144) |
- ((srcdata[src] & 0xF) * 4096) |
+
+ /* 3 bits from the first byte and 6 bits from the second,
third
+ and fourth byte. 262144 = 2^18 */
+ unicode = ((srcdata[src] & 7) * 262144) |
+ ((srcdata[src] & 0xF) * 4096) |
((srcdata[src + 1] & 0x3F) *
64) | (srcdata[src + 2] & 0x3F);
}
-
-
- /* Look it up */
+
+ /* Look it up */
idx = 0;
tr = srcdata + src;
@@ -1007,8 +1004,8 @@
if (unicode == chars[idx][0])
{
-
- /* Found it */
+
+ /* Found it */
tr = translit[chars[idx][1]];
trlen = strlen (tr);
@@ -1022,7 +1019,7 @@
}
}
-
+
else
trlen = 1;
@@ -1040,12 +1037,12 @@
if (charlen > 1)
{
-
- /* Copy character to destination string */
+
+ /* Copy character to destination string */
memcpy (transl + dest, tr, trlen);
}
-
+
else
transl[dest] = c;
@@ -1076,4 +1073,4 @@
}
-
+
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [GNUnet-SVN] r9810 - Extractor/src/plugins,
gnunet <=