[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[GNUnet-SVN] r9814 - in Extractor/src: include main plugins
From: |
gnunet |
Subject: |
[GNUnet-SVN] r9814 - in Extractor/src: include main plugins |
Date: |
Sat, 19 Dec 2009 22:10:55 +0100 |
Author: grothoff
Date: 2009-12-19 22:10:55 +0100 (Sat, 19 Dec 2009)
New Revision: 9814
Modified:
Extractor/src/include/extractor.h
Extractor/src/main/extractor_metatypes.c
Extractor/src/plugins/html_extractor.c
Extractor/src/plugins/id3v2_extractor.c
Log:
id3v2
Modified: Extractor/src/include/extractor.h
===================================================================
--- Extractor/src/include/extractor.h 2009-12-19 13:42:26 UTC (rev 9813)
+++ Extractor/src/include/extractor.h 2009-12-19 21:10:55 UTC (rev 9814)
@@ -136,7 +136,7 @@
EXTRACTOR_METATYPE_PUBLICATION_DATE = 24,
EXTRACTOR_METATYPE_BIBTEX_EPRINT = 25,
EXTRACTOR_METATYPE_BIBTEX_ENTRY_TYPE = 26,
- EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE = 27,
+ EXTRACTOR_METATYPE_LANGUAGE = 27,
EXTRACTOR_METATYPE_CREATION_TIME = 28,
EXTRACTOR_METATYPE_URL = 29,
@@ -285,12 +285,21 @@
EXTRACTOR_METATYPE_CHAPTER_NAME = 153,
EXTRACTOR_METATYPE_SONG_COUNT = 154,
EXTRACTOR_METATYPE_STARTING_SONG = 155,
+ EXTRACTOR_METATYPE_PLAY_COUNTER = 156,
+ EXTRACTOR_METATYPE_CONDUCTOR = 157,
+ EXTRACTOR_METATYPE_INTERPRETATION = 158,
+ EXTRACTOR_METATYPE_COMPOSER = 159,
+ EXTRACTOR_METATYPE_BEATS_PER_MINUTE = 160,
+ EXTRACTOR_METATYPE_ENCODED_BY = 161,
+ EXTRACTOR_METATYPE_ORIGINAL_TITLE = 162,
+ EXTRACTOR_METATYPE_ORIGINAL_ARTIST = 163,
+ EXTRACTOR_METATYPE_ORIGINAL_WRITER = 164,
+ EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR = 165,
+ EXTRACTOR_METATYPE_LYRICS = 166,
+ EXTRACTOR_METATYPE_POPULARITY_METER = 167,
/* fixme: used up to here! */
- EXTRACTOR_METATYPE_LYRICS = 67,
- EXTRACTOR_METATYPE_CONDUCTOR = 64,
- EXTRACTOR_METATYPE_INTERPRET = 65,
EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER = 117,
EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST = 123,
@@ -304,7 +313,6 @@
/* numeric metrics */
- EXTRACTOR_METATYPE_POPULARITY_METER = 119,
EXTRACTOR_METATYPE_RATING = 145,
EXTRACTOR_METATYPE_PRIORITY = 60,
@@ -316,7 +324,6 @@
/* misc (see if these are still needed...) */
EXTRACTOR_METATYPE_GENERATOR = 103,
- EXTRACTOR_METATYPE_ENCODED_BY = 121,
EXTRACTOR_METATYPE_FULL_DATA = 137,
Modified: Extractor/src/main/extractor_metatypes.c
===================================================================
--- Extractor/src/main/extractor_metatypes.c 2009-12-19 13:42:26 UTC (rev 9813)
+++ Extractor/src/main/extractor_metatypes.c 2009-12-19 21:10:55 UTC (rev 9814)
@@ -380,16 +380,34 @@
/* 155 */
{ gettext_noop ("starting song"),
gettext_noop ("number of the first song to play") },
- { gettext_noop (""),
+ { gettext_noop ("play counter"),
+ gettext_noop ("number of times the media has been played") },
+ { gettext_noop ("conductor"),
+ gettext_noop ("name of the conductor") },
+ { gettext_noop ("interpretation"),
+ gettext_noop ("information about the people behind interpretations of an existing piece") },
+ { gettext_noop ("composer"),
+ gettext_noop ("name of the composer") },
+ /* 160 */
+ { gettext_noop ("beats per minute"),
gettext_noop ("") },
+ { gettext_noop ("encoded by"),
+ gettext_noop ("name of person or organization that encoded the file") },
+ { gettext_noop ("original title"),
+ gettext_noop ("title of the original work") },
+ { gettext_noop ("original artist"),
+ gettext_noop ("name of the original artist") },
+ { gettext_noop ("original writer"),
+ gettext_noop ("name of the original lyricist or writer") },
+ /* 165 */
+ { gettext_noop ("original release year"),
+ gettext_noop ("year of the original release") },
+ { gettext_noop ("lyrics"),
+ gettext_noop ("lyrics of the song or text description of vocal activities") },
+ { gettext_noop ("popularity"),
+ gettext_noop ("information about the file's popularity") },
{ gettext_noop (""),
gettext_noop ("") },
- { gettext_noop (""),
- gettext_noop ("") },
- { gettext_noop (""),
- gettext_noop ("") },
- { gettext_noop (""),
- gettext_noop ("") },
#if 0
gettext_noop("author"),
Modified: Extractor/src/plugins/html_extractor.c
===================================================================
--- Extractor/src/plugins/html_extractor.c 2009-12-19 13:42:26 UTC (rev 9813)
+++ Extractor/src/plugins/html_extractor.c 2009-12-19 21:10:55 UTC (rev 9814)
@@ -44,7 +44,7 @@
{ "rights", EXTRACTOR_METATYPE_RIGHTS },
{ "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
{ "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
- { "language", EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE },
+ { "language", EXTRACTOR_METATYPE_LANGUAGE },
{ "keywords", EXTRACTOR_METATYPE_KEYWORDS },
{ "abstract", EXTRACTOR_METATYPE_ABSTRACT },
{ "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
Modified: Extractor/src/plugins/id3v2_extractor.c
===================================================================
--- Extractor/src/plugins/id3v2_extractor.c 2009-12-19 13:42:26 UTC (rev 9813)
+++ Extractor/src/plugins/id3v2_extractor.c 2009-12-19 21:10:55 UTC (rev 9814)
@@ -28,46 +28,91 @@
#define DEBUG_EXTRACT_ID3v2 0
+enum Id3v2Fmt
+ {
+ T, /* simple, 0-terminated string, prefixed by encoding */
+ U, /* 0-terminated ASCII string, no encoding */
+ UL, /* unsync'ed lyrics */
+ SL, /* sync'ed lyrics */
+ L, /* string with language prefix */
+ I /* image */
+ };
+
typedef struct
{
const char *text;
enum EXTRACTOR_MetaType type;
+ enum Id3v2Fmt fmt;
} Matches;
static Matches tmap[] = {
- {"TAL", EXTRACTOR_METATYPE_TITLE},
- {"TT1", EXTRACTOR_METATYPE_GROUP},
- {"TT2", EXTRACTOR_METATYPE_TITLE},
- {"TT3", EXTRACTOR_METATYPE_TITLE},
- {"TXT", EXTRACTOR_METATYPE_DESCRIPTION},
- {"TPB", EXTRACTOR_METATYPE_PUBLISHER},
- {"WAF", EXTRACTOR_METATYPE_LOCATION},
- {"WAR", EXTRACTOR_METATYPE_LOCATION},
- {"WAS", EXTRACTOR_METATYPE_LOCATION},
- {"WCP", EXTRACTOR_METATYPE_COPYRIGHT},
- {"WAF", EXTRACTOR_METATYPE_LOCATION},
- {"WCM", EXTRACTOR_METATYPE_DISCLAIMER},
- {"TSS", EXTRACTOR_METATYPE_FORMAT},
- {"TYE", EXTRACTOR_METATYPE_DATE},
- {"TLA", EXTRACTOR_METATYPE_LANGUAGE},
- {"TP1", EXTRACTOR_METATYPE_ARTIST},
- {"TP2", EXTRACTOR_METATYPE_ARTIST},
- {"TP3", EXTRACTOR_METATYPE_CONDUCTOR},
- {"TP4", EXTRACTOR_METATYPE_INTERPRET},
- {"IPL", EXTRACTOR_METATYPE_CONTRIBUTOR},
- {"TOF", EXTRACTOR_METATYPE_FILENAME},
- {"TEN", EXTRACTOR_METATYPE_PRODUCER},
- {"TCO", EXTRACTOR_METATYPE_SUBJECT},
- {"TCR", EXTRACTOR_METATYPE_COPYRIGHT},
- {"SLT", EXTRACTOR_METATYPE_LYRICS},
- {"TOA", EXTRACTOR_METATYPE_ARTIST},
- {"TRC", EXTRACTOR_METATYPE_ISRC},
- {"TRK", EXTRACTOR_METATYPE_TRACK_NUMBER},
- {"TCM", EXTRACTOR_METATYPE_CREATOR},
- {"TOT", EXTRACTOR_METATYPE_ALBUM},
- {"TOL", EXTRACTOR_METATYPE_AUTHOR},
- {"COM", EXTRACTOR_METATYPE_COMMENT},
- {"", EXTRACTOR_METATYPE_KEYWORDS},
+ /* skipping UFI */
+ {"TT1", EXTRACTOR_METATYPE_SECTION, T},
+ {"TT2", EXTRACTOR_METATYPE_TITLE, T},
+ {"TT3", EXTRACTOR_METATYPE_SONG_VERSION, T},
+ {"TP1", EXTRACTOR_METATYPE_ARTIST, T},
+ {"TP2", EXTRACTOR_METATYPE_PERFORMER, T},
+ {"TP3", EXTRACTOR_METATYPE_CONDUCTOR, T},
+ {"TP4", EXTRACTOR_METATYPE_INTERPRETATION, T},
+ {"TCM", EXTRACTOR_METATYPE_COMPOSER, T},
+ {"TXT", EXTRACTOR_METATYPE_WRITER, T},
+ {"TLA", EXTRACTOR_METATYPE_LANGUAGE, T},
+ {"TCO", EXTRACTOR_METATYPE_GENRE, T},
+ {"TAL", EXTRACTOR_METATYPE_ALBUM, T},
+ {"TPA", EXTRACTOR_METATYPE_DISC_NUMBER, T},
+ {"TRK", EXTRACTOR_METATYPE_TRACK_NUMBER, T},
+ {"TRC", EXTRACTOR_METATYPE_ISRC, T},
+ {"TYE", EXTRACTOR_METATYPE_PUBLICATION_YEAR, T},
+ /*
+ FIXME: these two and TYE should be combined into
+ the actual publication date (if TRD is missing)
+ {"TDA", EXTRACTOR_METATYPE_PUBLICATION_DATE},
+ {"TIM", EXTRACTOR_METATYPE_PUBLICATION_DATE},
+ */
+ {"TRD", EXTRACTOR_METATYPE_CREATION_TIME, T},
+ {"TMT", EXTRACTOR_METATYPE_SOURCE, T},
+ {"TFT", EXTRACTOR_METATYPE_FORMAT_VERSION, T},
+ {"TBP", EXTRACTOR_METATYPE_BEATS_PER_MINUTE, T},
+ {"TCR", EXTRACTOR_METATYPE_COPYRIGHT, T},
+ {"TPB", EXTRACTOR_METATYPE_PUBLISHER, T},
+ {"TEN", EXTRACTOR_METATYPE_ENCODED_BY, T},
+ {"TSS", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE, T},
+ {"TOF", EXTRACTOR_METATYPE_FILENAME, T},
+ {"TLE", EXTRACTOR_METATYPE_DURATION, T}, /* FIXME: should append 'ms' as unit */
+ {"TSI", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE, T},
+ /* skipping TDY, TKE */
+ {"TOT", EXTRACTOR_METATYPE_ORIGINAL_TITLE, T},
+ {"TOA", EXTRACTOR_METATYPE_ORIGINAL_ARTIST, T},
+ {"TOL", EXTRACTOR_METATYPE_ORIGINAL_WRITER, T},
+ {"TOR", EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR, T},
+ /* skipping TXX */
+
+ {"WAF", EXTRACTOR_METATYPE_URL, U},
+ {"WAR", EXTRACTOR_METATYPE_URL, U},
+ {"WAS", EXTRACTOR_METATYPE_URL, U},
+ {"WCM", EXTRACTOR_METATYPE_URL, U},
+ {"WCP", EXTRACTOR_METATYPE_RIGHTS, U},
+ {"WCB", EXTRACTOR_METATYPE_URL, U},
+ /* skipping WXX */
+ {"IPL", EXTRACTOR_METATYPE_CONTRIBUTOR_NAME, T},
+ /* skipping MCI */
+ /* skipping ETC */
+ /* skipping MLL */
+ /* skipping STC */
+ {"ULT", EXTRACTOR_METATYPE_LYRICS, UL},
+ {"SLT", EXTRACTOR_METATYPE_LYRICS, SL},
+ {"COM", EXTRACTOR_METATYPE_COMMENT, L},
+ /* skipping RVA */
+ /* skipping EQU */
+ /* skipping REV */
+ {"PIC", EXTRACTOR_METATYPE_PICTURE, I},
+ /* skipping GEN */
+ /* {"CNT", EXTRACTOR_METATYPE_PLAY_COUNTER, XXX}, */
+ /* {"POP", EXTRACTOR_METATYPE_POPULARITY_METER, XXX}, */
+ /* skipping BUF */
+ /* skipping CRM */
+ /* skipping CRA */
+ /* {"LNK", EXTRACTOR_METATYPE_URL, XXX}, */
{NULL, 0},
};
@@ -83,6 +128,9 @@
int unsync;
unsigned int tsize;
unsigned int pos;
+ unsigned int off;
+ enum EXTRACTOR_MetaType type;
+ const char *mime;
if ((size < 16) ||
(data[0] != 0x49) ||
@@ -102,10 +150,10 @@
size_t csize;
int i;
- if (pos + 6 > tsize)
+ if (pos + 7 > tsize)
return 0;
csize = (data[pos + 3] << 16) + (data[pos + 4] << 8) + data[pos + 5];
- if ((pos + 6 + csize > tsize) || (csize > tsize) || (csize == 0))
+ if ((pos + 7 + csize > tsize) || (csize > tsize) || (csize == 0))
break;
i = 0;
while (tmap[i].text != NULL)
@@ -116,33 +164,190 @@
/* this byte describes the encoding
try to convert strings to UTF-8
if it fails, then forget it */
- switch (data[pos + 6])
+ switch (tmap[i].fmt)
+ {
+ case T:
+ switch (data[pos + 6])
+ {
+ case 0x00:
+ word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + 7],
+ csize - 1,
"ISO-8859-1");
+ break;
+ case 0x01:
+ word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + 7],
+ csize - 1,
"UCS-2");
+ break;
+ default:
+ /* bad encoding byte,
+ try to convert from iso-8859-1 */
+ word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + 7],
+ csize - 1,
"ISO-8859-1");
+ break;
+ }
+ break;
+ case U:
+ word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + 6],
+ csize, "ISO-8859-1");
+ break;
+ case UL:
+ if (csize < 6)
+ return 0; /* malformed */
+ /* find end of description */
+ off = 10;
+ while ( (off < size) &&
+ (off - pos < csize) &&
+ (data[pos + off] == '\0') )
+ off++;
+ if ( (off >= csize) ||
+ (data[pos+off] != '\0') )
+ return 0; /* malformed */
+ off++;
+ switch (data[pos + 6])
+ {
+ case 0x00:
+ word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + off],
+ csize - off,
"ISO-8859-1");
+ break;
+ case 0x01:
+ word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + off],
+ csize - off,
"UCS-2");
+ break;
+ default:
+ /* bad encoding byte,
+ try to convert from iso-8859-1 */
+ word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + off],
+ csize - off,
"ISO-8859-1");
+ break;
+ }
+ break;
+ case SL:
+ if (csize < 7)
+ return 0; /* malformed */
+ /* find end of description */
+ switch (data[pos + 6])
+ {
+ case 0x00:
+ word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + 12],
+ csize - 6,
"ISO-8859-1");
+ break;
+ case 0x01:
+ word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + 12],
+ csize - 6,
"UCS-2");
+ break;
+ default:
+ /* bad encoding byte,
+ try to convert from iso-8859-1 */
+ word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + 12],
+ csize - 6,
"ISO-8859-1");
+ break;
+ }
+ break;
+ case L:
+ if (csize < 5)
+ return 0; /* malformed */
+ /* find end of description */
+ switch (data[pos + 6])
+ {
+ case 0x00:
+ word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + 10],
+ csize - 4,
"ISO-8859-1");
+ break;
+ case 0x01:
+ word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + 10],
+ csize - 4,
"UCS-2");
+ break;
+ default:
+ /* bad encoding byte,
+ try to convert from iso-8859-1 */
+ word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + 10],
+ csize - 4,
"ISO-8859-1");
+ break;
+ }
+ break;
+ case I:
+ if (csize < 6)
+ return 0; /* malformed */
+ /* find end of description */
+ off = 12;
+ while ( (off < size) &&
+ (off - pos < csize) &&
+ (data[pos + off] == '\0') )
+ off++;
+ if ( (off >= csize) ||
+ (data[pos+off] != '\0') )
+ return 0; /* malformed */
+ off++;
+ switch (data[pos+11])
+ {
+ case 0x03:
+ case 0x04:
+ type = EXTRACTOR_METATYPE_COVER_PICTURE;
+ break;
+ case 0x07:
+ case 0x08:
+ case 0x09:
+ case 0x0A:
+ case 0x0B:
+ case 0x0C:
+ type = EXTRACTOR_METATYPE_CONTRIBUTOR_PICTURE;
+ break;
+ case 0x0D:
+ case 0x0E:
+ case 0x0F:
+ type = EXTRACTOR_METATYPE_EVENT_PICTURE;
+ break;
+ case 0x14:
+ type = EXTRACTOR_METATYPE_LOGO;
+ type = EXTRACTOR_METATYPE_LOGO;
+ break;
+ default:
+ type = EXTRACTOR_METATYPE_PICTURE;
+ break;
+ }
+ if (0 == strncasecmp ("PNG",
+ (const char*) &data[pos + 7], 3))
+ mime = "image/png";
+ else if (0 == strncasecmp ("JPG",
+ (const char*) &data[pos + 7], 3))
+ mime = "image/jpeg";
+ else
+ mime = NULL;
+ if (0 == strncasecmp ("-->",
+ (const char*) &data[pos + 7], 3))
+ {
+ /* not supported */
+ }
+ else
+ {
+ if (0 != proc (proc_cls,
+ "id3v2",
+ type,
+ EXTRACTOR_METAFORMAT_BINARY,
+ mime,
+ (const char*) &data[pos + off],
+ csize + 6 - off))
+ return 1;
+ }
+ word = NULL;
+ break;
+ default:
+ return 0;
+ }
+ if ((word != NULL) && (strlen (word) > 0))
{
- case 0x00:
- word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + 7],
- csize, "ISO-8859-1");
- break;
- case 0x01:
- word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + 7],
- csize, "UCS-2");
- break;
- default:
- /* bad encoding byte,
- try to convert from iso-8859-1 */
- word = EXTRACTOR_common_convert_to_utf8 ((const char *)
&data[pos + 7],
- csize, "ISO-8859-1");
- break;
- }
- pos++;
- csize--;
- if ((word != NULL) && (strlen (word) > 0))
- {
- prev = addKeyword (prev, word, tmap[i].type);
- }
- else
- {
- free (word);
- }
+ if (0 != proc (proc_cls,
+ "id3v2",
+ type,
+ EXTRACTOR_METAFORMAT_UTF8,
+ "text/plain",
+ word,
+ strlen(word)+1))
+ {
+ free (word);
+ return 1;
+ }
+ }
+ free (word);
break;
}
i++;
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [GNUnet-SVN] r9814 - in Extractor/src: include main plugins,
gnunet <=