[GNUnet-SVN] r9746 - in Extractor: . doc src/include src/main src/plugins

gnunet-svn
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[GNUnet-SVN] r9746 - in Extractor: . doc src/include src/main src/plugins

From:	gnunet
Subject:	[GNUnet-SVN] r9746 - in Extractor: . doc src/include src/main src/plugins
Date:	Mon, 14 Dec 2009 00:02:19 +0100
Author: grothoff
Date: 2009-12-14 00:02:19 +0100 (Mon, 14 Dec 2009)
New Revision: 9746

Added:
   Extractor/src/plugins/html_extractor.c
   Extractor/src/plugins/it_extractor.c
   Extractor/src/plugins/mime_extractor.c
Removed:
   Extractor/src/include/winproc.h
   Extractor/src/main/test_binary.c
   Extractor/src/main/winproc.c
   Extractor/src/plugins/filenameextractor.c
   Extractor/src/plugins/htmlextractor.c
   Extractor/src/plugins/itextractor.c
   Extractor/src/plugins/lowerextractor.c
   Extractor/src/plugins/mimeextractor.c
   Extractor/src/plugins/splitextractor.c
Modified:
   Extractor/AUTHORS
   Extractor/ChangeLog
   Extractor/configure.ac
   Extractor/doc/extract.1
   Extractor/doc/libextractor.3
   Extractor/doc/version.texi
   Extractor/src/include/Makefile.am
   Extractor/src/include/extractor.h
   Extractor/src/main/Makefile.am
   Extractor/src/main/extract.c
   Extractor/src/main/extractor.c
   Extractor/src/main/iconv.c
   Extractor/src/plugins/Makefile.am
Log:
new API for GNU libextractor, converted first 3 plugins as well

Modified: Extractor/AUTHORS
===================================================================
--- Extractor/AUTHORS   2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/AUTHORS   2009-12-13 23:02:19 UTC (rev 9746)
@@ -1,6 +1,6 @@
 Core Team:
-Vidyut Samanta <address@hidden>
 Christian Grothoff <address@hidden>
+Nils Durner <address@hidden>
 
 Formats:
 html          - core team with code from libhtmlparse 0.1.13, 
http://msalem.translator.cx/libhtmlparse.html
@@ -55,6 +55,7 @@
 Ronan MELENNEC <address@hidden>
 Vasil Dimov <address@hidden>
 Pavol Rusnak <address@hidden>
+Vidyut Samanta <address@hidden>
 
 Translations:
 German - Karl Eichwalder <address@hidden>

Modified: Extractor/ChangeLog
===================================================================
--- Extractor/ChangeLog 2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/ChangeLog 2009-12-13 23:02:19 UTC (rev 9746)
@@ -1,3 +1,7 @@
+Sun Dec 13 16:53:35 CET 2009
+       Starting with major API breakage with the goal to fix all of
+       the not-so-nice things that have accumulated since version 0.0.0. -CG
+
 Sat Dec  5 11:32:30 CET 2009
        Adding extraction of Iptc data using exiv2.
 

Modified: Extractor/configure.ac
===================================================================
--- Extractor/configure.ac      2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/configure.ac      2009-12-13 23:02:19 UTC (rev 9746)
@@ -1,14 +1,14 @@
 # Process this file with autoconf to produce a configure script.
 AC_PREREQ(2.61)
-AC_INIT([libextractor], [0.5.23], address@hidden)
+AC_INIT([libextractor], [0.6.0], address@hidden)
 AC_CONFIG_AUX_DIR([libltdl/config])
-AM_INIT_AUTOMAKE([libextractor], [0.5.23])
+AM_INIT_AUTOMAKE([libextractor], [0.6.0])
 AC_CONFIG_HEADERS([config.h])
 AH_TOP([#define _GNU_SOURCE  1])
 
-LIB_VERSION_CURRENT=2
-LIB_VERSION_REVISION=1
-LIB_VERSION_AGE=1
+LIB_VERSION_CURRENT=3
+LIB_VERSION_REVISION=0
+LIB_VERSION_AGE=0
 AC_SUBST(LIB_VERSION_CURRENT)
 AC_SUBST(LIB_VERSION_REVISION)
 AC_SUBST(LIB_VERSION_AGE)
@@ -540,7 +540,7 @@
 if test "x$gn_cv_export_symbols_regex_works" = "xyes"
 then
  LE_LIB_LDFLAGS="$LE_LIB_LDFLAGS -export-symbols-regex 
\"EXTRACTOR_@<:@a-zA-Z0-9_@:>@*\""
- LE_PLUGIN_LDFLAGS="$LE_PLUGIN_LDFLAGS -export-symbols-regex 
\"libextractor_@<:@a-zA-Z0-9_@:>@*_extract\""
+ LE_PLUGIN_LDFLAGS="$LE_PLUGIN_LDFLAGS -export-symbols-regex 
\"EXTRACTOR_@<:@a-zA-Z0-9_@:>@*_extract\""
 fi
 AC_SUBST(LE_LIB_LDFLAGS)
 AC_SUBST(LE_PLUGIN_LDFLAGS)

Modified: Extractor/doc/extract.1
===================================================================
--- Extractor/doc/extract.1     2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/doc/extract.1     2009-12-13 23:02:19 UTC (rev 9746)
@@ -1,4 +1,4 @@
-.TH EXTRACT 1 "Dec 29, 2006" "libextractor 0.5.17"
+.TH EXTRACT 1 "Dec 14, 2009" "libextractor 0.6.0"
 .\" $Id
 .SH NAME
 extract
@@ -6,7 +6,7 @@
 .SH SYNOPSIS
 .B extract
 [
-.B \-abdfghLnrsvV
+.B \-bghLnvV
 ]
 [
 .B \-B
@@ -32,7 +32,7 @@
 \&...
 .br
 .SH DESCRIPTION
-This manual page documents version 0.5.17 of the
+This manual page documents version 0.6.0 of the
 .B extract
 command.
 .PP
@@ -46,9 +46,6 @@
 
 .SH OPTIONS
 .TP 8
-.B \-a
-Do not remove any duplicates, even if the keywords match exactly and have the 
same type (i.e. because the same keyword was found by different extractor 
libraries).
-.TP 8
 .B \-b
 Display the output in BiBTeX format. This implies the
 .B \-d
@@ -57,12 +54,6 @@
 .B \-B LANG
 Use the generic plaintext extractor for the language with the 2\-letter 
language code LANG.  Supported languages are DA (Danish), DE (German), EN 
(English), ES (Spanish), FI (Finnish), FR (French), GA (Gaelic), IT (Italian), 
NO (Norwegian) and SV (Swedish).
 .TP 8
-.B \-d
-Remove duplicates only if the types match exactly. By default, duplicates are 
removed if the types match or if one of the types is \I unknown (in this case, 
the duplicate of unknown type is removed).
-.TP 8
-.B \-f
-add the filename(s) (without directory) to the list of keywords.
-.TP 8
 .B \-g
 Use grep\-friendly output (all keywords on a single line for each file).  Use 
the verbose option to print the filename first, followed by the keywords.  Use 
the verbose option twice to also display the keyword types.  This option will 
not print keyword types or non\-textual metadata.
 .TP 8
@@ -78,12 +69,6 @@
 .B \-n
 Do not use the default set of extractors (typically all standard extractors, 
currently mp3, ogg, jpg, gif, png, tiff, real, html, pdf and mime\-types), use 
only the extractors specified with the .B \-l option.
 .TP 8
-.B \-r
-Remove all duplicates disregarding differences in the keyword type.
-.TP 8
-.B \-s
-Split keywords at delimiters (space, comma, colon, etc.) and list split 
keywords to be of .I unknown type. This can also be done by loading the 
split\-library. Using this option guarantees that the splitting is performed 
after all other libraries have been run. It is always performed before 
duplicate elimination.
-.TP 8
 .B \-v
 Print the version number and exit.
 .TP 8
@@ -111,10 +96,9 @@
 comment \- (C) 2001 by Christian Grothoff, using gimp 1.2 1
 mimetype \- image/jpeg
 
-$ extract \-Vf \-x comment test/test.jpg
+$ extract \-V \-x comment test/test.jpg
 Keywords for file test/test.jpg:
 mimetype \- image/jpeg
-filename \- test.jpg
 
 $ extract \-p comment test/test.jpg
 comment \- (C) 2001 by Christian Grothoff, using gimp 1.2 1
@@ -125,7 +109,7 @@
 comment \- Testing keyword extraction
 
 .SH LEGAL NOTICE
-libextractor and the extract tool are released under the GPL.  libextractor is 
a GNU project.
+libextractor and the extract tool are released under the GPL.  libextractor is 
a GNU package.
 
 .SH BUGS
 A couple of file\-formats (on the order of 10^3) are not recognized...
@@ -138,4 +122,4 @@
 
 .SH AVAILABILITY
 You can obtain the original author's latest version from
-http://gnunet.org/libextractor/
+http://www.gnu.org/software/libextractor/

Modified: Extractor/doc/libextractor.3
===================================================================
--- Extractor/doc/libextractor.3        2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/doc/libextractor.3        2009-12-13 23:02:19 UTC (rev 9746)
@@ -1,75 +1,55 @@
-.TH LIBEXTRACTOR 3 "Jul 14, 2005"
+.TH LIBEXTRACTOR 3 "Dec 14, 2009"
 .SH NAME
-libextractor \- meta\-information extraction library 0.5.11
+libextractor \- meta\-information extraction library 0.6.0
 .SH SYNOPSIS
 
 \fB#include <extractor.h>
 
- \fBtypedef struct EXTRACTOR_Keywords {
-   char * \fIkeyword\fB;
-   EXTRACTOR_KeywordType \fIkeywordType\fB;
-   struct EXTRACTOR_Keywords * \fInext\fB;
- } EXTRACTOR_KeywordList;\FB
+\fBconst char *EXTRACTOR_metatype_to_string(enum EXTRACTOR_MetaType 
\fItype\fB);
 
+\fBconst char *EXTRACTOR_metatype_to_description(enum EXTRACTOR_MetaType 
\fItype\fB);
 
- \fBEXTRACTOR_ExtractorList * EXTRACTOR_loadDefaultLibraries ();
+\fBenum EXTRACTOR_MetaType EXTRACTOR_metatype_get_max (void);
 
- \fBconst char * EXTRACTOR_getKeywordTypeAsString (const EXTRACTOR_KeywordType 
\fItype\fB);
+\fBstruct EXTRACTOR_PluginList *EXTRACTOR_plugin_add_defaults(enum 
EXTRACTOR_Options \fIflags\fB);
 
- \fBEXTRACTOR_ExtractorList * EXTRACTOR_loadConfigLibraries 
(EXTRACTOR_ExtractorList * \fIprev\fB, const char * \fIconfig\fB);
+\fBstruct EXTRACTOR_PluginList *EXTRACTOR_plugin_add (struct 
EXTRACTOR_PluginList * \fIprev\fB, const char * \fIlibrary\fB, const char * 
\fIoptions\fB, enum EXTRACTOR_Options \fIflags\fB);
 
- \fBEXTRACTOR_ExtractorList * EXTRACTOR_addLibrary (EXTRACTOR_ExtractorList * 
\fIprev\fB, const char * \fIlibrary\fB);
 
- \fBEXTRACTOR_ExtractorList * EXTRACTOR_addLibraryLast 
(EXTRACTOR_ExtractorList * \fIprev\fB, const char * \fIlibrary\fB);
+\fBstruct EXTRACTOR_PluginList *EXTRACTOR_plugin_add_last(struct 
EXTRACTOR_PluginList *\fIprev\fB, const char *\fIlibrary\fB, const char 
*\fIoptions\fB, enum EXTRACTOR_Options \fIflags\fB);
 
- \fBEXTRACTOR_ExtractorList * EXTRACTOR_removeLibrary (EXTRACTOR_ExtractorList 
* \fIprev\fB, const char * \fIlibrary\fB);
+\fBstruct EXTRACTOR_PluginList *EXTRACTOR_plugin_add_config (struct 
EXTRACTOR_PluginList * \fIprev\fB, const char *\fIconfig\fB, enum 
EXTRACTOR_Options \fIflags\fB);
+               
+\fBstruct EXTRACTOR_PluginList *EXTRACTOR_plugin_remove(struct 
EXTRACTOR_PluginList * \fIprev\fB, const char * \fIlibrary\fB);
 
- \fBvoid EXTRACTOR_removeAll (EXTRACTOR_ExtractorList * \fIprev\fB);
+\fBvoid EXTRACTOR_plugin_remove_all(struct EXTRACTOR_PluginList 
*\fIplugins\fB);
 
- \fBEXTRACTOR_KeywordList * EXTRACTOR_getKeywords (EXTRACTOR_ExtractorList * 
\fIextractor\fB, const char * \fIfilename\fB);
+\fBvoid EXTRACTOR_extract(struct EXTRACTOR_PluginList *\fIplugins\fB, const 
char *\fIfilename\fB, const void *\fIdata\fB, size_t \fIsize\fB, 
EXTRACTOR_MetaDataProcessor \fIproc\fB, void *\fIproc_cls\fB);
 
- \fBEXTRACTOR_KeywordList * EXTRACTOR_getKeywords (EXTRACTOR_ExtractorList * 
\fIextractor\fB, const char * \fIdata\fB, size_t \fIsize\fB);
+\fBint EXTRACTOR_meta_data_print(void * \fIhandle\fB, const char 
*\fIplugin_name\fB, enum EXTRACTOR_MetaType \fItype\fB, enum 
EXTRACTOR_MetaFormat \fIformat\fB, const char *\fIdata_mime_type\fB, const char 
*\fIdata\fB, size_t \fIdata_len\fB);
 
- \fBEXTRACTOR_KeywordList * EXTRACTOR_removeEmptyKeywords 
(EXTRACTOR_KeywordList * \fIlist\fB);
+\fBEXTRACTOR_VERSION
 
- \fBEXTRACTOR_KeywordList * EXTRACTOR_removeDuplicateKeywords 
(EXTRACTOR_KeywordList * \fIlist\fB, const unsigned int \fIoptions\fB);
-
- \fBvoid EXTRACTOR_printKeywords (FILE * \fIhandle\fB, EXTRACTOR_KeywordList * 
\fIkeywords\fB);
-
- \fBvoid EXTRACTOR_freeKeywords (EXTRACTOR_KeywordList * \fIkeywords\fB);
-
- \fBconst char * EXTRACTOR_extractLast (const EXTRACTOR_KeywordType * 
\fItype\fB, EXTRACTOR_KeywordList * \fIkeywords\fB);
-
- \fBconst char * EXTRACTOR_extractLastByString (const char * \fItype\fB, 
EXTRACTOR_KeywordList * \fIkeywords\fB);
-
- \fBunsigned int EXTRACTOR_countKeywords (EXTRACTOR_KeywordList * 
\fIkeywords\fB);
-
- \fBEXTRACTOR_DEFAULT_LIBRARIES
-
- \fBEXTRACTOR_VERSION
-
 .SH DESCRIPTION
 .P
-libextractor is a simple library for keyword extraction.  libExtractor does 
not support all formats but supports a simple plugging mechanism such that you 
can quickly add extractors for additional formats, even without recompiling 
libExtractor.  libExtractor typically ships with one or more helper-libraries 
that can be used to obtain keywords from common file-types.  If you want to 
write your own extractor for some filetype, all you need to do is write a 
little library that implements a single method with this signature:
+GNU libextractor is a simple library for keyword extraction.  libextractor 
does not support all formats but supports a simple plugging mechanism such that 
you can quickly add extractors for additional formats, even without recompiling 
libextractor.  libextractor typically ships with dozens of plugins that can be 
used to obtain meta data from common file-types.  If you want to write your own 
plugin for some filetype, all you need to do is write a little library that 
implements a single method with this signature:
 
- \fBEXTRACTOR_KeywordList * LIBRARYNAME_extract(const char * \fIfilename\fB,
-                                             char * \fIdata\fB,
-                                             size_t \fIsize\fB,
-                                             EXTRACTOR_KeywordList * 
\fIprev\fB);
+ \fBint EXTRACTOR_name_extract(const char *\fIdata\fB, size_t \fIdatasize\fB, 
EXTRACTOR_MetaDataProcessor \fIproc\fB, void *\fIproc_cls\fB, const char 
*\fIoptions\fB);
 
 .P
-The filename is the name of the file, data is a pointer to the contents of the 
file and size is the size of the file.  The extract method must prepend 
keywords that it finds to the linked list 'prev' and return the new head. The 
library must allocate (malloc) the entry in the keyword list and the memory for 
the filename since both will be free'ed by libExtractor once the application 
calls freeKeywords. An example implementation can be found in 
\fImp3extractor.c\fP.  The application extract gives an example how to use 
libExtractor.
-
+Data is a pointer to the contents of the file and datasize is the size of 
data.  The extract method must call proc for meta data that it finds.  The 
interpretation of options is up to the plugin.  The function should return 0 if 
'proc' always returned 0, otherwise 1.  After 'proc' returned a non-zero value, 
proc should not be called again. An example implementation can be found in 
\fIhtml_extractor.c\fP.  Plugins should be automatically found and used once 
they are installed in the respective directory (typically something like 
/usr/lib/libextractor/).  
 .P
-The basic use of libextractor is to load the plugins (for example with 
\fBEXTRACTOR_loadDefaultLibraries\fP), then to extract the keyword list using 
\fBEXTRACTOR_getKeywords\fP, processing the list (using application specific 
code and possibly some of the postprocessing convenience functions like 
\fBEXTRACTOR_removeDuplicateKeywords\fP), freeing the keyword list (using 
\fBEXTRACTOR_freeKeywords\fP) and finally unloading the plugins (with 
\fBEXTRACTOR_removeAll\fP).
+The application extract gives an example how to use libextractor.
 .P
-The keywords obtained from libextractor are supposed to be UTF-8 encoded.  The 
EXTRACTOR_printKeywords function converts the UTF-8 keywords to the character 
set from the current locale before printing them.  Plugins are supposed to 
convert meta-data to UTF-8 if necessary.  
+The basic use of libextractor is to load the plugins (for example with 
\fBEXTRACTOR_plugin_add_defaults\fP), then to extract the meta data using 
\fBEXTRACTOR_extract\fP, and finally to unload the plugins (with 
\fBEXTRACTOR_plugin_remove_all\fP).
 .P
+Textual meta data obtained from libextractor is supposed to be UTF-8 encoded 
if the text encoding is known.  Plugins are supposed to convert meta-data to 
UTF-8 if necessary.    The EXTRACTOR_meta_data_print function converts the 
UTF-8 keywords to the character set from the current locale before printing 
them.  
+.P
 .SH "SEE ALSO"
 extract(1)
 
 .SH LEGAL NOTICE
-libextractor is released under the GPL and a GNU project (http://www.gnu.org/).
+libextractor is released under the GPL and a GNU package (http://www.gnu.org/).
 
 .SH BUGS
 A couple of file-formats (on the order of 10^3) are not recognized...
@@ -78,4 +58,4 @@
 extract was originally written by Christian Grothoff <address@hidden> and 
Vidyut Samanta <address@hidden>. Use <address@hidden> to contact the current 
maintainer(s).
 
 .SH AVAILABILITY
-You can obtain the original author's latest version from 
http://gnunet.org/libextractor/.
+You can obtain the original author's latest version from 
http://www.gnu.org/software/libextractor/.

Modified: Extractor/doc/version.texi
===================================================================
--- Extractor/doc/version.texi  2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/doc/version.texi  2009-12-13 23:02:19 UTC (rev 9746)
@@ -1,4 +1,4 @@
 @set UPDATED 1 October 2009
 @set UPDATED-MONTH October 2009
-@set EDITION 0.5.23
-@set VERSION 0.5.23
+@set EDITION 0.6.0
+@set VERSION 0.6.0

Modified: Extractor/src/include/Makefile.am
===================================================================
--- Extractor/src/include/Makefile.am   2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/src/include/Makefile.am   2009-12-13 23:02:19 UTC (rev 9746)
@@ -3,6 +3,5 @@
   extractor.h 
 EXTRA_DIST = \
   plibc.h \
-  winproc.h \
   platform.h \
   gettext.h

Modified: Extractor/src/include/extractor.h
===================================================================
--- Extractor/src/include/extractor.h   2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/src/include/extractor.h   2009-12-13 23:02:19 UTC (rev 9746)
@@ -1,6 +1,6 @@
 /*
      This file is part of libextractor.
-     (C) 2002, 2003, 2004, 2005, 2006 Vidyut Samanta and Christian Grothoff
+     (C) 2002, 2003, 2004, 2005, 2006, 2009 Vidyut Samanta and Christian 
Grothoff
 
      libextractor is free software; you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published
@@ -32,432 +32,503 @@
  * 0.2.6-1 => 0x00020601
  * 4.5.2-0 => 0x04050200
  */
-#define EXTRACTOR_VERSION 0x00052301
+#define EXTRACTOR_VERSION 0x00060000
 
 #include <stdio.h>
 
-/* ignore the 'type' of the keyword when eliminating duplicates */
-#define EXTRACTOR_DUPLICATES_TYPELESS 1
-/* remove type 'UNKNOWN' if there is a duplicate keyword of
-   known type, even if usually different types should be
-   preserved */
-#define EXTRACTOR_DUPLICATES_REMOVE_UNKNOWN 2
 
-#define EXTRACTOR_DEFAULT_LIBRARIES EXTRACTOR_getDefaultLibraries()
-
-const char * EXTRACTOR_getDefaultLibraries(void);
-
 /**
- * Enumeration defining various sources of keywords.
- * See also
- * http://dublincore.org/documents/1998/09/dces/
+ * Options for how plugin execution should be done.
  */
-typedef enum {
-  EXTRACTOR_UNKNOWN = 0,
-  EXTRACTOR_FILENAME = 1,
-  EXTRACTOR_MIMETYPE = 2,
-  EXTRACTOR_TITLE = 3,
-  EXTRACTOR_AUTHOR = 4,
-  EXTRACTOR_ARTIST = 5,
-  EXTRACTOR_DESCRIPTION = 6,
-  EXTRACTOR_COMMENT = 7,
-  EXTRACTOR_DATE = 8,
-  EXTRACTOR_PUBLISHER = 9,
-  EXTRACTOR_LANGUAGE = 10,
-  EXTRACTOR_ALBUM = 11,
-  EXTRACTOR_GENRE = 12,
-  EXTRACTOR_LOCATION = 13,
-  EXTRACTOR_VERSIONNUMBER = 14,
-  EXTRACTOR_ORGANIZATION = 15,
-  EXTRACTOR_COPYRIGHT = 16,
-  EXTRACTOR_SUBJECT = 17,
-  EXTRACTOR_KEYWORDS = 18,
-  EXTRACTOR_CONTRIBUTOR = 19,
-  EXTRACTOR_RESOURCE_TYPE = 20,
-  EXTRACTOR_FORMAT = 21,
-  EXTRACTOR_RESOURCE_IDENTIFIER = 22,
-  EXTRACTOR_SOURCE = 23,
-  EXTRACTOR_RELATION = 24,
-  EXTRACTOR_COVERAGE = 25,
-  EXTRACTOR_SOFTWARE = 26,
-  EXTRACTOR_DISCLAIMER = 27,
-  EXTRACTOR_WARNING = 28,
-  EXTRACTOR_TRANSLATED = 29,
-  EXTRACTOR_CREATION_DATE = 30,
-  EXTRACTOR_MODIFICATION_DATE = 31,
-  EXTRACTOR_CREATOR = 32,
-  EXTRACTOR_PRODUCER = 33,
-  EXTRACTOR_PAGE_COUNT = 34,
-  EXTRACTOR_PAGE_ORIENTATION = 35,
-  EXTRACTOR_PAPER_SIZE = 36,
-  EXTRACTOR_USED_FONTS = 37,
-  EXTRACTOR_PAGE_ORDER = 38,
-  EXTRACTOR_CREATED_FOR = 39,
-  EXTRACTOR_MAGNIFICATION = 40,
-  EXTRACTOR_RELEASE = 41,
-  EXTRACTOR_GROUP = 42,
-  EXTRACTOR_SIZE = 43,
-  EXTRACTOR_SUMMARY = 44,
-  EXTRACTOR_PACKAGER = 45,
-  EXTRACTOR_VENDOR = 46,
-  EXTRACTOR_LICENSE = 47,
-  EXTRACTOR_DISTRIBUTION = 48,
-  EXTRACTOR_BUILDHOST = 49,
-  EXTRACTOR_OS = 50,
-  EXTRACTOR_DEPENDENCY = 51,
-  EXTRACTOR_HASH_MD4 = 52,
-  EXTRACTOR_HASH_MD5 = 53,
-  EXTRACTOR_HASH_SHA0 = 54,
-  EXTRACTOR_HASH_SHA1 = 55,
-  EXTRACTOR_HASH_RMD160 = 56,
-  EXTRACTOR_RESOLUTION = 57,
-  EXTRACTOR_CATEGORY = 58,
-  EXTRACTOR_BOOKTITLE = 59,
-  EXTRACTOR_PRIORITY = 60,
-  EXTRACTOR_CONFLICTS = 61,
-  EXTRACTOR_REPLACES = 62,
-  EXTRACTOR_PROVIDES = 63,
-  EXTRACTOR_CONDUCTOR = 64,
-  EXTRACTOR_INTERPRET = 65,
-  EXTRACTOR_OWNER = 66,
-  EXTRACTOR_LYRICS = 67,
-  EXTRACTOR_MEDIA_TYPE = 68,
-  EXTRACTOR_CONTACT = 69,
-  EXTRACTOR_THUMBNAIL_DATA = 70,
-  EXTRACTOR_PUBLICATION_DATE = 71,
-  EXTRACTOR_CAMERA_MAKE = 72,
-  EXTRACTOR_CAMERA_MODEL = 73,
-  EXTRACTOR_EXPOSURE = 74,
-  EXTRACTOR_APERTURE = 75,
-  EXTRACTOR_EXPOSURE_BIAS = 76,
-  EXTRACTOR_FLASH = 77,
-  EXTRACTOR_FLASH_BIAS = 78,
-  EXTRACTOR_FOCAL_LENGTH = 79,
-  EXTRACTOR_FOCAL_LENGTH_35MM = 80,
-  EXTRACTOR_ISO_SPEED = 81,
-  EXTRACTOR_EXPOSURE_MODE = 82,
-  EXTRACTOR_METERING_MODE = 83,
-  EXTRACTOR_MACRO_MODE = 84,
-  EXTRACTOR_IMAGE_QUALITY = 85,
-  EXTRACTOR_WHITE_BALANCE = 86,
-  EXTRACTOR_ORIENTATION = 87,
-  EXTRACTOR_TEMPLATE = 88,
-  EXTRACTOR_SPLIT = 89,
-  EXTRACTOR_PRODUCTVERSION = 90,
-  EXTRACTOR_LAST_SAVED_BY = 91,
-  EXTRACTOR_LAST_PRINTED = 92,
-  EXTRACTOR_WORD_COUNT = 93,
-  EXTRACTOR_CHARACTER_COUNT = 94,
-  EXTRACTOR_TOTAL_EDITING_TIME = 95,
-  EXTRACTOR_THUMBNAILS = 96,
-  EXTRACTOR_SECURITY = 97,
-  EXTRACTOR_CREATED_BY_SOFTWARE = 98,
-  EXTRACTOR_MODIFIED_BY_SOFTWARE = 99,
-  EXTRACTOR_REVISION_HISTORY = 100,
-  EXTRACTOR_LOWERCASE = 101,
-  EXTRACTOR_COMPANY = 102,
-  EXTRACTOR_GENERATOR = 103,
-  EXTRACTOR_CHARACTER_SET = 104,
-  EXTRACTOR_LINE_COUNT = 105,
-  EXTRACTOR_PARAGRAPH_COUNT = 106,
-  EXTRACTOR_EDITING_CYCLES = 107,
-  EXTRACTOR_SCALE = 108,
-  EXTRACTOR_MANAGER = 109,
-  EXTRACTOR_MOVIE_DIRECTOR = 110,
-  EXTRACTOR_DURATION = 111,
-  EXTRACTOR_INFORMATION = 112,
-  EXTRACTOR_FULL_NAME = 113,
-  EXTRACTOR_CHAPTER = 114,
-  EXTRACTOR_YEAR = 115,
-  EXTRACTOR_LINK = 116,
-  EXTRACTOR_MUSIC_CD_IDENTIFIER = 117,
-  EXTRACTOR_PLAY_COUNTER = 118,
-  EXTRACTOR_POPULARITY_METER = 119,
-  EXTRACTOR_CONTENT_TYPE = 120,
-  EXTRACTOR_ENCODED_BY = 121,
-  EXTRACTOR_TIME = 122,
-  EXTRACTOR_MUSICIAN_CREDITS_LIST = 123,
-  EXTRACTOR_MOOD = 124, 
-  EXTRACTOR_FORMAT_VERSION = 125,
-  EXTRACTOR_TELEVISION_SYSTEM = 126,
-  EXTRACTOR_SONG_COUNT = 127,
-  EXTRACTOR_STARTING_SONG = 128,
-  EXTRACTOR_HARDWARE_DEPENDENCY = 129,
-  EXTRACTOR_RIPPER = 130,
-  EXTRACTOR_FILE_SIZE = 131,
-  EXTRACTOR_TRACK_NUMBER = 132,
-  EXTRACTOR_ISRC = 133,
-  EXTRACTOR_DISC_NUMBER = 134,
-  EXTRACTOR_GNUNET_DISPLAY_TYPE = 135,
-  EXTRACTOR_GNUNET_ECBC_URI = 136,
-  EXTRACTOR_GNUNET_FULL_DATA = 137,
-  EXTRACTOR_LOCATION_CITY = 138,
-  EXTRACTOR_LOCATION_COUNTRY = 139,
-  EXTRACTOR_LOCATION_SUBLOCATION = 140,
-  EXTRACTOR_GPS_LATITUDE_REF = 141,
-  EXTRACTOR_GPS_LATITUDE = 142,
-  EXTRACTOR_GPS_LONGITUDE_REF = 143,
-  EXTRACTOR_GPS_LONGITUDE = 144,
-  EXTRACTOR_RATING = 145,
-  EXTRACTOR_COUNTRY_CODE = 146
-} EXTRACTOR_KeywordType;
+enum EXTRACTOR_Options
+  {
+    /**
+     * Run plugins in-process.
+     */
+    EXTRACTOR_OPTION_NONE = 0,
 
-/**
- * Test if a given LE type contains binary data.
- */
-#define EXTRACTOR_isBinaryType(type) (type == EXTRACTOR_THUMBNAIL_DATA) 
+    /**
+     * Run plugins out-of-process, starting the process
+     * once at the time the plugin is loaded.  This will
+     * prevent the main process crashing if a plugin dies.
+     * Ignored on platforms where out-of-process starts
+     * are not supported.
+     */
+    EXTRACTOR_OPTION_OUT_OF_PROCESS = 1,
 
-/**
- * A linked list of keywords. This structure is passed around
- * in libExtractor and is typically the result of any keyword
- * extraction operation.
- * <p>
- * Each entry in the keyword list consists of a string (the
- * keyword) and the keyword type (of type KeywordType)
- * describing how/from where the keyword was obtained.
- */
-typedef struct EXTRACTOR_Keywords {
-  /* the keyword that was found */
-  char * keyword;
-  /* the type of the keyword (classification) */
-  EXTRACTOR_KeywordType keywordType;
-  /* the next entry in the list */
-  struct EXTRACTOR_Keywords * next;
-} EXTRACTOR_KeywordList;
+    /**
+     * If a plugin crashes, automatically restart the respective
+     * process for the next file.  Implies
+     * EXTRACTOR_OPTION_OUT_OF_PROCESS.
+     */
+    EXTRACTOR_OPTION_AUTO_RESTART = 2
 
-/**
- * Signature of the extract method that each plugin
- * must provide.
- *
- * @param filename MAYBE NULL (!)
- * @param data must not be modified (!)
- */
-typedef EXTRACTOR_KeywordList *
-(*ExtractMethod)(const char * filename,
-                char * data,
-                size_t filesize,
-                EXTRACTOR_KeywordList * next,
-                const char * options);
+  };
 
+
 /**
- * Linked list of extractor helper-libraries. An application
- * builds this list by telling libextractor to load various
- * keyword-extraction libraries. Libraries can also be unloaded
- * (removed from this list, see removeLibrary).
- * <p>
- * Client code should never be concerned with the internals of
- * this struct.
+ * Format in which the extracted meta data is presented.
  */
-typedef struct EXTRACTOR_Extractor {
-  void * libraryHandle;
-  char * libname;
-  ExtractMethod extractMethod;
-  struct EXTRACTOR_Extractor * next;
-  char * options;
-} EXTRACTOR_ExtractorList;
+enum EXTRACTOR_MetaFormat
+  {
+    /**
+     * Format is unknown.
+     */
+    EXTRACTOR_METAFORMAT_UNKNOWN = 0,
 
+    /**
+     * 0-terminated, UTF-8 encoded string.  "data_len"
+     * is strlen(data)+1.
+     */
+    EXTRACTOR_METAFORMAT_UTF8 = 1,
+
+    /**
+     * Some kind of binary format, see given Mime type.
+     */
+    EXTRACTOR_METAFORMAT_BINARY = 2,
+
+    /**
+     * 0-terminated string.  The specific encoding is unknown.
+     * "data_len" is strlen(data)+1.
+     */
+    EXTRACTOR_METAFORMAT_C_STRING = 3
+  };
+
+
 /**
- * Load the default set of libraries.
- * @return the default set of libraries.
+ * Enumeration defining various sources of keywords.  See also
+ * http://dublincore.org/documents/1998/09/dces/
  */
-EXTRACTOR_ExtractorList * EXTRACTOR_loadDefaultLibraries(void);
+enum EXTRACTOR_MetaType 
+  {
+    /* fundamental types */
+    EXTRACTOR_METATYPE_RESERVED = 0,
+    EXTRACTOR_METATYPE_MIMETYPE = 1,
+    EXTRACTOR_METATYPE_FILENAME = 2,
+    EXTRACTOR_METATYPE_COMMENT = 3,
 
+    /* Standard types from bibtex */
+    EXTRACTOR_METATYPE_TITLE = 4,
+    EXTRACTOR_METATYPE_BOOK_TITLE = 5,
+    EXTRACTOR_METATYPE_BOOK_EDITION = 6,
+    EXTRACTOR_METATYPE_BOOK_CHAPTER_NUMBER = 7,
+    EXTRACTOR_METATYPE_JOURNAL_NAME = 8,
+    EXTRACTOR_METATYPE_JOURNAL_VOLUME = 9,    
+    EXTRACTOR_METATYPE_JOURNAL_NUMBER = 10,
+    EXTRACTOR_METATYPE_PAGE_COUNT = 11,
+    EXTRACTOR_METATYPE_PAGE_RANGE = 12,
+    EXTRACTOR_METATYPE_AUTHOR_NAME = 13,
+    EXTRACTOR_METATYPE_AUTHOR_EMAIL = 14,
+    EXTRACTOR_METATYPE_AUTHOR_INSTITUTION = 15,
+    EXTRACTOR_METATYPE_PUBLISHER = 16,
+    EXTRACTOR_METATYPE_PUBLISHER_ADDRESS = 17,
+    EXTRACTOR_METATYPE_PUBLISHER_INSTITUTION = 18,
+    EXTRACTOR_METATYPE_PUBLISHER_SERIES = 19,
+    EXTRACTOR_METATYPE_PUBLICATION_TYPE = 20,
+    EXTRACTOR_METATYPE_PUBLICATION_YEAR = 21,
+    EXTRACTOR_METATYPE_PUBLICATION_MONTH = 22,
+    EXTRACTOR_METATYPE_PUBLICATION_DAY = 23,
+    EXTRACTOR_METATYPE_PUBLICATION_DATE = 24,
+    EXTRACTOR_METATYPE_BIBTEX_EPRINT = 25,
+    EXTRACTOR_METATYPE_BIBTEX_ENTRY_TYPE = 26,
+    EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE = 27,
+    EXTRACTOR_METATYPE_CREATION_TIME = 28,
+    EXTRACTOR_METATYPE_URL = 29,
+
+    /* "unique" document identifiers */
+    EXTRACTOR_METATYPE_URI = 30, 
+    EXTRACTOR_METATYPE_ISRC = 31,
+    EXTRACTOR_METATYPE_HASH_MD4 = 32,
+    EXTRACTOR_METATYPE_HASH_MD5 = 33,
+    EXTRACTOR_METATYPE_HASH_SHA0 = 34,
+    EXTRACTOR_METATYPE_HASH_SHA1 = 35,
+    EXTRACTOR_METATYPE_HASH_RMD160 = 36,
+
+    /* identifiers of a location */
+    EXTRACTOR_METATYPE_GPS_LATITUDE_REF = 37,
+    EXTRACTOR_METATYPE_GPS_LATITUDE = 38,
+    EXTRACTOR_METATYPE_GPS_LONGITUDE_REF = 39,
+    EXTRACTOR_METATYPE_GPS_LONGITUDE = 40,
+    EXTRACTOR_METATYPE_LOCATION_CITY = 41,
+    EXTRACTOR_METATYPE_LOCATION_SUBLOCATION = 42,
+    EXTRACTOR_METATYPE_LOCATION_COUNTRY = 43,
+    EXTRACTOR_METATYPE_LOCATION_COUNTRY_CODE = 44,
+
+    /* generic attributes */
+    EXTRACTOR_METATYPE_UNKNOWN = 45,
+    EXTRACTOR_METATYPE_DESCRIPTION = 46,
+    EXTRACTOR_METATYPE_COPYRIGHT = 47,
+    EXTRACTOR_METATYPE_RIGHTS = 48,
+    EXTRACTOR_METATYPE_KEYWORDS = 49,
+    EXTRACTOR_METATYPE_ABSTRACT = 50,
+    EXTRACTOR_METATYPE_SUMMARY = 51,
+    EXTRACTOR_METATYPE_SUBJECT = 52,
+    EXTRACTOR_METATYPE_CREATOR = 53,
+    EXTRACTOR_METATYPE_FORMAT = 54,
+    EXTRACTOR_METATYPE_FORMAT_VERSION = 55,
+
+    /* processing history */
+    EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE = 56, 
+    EXTRACTOR_METATYPE_UNKNOWN_DATE = 57, 
+    EXTRACTOR_METATYPE_CREATION_DATE = 58,
+    EXTRACTOR_METATYPE_MODIFICATION_DATE = 59,
+    EXTRACTOR_METATYPE_LAST_PRINTED = 60,
+    EXTRACTOR_METATYPE_LAST_SAVED_BY = 61,
+    EXTRACTOR_METATYPE_TOTAL_EDITING_TIME = 62,
+    EXTRACTOR_METATYPE_EDITING_CYCLES = 63,
+    EXTRACTOR_METATYPE_MODIFIED_BY_SOFTWARE = 64,
+    EXTRACTOR_METATYPE_REVISION_HISTORY = 65,
+
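+    /* FIXME: the entries below still carry their old numeric values, which
+       collide with values assigned above (e.g. 45 is used twice); they
+       need to be renumbered. */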
+
+    /* software package specifics (deb, rpm, tgz) */
+    EXTRACTOR_METATYPE_PACKAGER = 45,
+    EXTRACTOR_METATYPE_VENDOR = 46,
+    EXTRACTOR_METATYPE_LICENSE = 47,
+    EXTRACTOR_METATYPE_DISTRIBUTION = 48,
+    EXTRACTOR_METATYPE_BUILDHOST = 49,
+    EXTRACTOR_METATYPE_TARGET_OS = 50,
+    EXTRACTOR_METATYPE_DEPENDENCY = 51,
+    EXTRACTOR_METATYPE_CONFLICTS = 61,
+    EXTRACTOR_METATYPE_REPLACES = 62,
+    EXTRACTOR_METATYPE_PROVIDES = 63,
+
+    /* (text) document processing specifics */
+    EXTRACTOR_METATYPE_CHARACTER_SET = 104,
+    EXTRACTOR_METATYPE_LINE_COUNT = 105,
+    EXTRACTOR_METATYPE_PARAGRAPH_COUNT = 106,
+    EXTRACTOR_METATYPE_WORD_COUNT = 93,
+    EXTRACTOR_METATYPE_CHARACTER_COUNT = 94,
+    EXTRACTOR_METATYPE_PAGE_ORIENTATION = 35,
+    EXTRACTOR_METATYPE_PAPER_SIZE = 36,
+    EXTRACTOR_METATYPE_USED_FONTS = 37,
+    EXTRACTOR_METATYPE_PAGE_ORDER = 38,
+
+    /* music / video specifics */
+    EXTRACTOR_METATYPE_LYRICS = 67,
+    EXTRACTOR_METATYPE_CONDUCTOR = 64,
+    EXTRACTOR_METATYPE_INTERPRET = 65,
+    EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER = 117,
+    EXTRACTOR_METATYPE_PLAY_COUNTER = 118,
+    EXTRACTOR_METATYPE_DURATION = 111,
+    EXTRACTOR_METATYPE_MOVIE_DIRECTOR = 110,
+    EXTRACTOR_METATYPE_SONG_COUNT = 127,
+    EXTRACTOR_METATYPE_STARTING_SONG = 128,
+    EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST = 123,
+    EXTRACTOR_METATYPE_TRACK_NUMBER = 132,
+    EXTRACTOR_METATYPE_DISC_NUMBER = 134,
+    EXTRACTOR_METATYPE_ALBUM = 11,
+    EXTRACTOR_METATYPE_ARTIST = 5,
+    EXTRACTOR_METATYPE_GENRE = 12,
+
+    /* image specifics */
+    EXTRACTOR_METATYPE_THUMBNAIL_DATA = 70,
+    EXTRACTOR_METATYPE_RESOLUTION = 57,
+    EXTRACTOR_METATYPE_IMAGE_DIMENSIONS = 43,
+    EXTRACTOR_METATYPE_SCALE = 108,
+
+    /* photography specifics */
+    EXTRACTOR_METATYPE_CAMERA_MAKE = 72,
+    EXTRACTOR_METATYPE_CAMERA_MODEL = 73,
+    EXTRACTOR_METATYPE_EXPOSURE = 74,
+    EXTRACTOR_METATYPE_APERTURE = 75,
+    EXTRACTOR_METATYPE_EXPOSURE_BIAS = 76,
+    EXTRACTOR_METATYPE_FLASH = 77,
+    EXTRACTOR_METATYPE_FLASH_BIAS = 78,
+    EXTRACTOR_METATYPE_FOCAL_LENGTH = 79,
+    EXTRACTOR_METATYPE_FOCAL_LENGTH_35MM = 80,
+    EXTRACTOR_METATYPE_ISO_SPEED = 81,
+    EXTRACTOR_METATYPE_EXPOSURE_MODE = 82,
+    EXTRACTOR_METATYPE_METERING_MODE = 83,
+    EXTRACTOR_METATYPE_MACRO_MODE = 84,
+    EXTRACTOR_METATYPE_IMAGE_QUALITY = 85,
+    EXTRACTOR_METATYPE_WHITE_BALANCE = 86,
+    EXTRACTOR_METATYPE_ORIENTATION = 87,
+    EXTRACTOR_METATYPE_MAGNIFICATION = 40,
+
+    /* numeric metrics */
+    EXTRACTOR_METATYPE_POPULARITY_METER = 119,
+    EXTRACTOR_METATYPE_RATING = 145,
+    EXTRACTOR_METATYPE_PRIORITY = 60,
+
+    /* gnunet specific attributes */
+    EXTRACTOR_METATYPE_GNUNET_DISPLAY_TYPE = 135,
+    EXTRACTOR_METATYPE_GNUNET_ECBC_URI = 136,
+
+
+    /* misc (see if these are still needed...) */
+
+    EXTRACTOR_METATYPE_GENERATOR = 103,
+    EXTRACTOR_METATYPE_ENCODED_BY = 121,
+    EXTRACTOR_METATYPE_PRODUCTVERSION = 90,
+
+    EXTRACTOR_METATYPE_DISCLAIMER = 27,
+    EXTRACTOR_METATYPE_FILE_SIZE = 131,
+    EXTRACTOR_METATYPE_FULL_DATA = 137,
+    EXTRACTOR_METATYPE_VERSIONNUMBER = 14,
+
+    EXTRACTOR_METATYPE_ORGANIZATION = 15,
+    EXTRACTOR_METATYPE_CONTRIBUTOR = 19,
+    EXTRACTOR_METATYPE_RESOURCE_TYPE = 20,
+    EXTRACTOR_METATYPE_SOURCE = 23,
+    EXTRACTOR_METATYPE_RELATION = 24,
+    EXTRACTOR_METATYPE_COVERAGE = 25,
+    EXTRACTOR_METATYPE_SOFTWARE = 26,
+    EXTRACTOR_METATYPE_WARNING = 28,
+    EXTRACTOR_METATYPE_TRANSLATED = 29,
+    EXTRACTOR_METATYPE_PRODUCER = 33,
+    EXTRACTOR_METATYPE_CREATED_FOR = 39,
+    EXTRACTOR_METATYPE_RELEASE = 41,
+    EXTRACTOR_METATYPE_GROUP = 42,
+    EXTRACTOR_METATYPE_CATEGORY = 58,
+    EXTRACTOR_METATYPE_OWNER = 66,
+    EXTRACTOR_METATYPE_MEDIA_TYPE = 68,
+    EXTRACTOR_METATYPE_CONTACT = 69,
+    EXTRACTOR_METATYPE_TEMPLATE = 88,
+    EXTRACTOR_METATYPE_SECURITY = 97,
+    EXTRACTOR_METATYPE_COMPANY = 102,
+    EXTRACTOR_METATYPE_MANAGER = 109,
+    EXTRACTOR_METATYPE_INFORMATION = 112,
+    EXTRACTOR_METATYPE_FULL_NAME = 113,
+    EXTRACTOR_METATYPE_LINK = 116,
+    EXTRACTOR_METATYPE_TIME = 122,
+    EXTRACTOR_METATYPE_MOOD = 124, 
+    EXTRACTOR_METATYPE_TELEVISION_SYSTEM = 126,
+    EXTRACTOR_METATYPE_HARDWARE_DEPENDENCY = 129,
+    EXTRACTOR_METATYPE_RIPPER = 130,
+  };
+
+
 /**
  * Get the textual name of the keyword.
- * @return NULL if the type is not known
+ *
+ * @param type meta type to get a UTF-8 string for
+ * @return NULL if the type is not known, otherwise
+ *         an English (locale: C) string describing the type;
+ *         translate using 'dgettext ("libextractor", rval)'
  */
 const char *
-EXTRACTOR_getKeywordTypeAsString(EXTRACTOR_KeywordType type);
+EXTRACTOR_metatype_to_string(enum EXTRACTOR_MetaType type);
 
-/**
- * Return the highest type number, exclusive as in [0,highest).
- */
-EXTRACTOR_KeywordType
-EXTRACTOR_getHighestKeywordTypeNumber(void);
 
 /**
- * Load multiple libraries as specified by the user.
- * @param config a string given by the user that defines which
- *        libraries should be loaded. Has the format
- *        "[[-]LIBRARYNAME[(options)][:[-]LIBRARYNAME[(options)]]]*".
- *        For example,
- *        libextractor_mp3.so:libextractor_ogg.so loads the
- *        mp3 and the ogg library. The '-' before the LIBRARYNAME
- *        indicates that the library should be added to the end
- *        of the library list (addLibraryLast).
- * @param prev the  previous list of libraries, may be NULL
- * @return the new list of libraries, equal to prev iff an error occured
- *         or if config was empty (or NULL).
+ * Get a long description for the meta type.
+ *
+ * @param type meta type to get a UTF-8 description for
+ * @return NULL if the type is not known, otherwise
+ *         an English (locale: C) string describing the type;
+ *         translate using 'dgettext ("libextractor", rval)'
  */
-EXTRACTOR_ExtractorList *
-EXTRACTOR_loadConfigLibraries(EXTRACTOR_ExtractorList * prev,
-                             const char * config);
+const char *
+EXTRACTOR_metatype_to_description(enum EXTRACTOR_MetaType type);
 
+
 /**
- * Add a library for keyword extraction.
- * @param prev the previous list of libraries, may be NULL
- * @param library the name of the library
- * @return the new list of libraries, equal to prev iff an error occured
+ * Return the highest type number, exclusive as in [0,max).
+ *
+ * @return highest legal metatype number for this version of libextractor
  */
-EXTRACTOR_ExtractorList *
-EXTRACTOR_addLibrary(EXTRACTOR_ExtractorList * prev,
-                    const char * library);
+enum EXTRACTOR_MetaType
+EXTRACTOR_metatype_get_max (void);
 
+
 /**
- * Add a library for keyword extraction at the END of the list.
- * @param prev the previous list of libraries, may be NULL
- * @param library the name of the library
- * @return the new list of libraries, always equal to prev
- *         except if prev was NULL and no error occurs
- */
-EXTRACTOR_ExtractorList *
-EXTRACTOR_addLibraryLast(EXTRACTOR_ExtractorList * prev,
-                        const char * library);
-               
-/**
- * Remove a library for keyword extraction.
- * @param prev the current list of libraries
- * @param library the name of the library to remove
- * @return the reduced list, unchanged if the library was not loaded
- */
-EXTRACTOR_ExtractorList *
-EXTRACTOR_removeLibrary(EXTRACTOR_ExtractorList * prev,
-                       const char * library);
+ * Type of a function that libextractor calls for each
+ * meta data item found.
+ *
+ * @param cls closure (user-defined)
+ * @param plugin_name name of the plugin that produced this value;
+ *        special values can be used (e.g. '<zlib>' for zlib being
+ *        used in the main libextractor library and yielding
+ *        meta data).
+ * @param type libextractor-type describing the meta data
+ * @param format basic format information about data 
+ * @param data_mime_type mime-type of data (not of the original file);
+ *        can be NULL (if mime-type is not known)
+ * @param data actual meta-data found
+ * @param data_len number of bytes in data
+ * @return 0 to continue extracting, 1 to abort
+ */ 
+typedef int (*EXTRACTOR_MetaDataProcessor)(void *cls,
+                                          const char *plugin_name,
+                                          enum EXTRACTOR_MetaType type,
+                                          enum EXTRACTOR_MetaFormat format,
+                                          const char *data_mime_type,
+                                          const char *data,
+                                          size_t data_len);
 
+                                          
 /**
- * Remove all extractors.
- * @param libraries the list of extractors
+ * Signature of the extract method that each plugin
+ * must provide.
+ *
+ * @param data data to process
+ * @param datasize number of bytes available in data
+ * @param proc function to call for meta data found
+ * @param proc_cls cls argument to proc
+ * @param options options for this plugin; can be NULL
+ * @return 0 if all calls to proc returned 0, otherwise 1
  */
-void EXTRACTOR_removeAll(EXTRACTOR_ExtractorList * libraries);
+typedef int (*EXTRACTOR_ExtractMethod)(const char *data,
+                                      size_t datasize,
+                                      EXTRACTOR_MetaDataProcessor proc,
+                                      void *proc_cls,
+                                      const char *options);
 
+
 /**
- * Extract keywords from a file using the available extractors.
- * @param extractor the list of extractor libraries
- * @param filename the name of the file
- * @return the list of keywords found in the file, NULL if none
- *         were found (or other errors)
+ * Linked list of extractor plugins.  An application builds this list
+ * by telling libextractor to load various keyword-extraction
+ * plugins. Libraries can also be unloaded (removed from this list,
+ * see EXTRACTOR_plugin_remove).
  */
-EXTRACTOR_KeywordList *
-EXTRACTOR_getKeywords(EXTRACTOR_ExtractorList * extractor,
-                     const char * filename);
+struct EXTRACTOR_PluginList;
 
 
 /**
- * Extract keywords from a buffer in memory
- * using the available extractors.
+ * Load the default set of plugins.  The default can be changed
+ * by setting the LIBEXTRACTOR_LIBRARIES environment variable.
+ * If it is set to "env", then this function will return
+ * EXTRACTOR_plugin_add_config (NULL, env, flags). 
  *
- * @param extractor the list of extractor libraries
- * @param data the data of the file
- * @param size the number of bytes in data
- * @return the list of keywords found in the file, NULL if none
- *         were found (or other errors)
+ * If LIBEXTRACTOR_LIBRARIES is not set, the function will attempt
+ * to locate the installed plugins and load all of them. 
+ * The directory where the code will search for plugins is typically
+ * automatically determined; it can be specified explicitly using the
+ * "LIBEXTRACTOR_PREFIX" environment variable.  
+ *
+ * This environment variable must be set to the precise directory with
+ * the plugins (e.g. "/usr/lib/libextractor", not "/usr").  Note that
+ * setting the environment variable will disable all of the methods
+ * that are typically used to determine the location of plugins.
+ * Multiple paths can be specified using ':' to separate them.
+ *
+ * @param flags options for all of the plugins loaded
+ * @return the default set of plugins, NULL if no plugins were found
  */
-EXTRACTOR_KeywordList *
-EXTRACTOR_getKeywords2(EXTRACTOR_ExtractorList * extractor,
-                      const void * data,
-                      size_t size);
+struct EXTRACTOR_PluginList * 
+EXTRACTOR_plugin_add_defaults(enum EXTRACTOR_Options flags);
 
 
 /**
- * Remove duplicate keywords from the list.
- * @param list the original keyword list (destroyed in the process!)
- * @param options a set of options (DUPLICATES_XXXX)
- * @return a list of keywords without duplicates
+ * Add a library for keyword extraction.
+ *
+ * @param prev the previous list of libraries, may be NULL
+ * @param library the name of the library (full path)
+ * @param options options to give to the library
+ * @param flags options to use
+ * @return the new list of libraries, equal to prev iff an error occurred
  */
-EXTRACTOR_KeywordList *
-EXTRACTOR_removeDuplicateKeywords(EXTRACTOR_KeywordList * list,
-                                 unsigned int options);
+struct EXTRACTOR_PluginList *
+EXTRACTOR_plugin_add (struct EXTRACTOR_PluginList * prev,
+                     const char * library,
+                     const char *options,
+                     enum EXTRACTOR_Options flags);
 
 
 /**
- * Remove empty (all-whitespace) keywords from the list.
- * @param list the original keyword list (destroyed in the process!)
- * @return a list of keywords without duplicates
+ * Add a library for keyword extraction at the END of the list.
+ * @param prev the previous list of libraries, may be NULL
+ * @param library the name of the library (full path)
+ * @param options options to give to the library
+ * @param flags options to use
+ * @return the new list of libraries, always equal to prev
+ *         except if prev was NULL and no error occurs
  */
-EXTRACTOR_KeywordList *
-EXTRACTOR_removeEmptyKeywords (EXTRACTOR_KeywordList * list);
+struct EXTRACTOR_PluginList *
+EXTRACTOR_plugin_add_last(struct EXTRACTOR_PluginList *prev,
+                         const char *library,
+                         const char *options,
+                         enum EXTRACTOR_Options flags);
 
-/**
- * Remove keywords of a particular type from the list.
- * @param list the original keyword list (altered in the process!)
- * @param type the type to remove
- * @return a list of keywords without entries of given type
- */
-EXTRACTOR_KeywordList *
-EXTRACTOR_removeKeywordsOfType(EXTRACTOR_KeywordList * list,
-                              EXTRACTOR_KeywordType type);
 
 /**
- * Print a keyword list to a file.
- * For debugging.
- * @param handle the file to write to (stdout, stderr), must NOT be NULL
- * @param keywords the list of keywords to print, may be NULL
+ * Load multiple libraries as specified by the user.
+ *
+ * @param config a string given by the user that defines which
+ *        libraries should be loaded. Has the format
+ *        "[[-]LIBRARYNAME[(options)][:[-]LIBRARYNAME[(options)]]]*".
+ *        For example,
+ *        /usr/lib/libextractor/libextractor_mp3.so:/usr/lib/libextractor/libextractor_ogg.so loads the
+ *        mp3 and the ogg library. The '-' before the LIBRARYNAME
+ *        indicates that the library should be added to the end
+ *        of the library list (see EXTRACTOR_plugin_add_last).
+ * @param prev the  previous list of libraries, may be NULL
+ * @param flags options to use
+ * @return the new list of libraries, equal to prev iff an error occurred
+ *         or if config was empty (or NULL).
  */
-void EXTRACTOR_printKeywords(FILE * handle,
-                            EXTRACTOR_KeywordList * keywords);
+struct EXTRACTOR_PluginList *
+EXTRACTOR_plugin_add_config (struct EXTRACTOR_PluginList * prev,
+                            const char *config,
+                            enum EXTRACTOR_Options flags);
 
+               
 /**
- * Free the memory occupied by the keyword list (and the
- * keyword strings in it!)
- * @param keywords the list to free
+ * Remove a plugin from a list.
+ *
+ * @param prev the current list of plugins
+ * @param library the name of the plugin to remove (full path)
+ * @return the reduced list, unchanged if the plugin was not loaded
  */
-void EXTRACTOR_freeKeywords(EXTRACTOR_KeywordList * keywords);
+struct EXTRACTOR_PluginList *
+EXTRACTOR_plugin_remove(struct EXTRACTOR_PluginList * prev,
+                       const char * library);
 
-/**
- * Extract the last keyword that of the given type from the keyword list.
- * @param type the type of the keyword
- * @param keywords the keyword list
- * @return the last matching keyword, or NULL if none matches;
- *  the string returned is aliased in the keywords list and must
- *  not be freed or manipulated by the client.  It will become
- *  invalid once the keyword list is freed.
- */
-const char * EXTRACTOR_extractLast(EXTRACTOR_KeywordType type,
-                                  EXTRACTOR_KeywordList * keywords);
 
 /**
- * Extract the last keyword of the given string from the keyword list.
- * @param type the string describing the type of the keyword
- * @param keywords the keyword list
- * @return the last matching keyword, or NULL if none matches;
- *  the string returned is aliased in the keywords list and must
- *  not be freed or manipulated by the client.  It will become
- *  invalid once the keyword list is freed.
+ * Remove all plugins from the given list (destroys the list).
+ *
+ * @param plugins the list of plugins
  */
-const char * EXTRACTOR_extractLastByString(const char * type,
-                                          EXTRACTOR_KeywordList * keywords);
+void 
+EXTRACTOR_plugin_remove_all(struct EXTRACTOR_PluginList *plugins);
 
-/**
- * Count the number of keywords in the keyword list.
- * @param keywords the keyword list
- * @return the number of keywords in the list
- */
-unsigned int EXTRACTOR_countKeywords(EXTRACTOR_KeywordList * keywords);
 
-
 /**
- * This function can be used to decode the binary data
- * encoded in the libextractor metadata (i.e. for
- * the  thumbnails).
+ * Extract keywords from a file using the given set of plugins.
  *
- * @param in 0-terminated string from the meta-data
- * @return 1 on error, 0 on success
+ * @param plugins the list of plugins to use
+ * @param filename the name of the file, can be NULL if data is not NULL
+ * @param data data of the file in memory, can be NULL (in which
+ *        case libextractor will open file) if filename is not NULL
+ * @param size number of bytes in data, ignored if data is NULL
+ * @param proc function to call for each meta data item found
+ * @param proc_cls cls argument to proc
  */
-int EXTRACTOR_binaryDecode(const char * in,
-                          unsigned char ** out,
-                          size_t * outSize);
+void
+EXTRACTOR_extract(struct EXTRACTOR_PluginList *plugins,
+                 const char *filename,
+                 const void *data,
+                 size_t size,
+                 EXTRACTOR_MetaDataProcessor proc,
+                 void *proc_cls);
 
 
 /**
- * Encode the given binary data object
- * as a 0-terminated C-string according
- * to the LE binary data encoding standard.
- *
- * @return NULL on error, the 0-terminated
- *  encoding otherwise
+ * Simple EXTRACTOR_MetaDataProcessor implementation that
+ * prints the extracted meta data to the given file.  Only prints
+ * those keywords that are in UTF-8 format.
+ * 
+ * @param handle the file to write to (stdout, stderr), must NOT be NULL,
+ *               must be of type "FILE *".
+ * @param plugin_name name of the plugin that produced this value
+ * @param type libextractor-type describing the meta data
+ * @param format basic format information about data 
+ * @param data_mime_type mime-type of data (not of the original file);
+ *        can be NULL (if mime-type is not known)
+ * @param data actual meta-data found
+ * @param data_len number of bytes in data
+ * @return non-zero if printing failed, otherwise 0.
  */
-char * EXTRACTOR_binaryEncode(const unsigned char * data,
-                             size_t size);
+int 
+EXTRACTOR_meta_data_print(void * handle,
+                         const char *plugin_name,
+                         enum EXTRACTOR_MetaType type,
+                         enum EXTRACTOR_MetaFormat format,
+                         const char *data_mime_type,
+                         const char *data,
+                         size_t data_len);
 
 
 #if 0 /* keep Emacsens' auto-indent happy */

Deleted: Extractor/src/include/winproc.h
===================================================================
--- Extractor/src/include/winproc.h     2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/src/include/winproc.h     2009-12-13 23:02:19 UTC (rev 9746)
@@ -1,44 +0,0 @@
-/*
-     This file is part of libextractor.
-     (C) 2001, 2002, 2003, 2003, 2005 Christian Grothoff (and other contributing authors)
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
-*/
-
-/**
- * @file include/winproc.h
- * @brief Definitions for MS Windows
- * @author Nils Durner
- * @note This file differs from GNUnet's winproc.h
- */
-
-#ifndef WINPROC_H
-#define WINPROC_H
-
-#include "platform.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void InitWinEnv();
-void ShutdownWinEnv();
-
-#endif
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif

Modified: Extractor/src/main/Makefile.am
===================================================================
--- Extractor/src/main/Makefile.am      2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/src/main/Makefile.am      2009-12-13 23:02:19 UTC (rev 9746)
@@ -29,31 +29,19 @@
   bz2lib = -lbz2
 endif
 
-if HAVE_GLIB
-if WITH_GSF
- GSF_LIBS_X = $(GSF_LIBS)
-endif
-endif
-
 libextractor_la_LDFLAGS = \
   $(LE_LIB_LDFLAGS) -version-info @LIB_VERSION_CURRENT@:@LIB_VERSION_REVISION@:@LIB_VERSION_AGE@
 libextractor_la_LIBADD = \
-  $(LIBLTDL) $(dlflag) $(zlib) $(bz2lib) $(GSF_LIBS_X) $(LIBICONV) -lpthread
+  $(LIBLTDL) $(dlflag) $(zlib) $(bz2lib) $(LIBICONV) -lrt
 
 
 EXTRA_DIST = \
-  winproc.c \
   iconv.c
 
-if MINGW
-  winproc = winproc.c
-endif
-
 libextractor_la_CPPFLAGS = -DPLUGINDIR=\"@address@hidden" $(AM_CPPFLAGS)
 
 libextractor_la_SOURCES = \
-  extractor.c \
-  $(winproc) 
+  extractor.c
 
 extract_SOURCES = \
   extract.c \
@@ -62,15 +50,3 @@
   getopt1.c 
 
 
-check_PROGRAMS = \
-  test_binary
-
-TESTS = $(check_PROGRAMS)
-
-test_binary_SOURCES = \
-  test_binary.c
-test_binary_LDADD = \
-  $(top_builddir)/src/main/libextractor.la
-
-
-

Modified: Extractor/src/main/extract.c
===================================================================
--- Extractor/src/main/extract.c        2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/src/main/extract.c        2009-12-13 23:02:19 UTC (rev 9746)
@@ -26,6 +26,17 @@
 #define NO 0
 
 
+/**
+ * Which keyword types should we print?
+ */
+static int * print;
+
+/**
+ * How verbose are we supposed to be?
+ */
+static int verbose;
+
+
 typedef struct {
   char shortArg;
   char * longArg;
@@ -122,16 +133,10 @@
 printHelp ()
 {
   static Help help[] = {
-    { 'a', "all", NULL,
-      gettext_noop("do not remove any duplicates") },
     { 'b', "bibtex", NULL,
       gettext_noop("print output in bibtex format") },
     { 'B', "binary", "LANG",
       gettext_noop("use the generic plaintext extractor for the language with the 2-letter language code LANG") },
-    { 'd', "duplicates", NULL,
-      gettext_noop("remove duplicates only if types match") },
-    { 'f', "filename", NULL,
-      gettext_noop("use the filename as a keyword (loads filename-extractor plugin)") },
     { 'g', "grep-friendly", NULL,
       gettext_noop("produce grep-friendly output (all results on one line per file)") },
     { 'h', "help", NULL,
@@ -146,10 +151,6 @@
       gettext_noop("do not use the default set of extractor plugins") },
     { 'p', "print", "TYPE",
       gettext_noop("print only keywords of the given TYPE (use -L to get a list)") },
-    { 'r', "remove-duplicates", NULL,
-      gettext_noop("remove duplicates even if keyword types do not match") },
-    { 's', "split", NULL,
-      gettext_noop("use keyword splitting (loads split-extractor plugin)") },
     { 'v', "version", NULL,
       gettext_noop("print the version number") },
     { 'V', "verbose", NULL,
@@ -166,109 +167,159 @@
 
 #include "iconv.c"
 
-
 /**
  * Print a keyword list to a file.
  *
- * @param handle the file to write to (stdout, stderr), may NOT be NULL
- * @param keywords the list of keywords to print, may be NULL
- * @param print array indicating which types to print
- */
-static void
-printSelectedKeywords(FILE * handle,
-                     EXTRACTOR_KeywordList * keywords,
-                     const int * print,
-                     const int verbose)
-{
+ * @param cls closure, not used
+ * @param plugin_name name of the plugin that produced this value;
+ *        special values can be used (e.g. '<zlib>' for zlib being
+ *        used in the main libextractor library and yielding
+ *        meta data).
+ * @param type libextractor-type describing the meta data
+ * @param format basic format information about data 
+ * @param data_mime_type mime-type of data (not of the original file);
+ *        can be NULL (if mime-type is not known)
+ * @param data actual meta-data found
+ * @param data_len number of bytes in data
+ * @return 0 to continue extracting, 1 to abort
+ */ 
+static int
+print_selected_keywords (void *cls,
+                        const char *plugin_name,
+                        enum EXTRACTOR_MetaType type,
+                        enum EXTRACTOR_MetaFormat format,
+                        const char *data_mime_type,
+                        const char *data,
+                        size_t data_len)
+{ 
   char * keyword;
   iconv_t cd;
+  const char *stype;
 
-  cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
-  while (keywords != NULL) {
-    if (EXTRACTOR_isBinaryType(keywords->keywordType)) {
-      fprintf (handle,
-              _("%s - (binary)\n"),
-              _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)));
-    } else {
+  if (print[type] != YES)
+    return 0;
+  stype = gettext(EXTRACTOR_metatype_to_string(type));
+  switch (format)
+    {
+    case EXTRACTOR_METAFORMAT_UNKNOWN:
+      fprintf (stdout,
+              _("%s - (unknown, %u bytes)\n"),
+              stype,
+              (unsigned int) data_len);
+      break;
+    case EXTRACTOR_METAFORMAT_UTF8:
+      cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
       if (cd != (iconv_t) -1)
-       keyword = iconvHelper(cd,
-                             keywords->keyword);
+       keyword = iconv_helper(cd,
+                              data);
       else
-       keyword = strdup(keywords->keyword);
-      if (NULL == EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)) {
-       if (verbose == YES) {
-         fprintf(handle,
-                 _("INVALID TYPE - %s\n"),
-                 keyword);
-       }
-      } else if (print[keywords->keywordType] == YES)
-       fprintf (handle,
-                "%s - %s\n",
-                _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)),
-                keyword);
+       keyword = strdup(data);
+      fprintf (stdout,
+              "%s - %s\n",
+              stype,
+              keyword);
       free(keyword);
+      if (cd != (iconv_t) -1)
+       iconv_close(cd);
+      break;
+    case EXTRACTOR_METAFORMAT_BINARY:
+      fprintf (stdout,
+              _("%s - (binary, %u bytes)\n"),
+              stype,
+              (unsigned int) data_len);
+      break;
+    case EXTRACTOR_METAFORMAT_C_STRING:
+      fprintf (stdout,
+              "%s - %s\n",
+              stype,
+              data);
+      break;
+
+    default:
+      break;
     }
-    keywords = keywords->next;
-  }
-  if (cd != (iconv_t) -1)
-    iconv_close(cd);
+  return 0;
 }
 
+
+
 /**
- * Print a keyword list to a file in a grep-friendly manner.
+ * Print a single meta data item to stdout without a trailing newline.
  *
- * @param handle the file to write to (stdout, stderr), may NOT be NULL
- * @param keywords the list of keywords to print, may be NULL
- * @param print array indicating which types to print
- */
-static void
-printSelectedKeywordsGrepFriendly(FILE * handle,
-                                 EXTRACTOR_KeywordList * keywords,
-                                 const int * print,
-                                 const int verbose)
-{
+ * @param cls closure, not used
+ * @param plugin_name name of the plugin that produced this value;
+ *        special values can be used (e.g. '<zlib>' for zlib being
+ *        used in the main libextractor library and yielding
+ *        meta data).
+ * @param type libextractor-type describing the meta data
+ * @param format basic format information about data 
+ * @param data_mime_type mime-type of data (not of the original file);
+ *        can be NULL (if mime-type is not known)
+ * @param data actual meta-data found
+ * @param data_len number of bytes in data
+ * @return 0 to continue extracting, 1 to abort
+ */ 
+static int
+print_selected_keywords_grep_friendly (void *cls,
+                                      const char *plugin_name,
+                                      enum EXTRACTOR_MetaType type,
+                                      enum EXTRACTOR_MetaFormat format,
+                                      const char *data_mime_type,
+                                      const char *data,
+                                      size_t data_len)
+{ 
   char * keyword;
   iconv_t cd;
-  size_t pos;
 
-  cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
-  while (keywords != NULL) {
-    if ( (EXTRACTOR_isBinaryType(EXTRACTOR_THUMBNAIL_DATA)) &&
-        (print[keywords->keywordType] == YES) ) {
-      if (verbose > 1) 
-       fprintf(handle,
-               "%s: ",
-               _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)));
+  if (print[type] != YES)
+    return 0;
+  switch (format)
+    {
+    case EXTRACTOR_METAFORMAT_UNKNOWN:      
+      break;
+    case EXTRACTOR_METAFORMAT_UTF8:
+      if (verbose > 1)
+       fprintf (stdout,
+                "%s: ",
+                gettext(EXTRACTOR_metatype_to_string(type)));
+      cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
       if (cd != (iconv_t) -1)
-       keyword = iconvHelper(cd,
-                             keywords->keyword);
+       keyword = iconv_helper(cd,
+                              data);
       else
-       keyword = strdup(keywords->keyword);
-      pos = 0;
-      while (keyword[pos] != '\0') {
-       if (iscntrl(keyword[pos]))      
-         keyword[pos] = ' ';
-       pos++;
-      }
-      fprintf (handle,
-              (keywords->next == NULL) ? "%s" : "%s ",
+       keyword = strdup(data);
+      fprintf (stdout,
+              "'%s' ",
               keyword);
       free(keyword);
+      if (cd != (iconv_t) -1)
+       iconv_close(cd);
+      break;
+    case EXTRACTOR_METAFORMAT_BINARY:
+      break;
+    case EXTRACTOR_METAFORMAT_C_STRING:
+      if (verbose > 1)
+       fprintf (stdout,
+                "%s ",
+                gettext(EXTRACTOR_metatype_to_string(type)));      
+      fprintf (stdout,
+              "'%s'",
+              data);
+      break;
+    default:
+      break;
     }
-    keywords = keywords->next;
-  }
-  fprintf(handle, "\n");
-  if (cd != (iconv_t) -1)
-    iconv_close(cd);
+  return 0;
 }
 
+
 /**
  * Take title, auth, year and return a string
  */
 static char *
 str_splice(const char * title,
-          const char * auth,
-          const char * year) {
+          const char * year,
+          const char * auth) {
   char * temp = malloc(16);
   int i = 0;
 
@@ -287,190 +338,202 @@
   return temp;
 }
 
+
 /**
- * Print a keyword list in bibtex format to a file.
- * FIXME: We should generate the three letter abbrev of the month
- * @param handle the file to write to (stdout, stderr), may NOT be NULL
- * @param keywords the list of keywords to print, may be NULL
- * @param print array indicating which types to print
+ * Entry in the map we construct for each file.
+ * Each entry ties one bibTeX field name to one libextractor meta
+ * type and holds the value found for that type in the current file.
  */
-static void
-printSelectedKeywordsBibtex (FILE * handle,
-                            EXTRACTOR_KeywordList * keywords,
-                            const int * print,
-                            const char * filename)
+struct BibTexMap
 {
-  const char * last = NULL;
-  if (keywords == NULL)
-    return;
-  if (print[keywords->keywordType] == YES)
-    {
-      const char * title = NULL;
-      const char * author = NULL;
-      const char * note = NULL;
-      const char * date = NULL;
-      const char * publisher = NULL;
-      const char * organization = NULL;
-      const char * key = NULL;
-      const char * pages = NULL;
-      char * year = NULL;
-      char * month = NULL;
-      char * tmp;
+  const char *bibTexName; /* field name printed in the bibTeX output; NULL marks the end of the table */
+  enum EXTRACTOR_MetaType le_type; /* meta type that is compared against the type of incoming meta data */
+  char *value; /* strdup'ed copy of the first matching value, NULL while none was seen */
+};
 
-      title = EXTRACTOR_extractLastByString(_("title"), keywords);
-      if ( !title )
-       title = EXTRACTOR_extractLastByString(_("filename"), keywords);
-      if ( !title )
-       title = (char*)filename;
-      last = title;
 
-      author = EXTRACTOR_extractLastByString(_("author"), keywords);
-      if ( author )
-       last = author;
+/**
+ * Type of the entry for bibtex.
+ * Set by print_bibtex to a strdup'ed copy of the meta data of type
+ * EXTRACTOR_METATYPE_BIBTEX_ENTRY_TYPE and released by start_bibtex.
+ * While this is NULL, finish_bibtex prints "misc" as the entry type.
+ */
+static char *entry_type;
 
-      note = EXTRACTOR_extractLastByString(_("description"), keywords);
-      if ( !note )
-       note = EXTRACTOR_extractLastByString(_("keywords"), keywords);
-      if ( !note )
-       note = EXTRACTOR_extractLastByString(_("comment"), keywords);
-      if ( note )
-       last = note;
+/**
+ * Mapping between bibTeX strings, libextractor
+ * meta data types and values for the current document.
+ *
+ * The order of the first three rows matters: finish_bibtex reads
+ * btm[0], btm[1] and btm[2] (title, year, author) to build the key
+ * of the entry.  The row with a NULL name terminates the table.
+ * The 'value' members are filled by print_bibtex and reset by
+ * start_bibtex.
+ *
+ * NOTE(review): "pages" is listed twice (page count and page range),
+ * so finish_bibtex can print two "pages" fields for one entry if both
+ * types are found -- confirm that this is intended.
+ */
+static struct BibTexMap btm[] =
+  {
+    { "title", EXTRACTOR_METATYPE_TITLE, NULL},
+    { "year", EXTRACTOR_METATYPE_PUBLICATION_YEAR, NULL },
+    { "author", EXTRACTOR_METATYPE_AUTHOR_NAME, NULL },
+    { "book", EXTRACTOR_METATYPE_BOOK_TITLE, NULL},
+    { "edition", EXTRACTOR_METATYPE_BOOK_EDITION, NULL},
+    { "chapter", EXTRACTOR_METATYPE_BOOK_CHAPTER_NUMBER, NULL},
+    { "journal", EXTRACTOR_METATYPE_JOURNAL_NAME, NULL},
+    { "volume", EXTRACTOR_METATYPE_JOURNAL_VOLUME, NULL},
+    { "number", EXTRACTOR_METATYPE_JOURNAL_NUMBER, NULL},
+    { "pages", EXTRACTOR_METATYPE_PAGE_COUNT, NULL },
+    { "pages", EXTRACTOR_METATYPE_PAGE_RANGE, NULL },
+    { "school", EXTRACTOR_METATYPE_AUTHOR_INSTITUTION, NULL},
+    { "publisher", EXTRACTOR_METATYPE_PUBLISHER, NULL },
+    { "address", EXTRACTOR_METATYPE_PUBLISHER_ADDRESS, NULL },
+    { "institution", EXTRACTOR_METATYPE_PUBLISHER_INSTITUTION, NULL },
+    { "series", EXTRACTOR_METATYPE_PUBLISHER_SERIES, NULL},
+    { "month", EXTRACTOR_METATYPE_PUBLICATION_MONTH, NULL },
+    { "url", EXTRACTOR_METATYPE_URL, NULL}, 
+    { "note", EXTRACTOR_METATYPE_COMMENT, NULL},
+    { "eprint", EXTRACTOR_METATYPE_BIBTEX_EPRINT, NULL },
+    { "type", EXTRACTOR_METATYPE_PUBLICATION_TYPE, NULL },
+    { NULL, 0, NULL }
+  };
 
-      date = EXTRACTOR_extractLastByString(_("date"), keywords);
-      if ( !date )
-       date = EXTRACTOR_extractLastByString(_("creation date"), keywords);
-      if ( date ) {
-       if ( strlen(keywords->keyword) >= 7 ) {
-         year = (char*)malloc(sizeof(char)*5);
-         memset(year, 0, sizeof(char)*5);
-         month = (char*)malloc(sizeof(char)*3);
-         memset(month, 0, sizeof(char)*3);
-         year[0] = keywords->keyword[0];
-         year[1] = keywords->keyword[1];
-         year[2] = keywords->keyword[2];
-         year[3] = keywords->keyword[3];
-         month[0] = keywords->keyword[4];
-         month[1] = keywords->keyword[5];
-       } else if ( strlen(keywords->keyword) >= 4 ) {
-         year = (char*)malloc(sizeof(char)*5);
-         memset(year, 0, sizeof(char)*5);
-         year[0] = keywords->keyword[0];
-         year[1] = keywords->keyword[1];
-         year[2] = keywords->keyword[2];
-         year[3] = keywords->keyword[3];
-       }
-      }
-      if ( year )
-       last = year;
 
-      if ( month )
-       last = month;
+/**
+ * Reset the bibtex state before the next file is processed: every
+ * value collected in 'btm' and the remembered entry type are
+ * released and set back to NULL.
+ */
+static void 
+start_bibtex ()
+{
+  struct BibTexMap *slot;
+
+  /* the table ends at the row whose bibTexName is NULL */
+  for (slot = btm; NULL != slot->bibTexName; slot++)
+    {
+      free (slot->value);
+      slot->value = NULL;
+    }
+  free (entry_type);
+  entry_type = NULL;
+}
 
-      publisher = EXTRACTOR_extractLastByString(_("publisher"), keywords);
-      if ( publisher )
-       last = publisher;
 
-      organization = EXTRACTOR_extractLastByString(_("organization"), 
keywords);
-      if ( organization )
-       last = organization;
+/**
+ * Callback function for printing meta data in bibtex format.
+ * Nothing is printed here; the values are only collected (in 'btm'
+ * and 'entry_type') so that finish_bibtex can print them later.
+ * For each row of 'btm' only the first matching value is kept; for
+ * the entry type the most recent value is kept.
+ *
+ * @param cls closure, not used
+ * @param plugin_name name of the plugin that produced this value;
+ *        special values can be used (i.e. '<zlib>' for zlib being
+ *        used in the main libextractor library and yielding
+ *        meta data).
+ * @param type libextractor-type describing the meta data
+ * @param format basic format information about data 
+ * @param data_mime_type mime-type of data (not of the original file);
+ *        can be NULL (if mime-type is not known)
+ * @param data actual meta-data found
+ * @param data_len number of bytes in data
+ * @return 0 to continue extracting (always)
+ */
+static int
+print_bibtex (void *cls,
+             const char *plugin_name,
+             enum EXTRACTOR_MetaType type,
+             enum EXTRACTOR_MetaFormat format,
+             const char *data_mime_type,
+             const char *data,
+             size_t data_len)
+{
+  int i;
 
-      key = EXTRACTOR_extractLastByString(_("subject"), keywords);
-      if ( key )
-       last = key;
+  if (print[type] != YES)
+    return 0;
+  /* only UTF-8 text is collected for the bibtex output */
+  if (format != EXTRACTOR_METAFORMAT_UTF8)
+    return 0;
+  if (type == EXTRACTOR_METATYPE_BIBTEX_ENTRY_TYPE)
+    {
+      /* an entry type may be reported more than once for the same
+         file; release the previous copy so that it is not leaked */
+      free (entry_type);
+      entry_type = strdup (data);
+      return 0;
+    }
+  i = 0;
+  while (btm[i].bibTexName != NULL)
+    {
+      /* keep only the first value seen for each row */
+      if ( (btm[i].value == NULL) &&
+          (btm[i].le_type == type) )
+       btm[i].value = strdup (data);
+      i++;
+    }  
+  return 0;
+}
 
-      pages = EXTRACTOR_extractLastByString(_("page count"), keywords);
-      if ( pages )
-       last = pages;
 
-      tmp = str_splice(title, author, year);
-      fprintf(handle, 
-             "@misc{ %s,\n", 
-             tmp);
-      free(tmp);      
-      if ( title )
-       fprintf(handle, "    title = \"%s\"%s\n", title,
-           (last == title)?"":",");
-      if ( author )
-       fprintf(handle, "    author = \"%s\"%s\n", author,
-           (last == author)?"":",");
-      if ( note )
-       fprintf(handle, "    note = \"%s\"%s\n", note,
-           (last == note)?"":",");
-      if ( year )
-       fprintf(handle, "    year = \"%s\"%s\n", year,
-           (last == year)?"":",");
-      if ( month )
-       fprintf(handle, "    month = \"%s\"%s\n", month,
-           (last == month)?"":",");
-      if ( publisher )
-       fprintf(handle, "    publisher = \"%s\"%s\n", publisher,
-           (last == publisher)?"":",");
-      if ( organization )
-       fprintf(handle, "    organization = \"%s\"%s\n", organization,
-           (last == organization)?"":",");
-      if ( key )
-       fprintf(handle, "    key = \"%s\"%s\n", key,
-           (last == key)?"":",");
-      if ( pages )
-       fprintf(handle, "    pages = \"%s\"%s\n", pages,
-           (last == pages)?"":",");
-      if (month != NULL)
-       free(month);
-      if (year != NULL)
-       free(year);
-      fprintf(handle, "}\n\n");
+/**
+ * Print the bibtex entry for the current file to stdout, using the
+ * values that print_bibtex collected in 'btm' and 'entry_type'.
+ *
+ * @param fn name of the file; printed in place of the generated key
+ *        if title, year or author are missing
+ */
+static void
+finish_bibtex (const char *fn)
+{
+  int i;
+  char *tya;
+  const char *et;
+
+  /* fall back to "misc" if no entry type was found */
+  if (entry_type != NULL)
+    et = entry_type;
+  else
+    et = "misc";
+  /* btm[0], btm[1] and btm[2] are the title, year and author rows;
+     the key is only generated from them if all three are present */
+  if ( (btm[0].value == NULL) ||
+       (btm[1].value == NULL) ||
+       (btm[2].value == NULL) )          
+    fprintf (stdout,
+            "@%s %s { ",
+            et,
+            fn);
+  else
+    {
+      tya = str_splice (btm[0].value,
+                       btm[1].value,
+                       btm[2].value);      
+      fprintf (stdout,
+              "@%s %s { ",
+              et,
+              tya);
+      free (tya);
     }
+
+            
+  /* one "name = {value}," line for every row that has a value */
+  i = 0;
+  while (btm[i].bibTexName != NULL)
+    {
+      if (btm[i].value != NULL) 
+       fprintf (stdout,
+                "\t%s = {%s},\n",
+                btm[i].bibTexName,
+                btm[i].value);
+      i++;
+    }  
+  fprintf(stdout, "}\n\n");
 }
 
+
 /**
- * Demo for libExtractor.
- * <p>
- * Invoke with a list of filenames to extract keywords
- * from (demo will use all the extractor libraries that
- * are available by default).
+ * Main function for the 'extract' tool.  Invoke with a list of
+ * filenames to extract keywords from.
  */
 int
 main (int argc, char *argv[])
 {
   int i;
-  EXTRACTOR_ExtractorList *extractors;
-  EXTRACTOR_KeywordList *keywords;
+  struct EXTRACTOR_PluginList *plugins;
   int option_index;
   int c;
   char * libraries = NULL;
   char * hash = NULL;
-  int splitKeywords = NO;
-  int verbose = 0;
-  int useFilename = NO;
   int nodefault = NO;
-  int *print;
   int defaultAll = YES;
-  int duplicates = EXTRACTOR_DUPLICATES_REMOVE_UNKNOWN;
   int bibtex = NO;
   int grepfriendly = NO;
   char * binary = NULL;
+  char * name;
   int ret = 0;
+  EXTRACTOR_MetaDataProcessor processor = NULL;
 
-#ifdef MINGW
-  InitWinEnv();
-#endif
 #if ENABLE_NLS
   setlocale(LC_ALL, "");
-  textdomain("libextractor");
-  BINDTEXTDOMAIN("libextractor", LOCALEDIR);
+  textdomain(PACKAGE);
 #endif
-  print = malloc (sizeof (int) * EXTRACTOR_getHighestKeywordTypeNumber ());
-  for (i = 0; i < EXTRACTOR_getHighestKeywordTypeNumber (); i++)
+  print = malloc (sizeof (int) * EXTRACTOR_metatype_get_max ());
+  for (i = 0; i < EXTRACTOR_metatype_get_max (); i++)
     print[i] = YES;            /* default: print everything */
 
   while (1)
     {
       static struct option long_options[] = {
-       {"all", 0, 0, 'a'},
        {"binary", 1, 0, 'B'},
        {"bibtex", 0, 0, 'b'},
-       {"duplicates", 0, 0, 'd'},
-       {"filename", 0, 0, 'f'},
        {"grep-friendly", 0, 0, 'g'},
        {"help", 0, 0, 'h'},
        {"hash", 1, 0, 'H'},
@@ -478,8 +541,6 @@
        {"library", 1, 0, 'l'},
        {"nodefault", 0, 0, 'n'},
        {"print", 1, 0, 'p'},
-       {"remove-duplicates", 0, 0, 'r'},
-       {"split", 0, 0, 's'},
        {"verbose", 0, 0, 'V'},
        {"version", 0, 0, 'v'},
        {"exclude", 1, 0, 'x'},
@@ -487,7 +548,8 @@
       };
       option_index = 0;
       c = getopt_long (argc,
-                      argv, "vhbgl:nsH:fp:x:LVdraB:",
+                      argv, 
+                      "abB:ghH:l:Lnp:vVx:",
                       long_options,
                       &option_index);
 
@@ -495,23 +557,28 @@
        break;                  /* No more flags to process */
       switch (c)
        {
-       case 'a':
-         duplicates = -1;
-         break;
        case 'b':
          bibtex = YES;
+         if (processor != NULL)
+           {
+             fprintf (stderr,
+                      _("Illegal combination of options, cannot combine 
multiple styles of printing.\n"));
+             return 0;
+           }
+         processor = &print_bibtex;
          break;
        case 'B':
          binary = optarg;
          break;
-       case 'd':
-         duplicates = 0;
-         break;
-       case 'f':
-         useFilename = YES;
-         break;
        case 'g':
          grepfriendly = YES;
+         if (processor != NULL)
+           {
+             fprintf (stderr,
+                      _("Illegal combination of options, cannot combine 
multiple styles of printing.\n"));
+             return 0;
+           }
+         processor = &print_selected_keywords_grep_friendly;
          break;
        case 'h':
          printHelp();
@@ -524,32 +591,35 @@
          break;
        case 'L':
          i = 0;
-         while (NULL != EXTRACTOR_getKeywordTypeAsString (i))
+         while (NULL != EXTRACTOR_metatype_to_string (i))
            printf ("%s\n",
-                   _(EXTRACTOR_getKeywordTypeAsString (i++)));
+                   gettext(EXTRACTOR_metatype_to_string (i++)));
          return 0;
        case 'n':
          nodefault = YES;
          break;
        case 'p':
-         if (optarg == NULL) {
-           fprintf(stderr,
-                   _("You must specify an argument for the `%s' option (option 
ignored).\n"),
-                   "-p");
-           break;
-         }
+         if (optarg == NULL) 
+           {
+             fprintf(stderr,
+                     _("You must specify an argument for the `%s' option 
(option ignored).\n"),
+                     "-p");
+             break;
+           }
          if (defaultAll == YES)
            {
              defaultAll = NO;
              i = 0;
-             while (NULL != EXTRACTOR_getKeywordTypeAsString (i))
+             while (NULL != EXTRACTOR_metatype_to_string (i))
                print[i++] = NO;
            }
          i = 0;
-         while (NULL != EXTRACTOR_getKeywordTypeAsString (i))
+         while (NULL != EXTRACTOR_metatype_to_string (i))
            {
-             if ( (0 == strcmp (optarg, EXTRACTOR_getKeywordTypeAsString (i))) 
||
-                  (0 == strcmp (optarg, _(EXTRACTOR_getKeywordTypeAsString 
(i)))) )
+             if ( (0 == strcmp (optarg, 
+                                EXTRACTOR_metatype_to_string (i))) ||
+                  (0 == strcmp (optarg, 
+                                gettext(EXTRACTOR_metatype_to_string (i)))) )
                
                {
                  print[i] = YES;
@@ -557,7 +627,7 @@
                }
              i++;
            }
-         if (NULL == EXTRACTOR_getKeywordTypeAsString (i))
+         if (NULL == EXTRACTOR_metatype_to_string (i))
            {
              fprintf(stderr,
                      "Unknown keyword type `%s', use option `%s' to get a 
list.\n",
@@ -566,12 +636,6 @@
              return -1;
            }
          break;
-       case 'r':
-         duplicates = EXTRACTOR_DUPLICATES_TYPELESS;
-         break;
-       case 's':
-         splitKeywords = YES;
-         break;
                case 'v':
          printf ("extract v%s\n", PACKAGE_VERSION);
          return 0;
@@ -580,34 +644,30 @@
          break;
        case 'x':
          i = 0;
-         while (NULL != EXTRACTOR_getKeywordTypeAsString (i))
+         while (NULL != EXTRACTOR_metatype_to_string (i))
            {
-             if ( (0 == strcmp (optarg, EXTRACTOR_getKeywordTypeAsString (i))) 
||
-                  (0 == strcmp (optarg, _(EXTRACTOR_getKeywordTypeAsString 
(i)))) )
+             if ( (0 == strcmp (optarg, 
+                                EXTRACTOR_metatype_to_string (i))) ||
+                  (0 == strcmp (optarg, 
+                                gettext(EXTRACTOR_metatype_to_string (i)))) )
                {
                  print[i] = NO;
                  break;
                }
              i++;
            }
-         if (NULL == EXTRACTOR_getKeywordTypeAsString (i))
+         if (NULL == EXTRACTOR_metatype_to_string (i))
            {
              fprintf (stderr,
                       "Unknown keyword type `%s', use option `%s' to get a 
list.\n",
                       optarg,
                       "-L");
-#ifdef MINGW
-                       ShutdownWinEnv();
-#endif
              return -1;
            }
          break;
        default:
          fprintf (stderr,
                   _("Use --help to get a list of options.\n"));
-#ifdef MINGW
-       ShutdownWinEnv();
-#endif
          return -1;
        }                       /* end of parsing commandline */
     }                          /* while (1) */
@@ -616,54 +676,66 @@
     {
       fprintf (stderr,
               "Invoke with list of filenames to extract keywords form!\n");
-#ifdef MINGW
-               ShutdownWinEnv();
-#endif
       free (print);
       return -1;
     }
 
   /* build list of libraries */
   if (nodefault == NO)
-    extractors = EXTRACTOR_loadDefaultLibraries ();
+    plugins = EXTRACTOR_plugin_add_defaults (EXTRACTOR_OPTION_NONE);
   else
-    extractors = NULL;
-  if (useFilename == YES)
-    extractors = EXTRACTOR_addLibrary (extractors,
-                                      "libextractor_filename");
+    plugins = NULL;
   if (libraries != NULL)
-    extractors = EXTRACTOR_loadConfigLibraries (extractors, libraries);
+    plugins = EXTRACTOR_plugin_add_config (plugins, 
+                                             libraries,
+                                             EXTRACTOR_OPTION_NONE);
+  if (binary != NULL) 
+    {
+      /* FIXME: need full path here now... */
+      name = malloc(strlen(binary) + strlen("libextractor_printable_") + 1);
+      strcpy(name, "libextractor_printable_");
+      strcat(name, binary);
+      plugins = EXTRACTOR_plugin_add_last(plugins,
+                                         name,
+                                         NULL,
+                                         EXTRACTOR_OPTION_NONE);
+      free(name);
+    }
+  if (hash != NULL) 
+    {
+      /* FIXME: need full path here now... */
+      name = malloc(strlen(hash) + strlen("libextractor_hash_") + 1);
+      strcpy(name, "libextractor_hash_");
+      strcat(name, hash);
+      plugins = EXTRACTOR_plugin_add_last(plugins,
+                                         name,
+                                         NULL,
+                                         EXTRACTOR_OPTION_NONE);
+      free(name);
+    }
 
-  if (binary != NULL) {
-    char * name;
-    name = malloc(strlen(binary) + strlen("libextractor_printable_") + 1);
-    strcpy(name, "libextractor_printable_");
-    strcat(name, binary);
-    extractors = EXTRACTOR_addLibraryLast(extractors,
-                                         name);
-    free(name);
-  }
-  if (hash != NULL) {
-    char * name;
-    name = malloc(strlen(hash) + strlen("libextractor_hash_") + 1);
-    strcpy(name, "libextractor_hash_");
-    strcat(name, hash);
-    extractors = EXTRACTOR_addLibraryLast(extractors,
-                                         name);
-    free(name);
-  }
+  if (processor == NULL)
+    processor = &print_selected_keywords;
 
-  if (splitKeywords == YES)
-    extractors = EXTRACTOR_addLibraryLast(extractors,
-                                         "libextractor_split");
-
   /* extract keywords */
-  if ( bibtex == YES )
+  if (bibtex == YES)
     fprintf(stdout,
            _("%% BiBTeX file\n"));
   for (i = optind; i < argc; i++) {
     errno = 0;
-    keywords = EXTRACTOR_getKeywords (extractors, argv[i]);
+    if (grepfriendly == YES)
+      fprintf (stdout, "%s ", argv[i]);
+    else if (bibtex == NO)
+      fprintf (stdout,
+              _("Keywords for file %s:\n"),
+              argv[i]);
+    else
+      start_bibtex ();
+    EXTRACTOR_extract (plugins,
+                      argv[i],
+                      NULL, 0,
+                      processor,
+                      NULL);    
     if (0 != errno) {
       if (verbose > 0) {
        fprintf(stderr,
@@ -671,34 +743,21 @@
                argv[0], argv[i], strerror(errno));
       }
       ret = 1;
+      if (grepfriendly == YES)
+       fprintf (stdout, "\n");
       continue;
     }
-    if ( (duplicates != -1) || (bibtex == YES))
-      keywords = EXTRACTOR_removeDuplicateKeywords (keywords, duplicates);
-    if ( (verbose > 0) 
-        && (bibtex == NO) ) {
-      if (grepfriendly == YES)
-       printf ("%s ", argv[i]);
-      else
-       printf (_("Keywords for file %s:\n"),
-               argv[i]);
-    }
-    if (bibtex == YES)
-      printSelectedKeywordsBibtex (stdout, keywords, print, argv[i]);
-    else if (grepfriendly == YES)
-      printSelectedKeywordsGrepFriendly(stdout, keywords, print, verbose);
-    else
-      printSelectedKeywords (stdout, keywords, print, verbose);
-    if (verbose > 0 && bibtex == NO)
+    if (grepfriendly == YES)
+      fprintf (stdout, "\n");
+    if (bibtex)
+      finish_bibtex (argv[i]);
+    if (verbose > 0)
       printf ("\n");
-    EXTRACTOR_freeKeywords (keywords);
   }
   free (print);
-  EXTRACTOR_removeAll (extractors);
-
-#ifdef MINGW
-  ShutdownWinEnv();
-#endif
-
+  EXTRACTOR_plugin_remove_all (plugins);
+  start_bibtex (); /* actually free's stuff */
   return ret;
 }
+
+/* end of extract.c */

Modified: Extractor/src/main/extractor.c
===================================================================
--- Extractor/src/main/extractor.c      2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/src/main/extractor.c      2009-12-13 23:02:19 UTC (rev 9746)
@@ -20,8 +20,33 @@
 
 #include "platform.h"
 #include "extractor.h"
-#include <pthread.h>
+#include <dirent.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/shm.h>
+#include <signal.h>
 
+
+/**
+ * How many bytes do we actually try to scan? (from the beginning
+ * of the file).  Limit to 32 MB.
+ * The product is parenthesized so that the macro stays a single
+ * operand when it is used next to '/', '%' or similar operators.
+ */
+#define MAX_READ (32 * 1024 * 1024)
+
+/**
+ * How many bytes do we actually try to decompress? (from the beginning
+ * of the file).  Limit to 16 MB.
+ * Parenthesized for the same reason as MAX_READ.
+ */
+#define MAX_DECOMPRESS (16 * 1024 * 1024)
+
+/**
+ * Maximum length of a Mime-Type string.
+ */
+#define MAX_MIME_LEN 256
+
+#define DEBUG 0
+
+
 #if HAVE_LTDL_H
 #include <ltdl.h>
 #else
@@ -36,20 +61,171 @@
 #include <zlib.h>
 #endif
 
-#define DEBUG 0
 
+/**
+ * Pair of English descriptions for one meta type.
+ */
+struct MetaTypeDescription
+{
+  const char *short_description; /* short name; this is what EXTRACTOR_metatype_to_string returns */
+
+  const char *long_description; /* longer explanation of the meta type */
+};
+
+
 /**
  * The sources of keywords as strings.
  */
-static const char *keywordTypes[] = {
-  gettext_noop("unknown"), /* 0 */
-  gettext_noop("filename"),
-  gettext_noop("mimetype"),
-  gettext_noop("title"),
+static const struct MetaTypeDescription meta_type_descriptions[] = {
+  /* 0 */
+  { gettext_noop ("reserved"),
+    gettext_noop ("reserved value, do not use") },
+  { gettext_noop ("mimetype"),
+    gettext_noop ("mime type") },
+  { gettext_noop ("embedded filename"),
+    gettext_noop ("filename that was embedded (not necessarily the current 
filename)") },
+  { gettext_noop ("comment"),
+    gettext_noop ("comment about the content") },
+  { gettext_noop ("title"),
+    gettext_noop ("title of the work")},
+  /* 5 */
+  { gettext_noop ("book title"),
+    gettext_noop ("title of the book containing the work") },
+  { gettext_noop ("book edition"),
+    gettext_noop ("edition of the book (or book containing the work)") },
+  { gettext_noop ("book chapter"),
+    gettext_noop ("chapter number") },
+  { gettext_noop ("journal name"),
+    gettext_noop ("journal or magazine the work was published in") },
+  { gettext_noop ("journal volume"),    
+    gettext_noop ("volume of a journal or multi-volume book") },
+  /* 10 */
+  { gettext_noop ("journal number"),    
+    gettext_noop ("number of a journal, magazine or tech-report") },
+  { gettext_noop ("page count"),
+    gettext_noop ("total number of pages of the work") },
+  { gettext_noop ("page range"),
+    gettext_noop ("page numbers of the publication in the respective journal 
or book") },
+  { gettext_noop ("author name"),
+    gettext_noop ("name of the author(s)") },
+  { gettext_noop ("author email"),
+    gettext_noop ("e-mail of the author(s)") },
+  /* 15 */
+  { gettext_noop ("author institution"),
+    gettext_noop ("institution the author worked for") },
+  { gettext_noop ("publisher"),
+    gettext_noop ("name of the publisher") },
+  { gettext_noop ("publisher's address"),
+    gettext_noop ("Address of the publisher (often only the city)") },
+  { gettext_noop ("publishing institution"),
+    gettext_noop ("institution that was involved in the publishing, but not 
necessarily the publisher") },
+  { gettext_noop ("publication series"),
+    gettext_noop ("series of books the book was published in") },
+  /* 20 */
+  { gettext_noop ("publication type"),
+    gettext_noop ("type of the tech-report") },
+  { gettext_noop ("publication year"),
+    gettext_noop ("year of publication (or, if unpublished, the year of 
creation)") },
+  { gettext_noop ("publication month"),
+    gettext_noop ("month of publication (or, if unpublished, the month of 
creation)") },
+  { gettext_noop ("publication day"),
+    gettext_noop ("day of publication (or, if unpublished, the day of 
creation), relative to the given month") },
+  { gettext_noop ("publication date"),
+    gettext_noop ("date of publication (or, if unpublished, the date of 
creation)") },
+  /* 25 */
+  { gettext_noop ("bibtex eprint"),
+    gettext_noop ("specification of an electronic publication") },
+  { gettext_noop ("bibtex entry type"),
+    gettext_noop ("type of the publication for bibTeX bibliographies") },
+  { gettext_noop ("language"),
+    gettext_noop ("language the work uses") },
+  { gettext_noop ("creation time"),
+    gettext_noop ("time and date of creation") },
+  { gettext_noop ("URL"),
+    gettext_noop ("universal resource location (where the work is made 
available)") },
+  /* 30 */
+  { gettext_noop ("URI"),
+    gettext_noop ("universal resource identifier") },
+  { gettext_noop ("international standard recording code"),
+    gettext_noop ("ISRC number identifying the work") },
+  { gettext_noop ("MD4"),
+    gettext_noop ("MD4 hash") },
+  { gettext_noop ("MD5"),
+    gettext_noop ("MD5 hash") },
+  { gettext_noop ("SHA-0"),
+    gettext_noop ("SHA-0 hash") },
+  /* 35 */
+  { gettext_noop ("SHA-1"), 
+    gettext_noop ("SHA-1 hash") },
+  { gettext_noop ("RipeMD160"),
+    gettext_noop ("RipeMD160 hash") },
+  { gettext_noop ("GPS latitude ref"),
+    gettext_noop ("GPS latitude ref") },
+  { gettext_noop ("GPS latitude"),
+    gettext_noop ("GPS latitude") },
+  { gettext_noop ("GPS longitude ref"),
+    gettext_noop ("GPS longitude ref") },
+  /* 40 */
+  { gettext_noop ("GPS longitude"),
+    gettext_noop ("GPS longitude") },
+  { gettext_noop ("city"),
+    gettext_noop ("name of the city where the document originated") },
+  { gettext_noop ("sublocation"), 
+    gettext_noop ("more specific location of the geographic origin") },
+  { gettext_noop ("country"),
+    gettext_noop ("name of the country where the document originated") },
+  { gettext_noop ("country code"),
+    gettext_noop ("ISO 2-letter country code for the country of origin") },
+  /* 45 */
+  { gettext_noop ("unknown"),
+    gettext_noop ("specifics are not known") },
+  { gettext_noop ("description"),
+    gettext_noop ("description") },
+  { gettext_noop ("copyright"),
+    gettext_noop ("copyright information") },
+  { gettext_noop ("rights"),
+    gettext_noop ("information about rights") },
+  { gettext_noop ("keywords"),
+    gettext_noop ("keywords") },
+  /* 50 */
+  { gettext_noop ("abstract"),
+    gettext_noop ("abstract") },
+  { gettext_noop ("summary"),
+    gettext_noop ("summary") },
+  { gettext_noop ("subject"),
+    gettext_noop ("subject matter") },
+  { gettext_noop ("creator"),
+    gettext_noop ("name of the person who created the document") },
+  { gettext_noop ("format"),
+    gettext_noop ("name of the document format") },
+  /* 55 */
+  { gettext_noop ("format version"),
+    gettext_noop ("version of the document format") },
+  { gettext_noop ("created by software"),
+    gettext_noop ("name of the software that created the document") },
+  { gettext_noop ("unknown date"),
+    gettext_noop ("ambiguous date (could specify creation time, modification 
time or access time)") },
+  { gettext_noop ("creation date"),
+    gettext_noop ("date the document was created") },
+  { gettext_noop ("modification date"),
+    gettext_noop ("date the document was modified") },
+  /* 60 */
+  { gettext_noop ("last printed"),
+    gettext_noop ("date the document was last printed") },
+  { gettext_noop ("last saved by"),
+    gettext_noop ("name of the user who saved the document last") },
+  { gettext_noop ("total editing time"),
+    gettext_noop ("time spent editing the document") },
+  { gettext_noop ("editing cycles"),
+    gettext_noop ("number of editing cycles") },
+  { gettext_noop ("modified by software"),
+    gettext_noop ("name of software making modifications") },
+  /* 65 */
+  { gettext_noop ("revision history"),
+    gettext_noop ("information about the revision history") },
+
+#if 0
+  
   gettext_noop("author"),
   gettext_noop("artist"), /* 5 */
   gettext_noop("description"),
-  gettext_noop("comment"),
   gettext_noop("date"),
   gettext_noop("publisher"),
   gettext_noop("language"), /* 10 */
@@ -94,11 +270,6 @@
   gettext_noop("build-host"),
   gettext_noop("operating system"), /* 50 */
   gettext_noop("dependency"),
-  gettext_noop("MD4"),
-  gettext_noop("MD5"),
-  gettext_noop("SHA-0"),
-  gettext_noop("SHA-1"), /* 55 */
-  gettext_noop("RipeMD160"),
   gettext_noop("resolution"),
   gettext_noop("category"),
   gettext_noop("book title"),
@@ -143,7 +314,6 @@
   gettext_noop("created by software"),
   gettext_noop("modified by software"),
   gettext_noop("revision history"), /* 100 */
-  gettext_noop("lower case conversion"),
   gettext_noop("company"),
   gettext_noop("generator"),
   gettext_noop("character set"),
@@ -175,137 +345,131 @@
   gettext_noop("ripper"), /* 130 */
   gettext_noop("filesize"),
   gettext_noop("track number"),
-  gettext_noop("international standard recording code"),
   gettext_noop("disc number"), 
   gettext_noop("preferred display style (GNUnet)"), /* 135 */
   gettext_noop("GNUnet URI of ECBC data"),
   gettext_noop("Complete file data (for non-binary files only)"),
-  gettext_noop("city"),
-  gettext_noop("country"),
-  gettext_noop("sublocation"), /* 140 */
-  gettext_noop("GPS latitude ref"),
-  gettext_noop("GPS latitude"),
-  gettext_noop("GPS longitude ref"),
-  gettext_noop("GPS longitude"),
   gettext_noop("rating"), /* 145 */
-  gettext_noop("country code"),
-  NULL
+
+#endif
 };
 
-/* the number of keyword types (for bounds-checking) */
-#define HIGHEST_TYPE_NUMBER 147
+/**
+ * Total number of keyword types (for bounds-checking) 
+ */
+#define HIGHEST_METATYPE_NUMBER (sizeof (meta_type_descriptions) / sizeof(*meta_type_descriptions))
 
-#ifdef HAVE_LIBOGG
-#if HAVE_VORBIS
-#define WITH_OGG 1
-#endif
-#endif
 
-#if HAVE_VORBISFILE
-#define WITH_OGG 1
-#endif
+/**
+ * Get the short textual name of a meta data type.
+ *
+ * @param type meta type to get a UTF-8 string for
+ * @return NULL if the type is outside [0, HIGHEST_METATYPE_NUMBER),
+ *         otherwise the English (locale: C) short name of the type;
+ *         translate using 'dgettext ("libextractor", rval)'
+ */
+const char *
+EXTRACTOR_metatype_to_string(enum EXTRACTOR_MetaType type)
+{
+  if ((type < 0) || (type >= HIGHEST_METATYPE_NUMBER))
+    return NULL; /* not an index into meta_type_descriptions */
+  return meta_type_descriptions[type].short_description;
+}
 
-#if HAVE_EXIV2
-#define EXSO "libextractor_exiv2:"
-#else
-#define EXSO ""
-#endif
 
-#if WITH_OGG
-#define OGGSO "libextractor_ogg:"
-#else
-#define OGGSO ""
-#endif
+/**
+ * Get the long description of a meta data type.
+ *
+ * @param type meta type to get a UTF-8 description for
+ * @return NULL if the type is outside [0, HIGHEST_METATYPE_NUMBER),
+ *         otherwise the English (locale: C) long description of the type;
+ *         translate using 'dgettext ("libextractor", rval)'
+ */
+const char *
+EXTRACTOR_metatype_to_description(enum EXTRACTOR_MetaType type)
+{
+  if ((type < 0) || (type >= HIGHEST_METATYPE_NUMBER))
+    return NULL; /* not an index into meta_type_descriptions */
+  return meta_type_descriptions[type].long_description;
+}
 
-#if HAVE_FLAC
-#define FLACSO "libextractor_flac:"
-#else
-#define FLACSO ""
-#endif
 
-#if HAVE_ZLIB
-#define QTSO "libextractor_qt:"
-#else
-#define QTSO ""
-#endif
+/**
+ * Return the number of known meta types; valid types are in [0,max).
+ *
+ * @return exclusive upper bound, i.e. one past the highest legal metatype number
+ */
+enum EXTRACTOR_MetaType
+EXTRACTOR_metatype_get_max ()
+{
+  return HIGHEST_METATYPE_NUMBER;
+}
 
-#if HAVE_GSF
-#define OLESO "libextractor_ole2:"
-#else
-#define OLESO ""
-#endif
 
-#if HAVE_MPEG2
-#define MPEGSO "libextractor_mpeg:"
-#else
-#define MPEGSO ""
-#endif 
+/**
+ * Linked list of extractor plugins.  An application builds this list
+ * by telling libextractor to load various keyword-extraction
+ * plugins. Libraries can also be unloaded (removed from this list,
+ * see EXTRACTOR_plugin_remove).
+ */
+struct EXTRACTOR_PluginList
+{
+  /**
+   * Next plugin in the list, NULL at the end of the list.
+   */
+  struct EXTRACTOR_PluginList *next;
 
-/* ATTN: order matters (for performance!) since
-   mime-types can be used to avoid parsing once
-   the type has been established! */
-#define DEFSO \
-"libextractor_html:\
-libextractor_man:\
-libextractor_ps:\
-libextractor_pdf:\
-libextractor_mp3:\
-libextractor_id3v2:\
-libextractor_id3v23:\
-libextractor_id3v24:\
-libextractor_mime:\
-libextractor_tar:\
-libextractor_dvi:\
-libextractor_deb:\
-libextractor_png:\
-libextractor_gif:\
-libextractor_wav:\
-libextractor_flv:\
-libextractor_real:\
-libextractor_jpeg:\
-libextractor_tiff:\
-libextractor_zip:\
-libextractor_rpm:\
-libextractor_riff:\
-libextractor_applefile:\
-libextractor_elf:\
-libextractor_oo:\
-libextractor_asf:\
-libextractor_sid:\
-libextractor_nsfe:\
-libextractor_nsf:\
-libextractor_it:\
-libextractor_xm:\
-libextractor_s3m"
+  /**
+   * Handle of the loaded plugin (as returned by lt_dlopenadvise).
+   */
+  void * libraryHandle;
 
-#define DEFAULT_LIBRARIES MPEGSO EXSO OLESO OGGSO FLACSO QTSO DEFSO
+  /**
+   * Copy of the library name the plugin was added with.
+   */
+  char *libname;
+  
+  /**
+   * Pointer to the function used for meta data extraction.
+   */
+  EXTRACTOR_ExtractMethod extractMethod;
 
-const char * EXTRACTOR_getDefaultLibraries() {
-  return DEFAULT_LIBRARIES;
-}
+  /**
+   * Copy of the options string for the plugin, NULL if none was given.
+   */
+  char * plugin_options;
 
-/* determine installation path */
+  /**
+   * Flags to control how the plugin is executed.
+   */
+  enum EXTRACTOR_Options flags;
 
-static char * cut_bin(char * in) {
-  size_t p;
+  /**
+   * Process ID of the child process for this plugin. 0 for 
+   * none.
+   */
+  pid_t cpid;
 
-  if (in == NULL)
-    return NULL;
-  p = strlen(in);
-  if (p > 4) {
-    if ( (in[p-1] == '/') ||
-        (in[p-1] == '\\') )
-      in[--p] = '\0';
-    if (0 == strcmp(&in[p-3],
-                   "bin")) {
-      in[p-3] = '\0';
-      p -= 3;
-    }
-  }
-  return in;
-}
+  /**
+   * Pipe used to send information about shared memory segments to
+   * the child process.  NULL if not initialized.
+   */
+  FILE *cpipe_in;
 
-static char * cut_lib(char * in) {
+  /**
+   * Pipe used to read information about extracted meta data from
+   * the child process.  -1 if not initialized.
+   */
+  int cpipe_out;
+
+};
+
+
+/**
+ * Remove a trailing '/bin' from in (if present).
+ */
+static char * 
+cut_bin(char * in) {
   size_t p;
 
   if (in == NULL)
@@ -316,7 +480,7 @@
         (in[p-1] == '\\') )
       in[--p] = '\0';
     if (0 == strcmp(&in[p-3],
-                   "lib")) {
+                   "bin")) {
       in[p-3] = '\0';
       p -= 3;
     }
@@ -324,7 +488,6 @@
   return in;
 }
 
-
 #if LINUX
 /**
  * Try to determine path by reading /proc/PID/exe or
@@ -490,404 +653,337 @@
   return NULL;
 }
 
+
+/**
+ * Callback invoked by get_installation_paths for each candidate path.
+ * 
+ * @param cls closure
+ * @param path a directory path
+ */
+typedef void (*PathProcessor)(void *cls,
+                             const char *path);
+
+
+/**
+ * Create a filename by appending 'fname' to 'path', separated by the
+ * platform's directory separator ('\' on MINGW, '/' elsewhere).
+ * @param path the base path
+ * @param fname the filename to append
+ * @return newly allocated '$path/$fname'; the caller must free() it
+ */
 static char *
-get_path_from_ENV_PREFIX() {
-  const char * p;
+append_to_dir (const char *path,
+              const char *fname)
+{
+  char *ret;
 
-  p = getenv("LIBEXTRACTOR_PREFIX");
-  if (p != NULL) {
-    char * s = malloc(strlen(p) + 6);
-    if (s != NULL) {
-      int len;
-      strcpy(s, p);
-      s = cut_bin(cut_lib(s));
-      len = strlen(s);
-      s = realloc(s, len + 6);
-      if (len > 0 && s[len-1] != '/')
-        strcat(s, "/lib/");
-      else
-        strcat(s, "lib/");
-      return s;
-    }
-  }
-  return NULL;
+  ret = malloc (strlen (path) + strlen(fname) + 2); /* +2: separator and NUL; NOTE(review): result is not checked for NULL */
+  sprintf (ret,
+#ifdef MINGW
+          "%s\\%s", /* "\\" is one backslash; the former "\%" was not a valid escape */
+#else
+          "%s/%s",
+#endif
+          path, 
+          fname);
+  return ret;
 }
 
-/*
- * @brief get the path to the plugin directory
- * @return a pointer to the dir path (to be freed by the caller)
+
+/**
+ * Iterate over all paths where we expect to find GNU libextractor
+ * plugins: every ':'-separated entry of LIBEXTRACTOR_PREFIX if that is
+ * set (passed to pp as-is), otherwise one detected prefix plus PLUGINDIR.
+ * @param pp function to call for each path
+ * @param pp_cls cls argument for pp.
  */
-static char * os_get_installation_path() {
-  size_t n;
-  char * tmp;
-  char * lpref;
-  char * pexe;
-  char * modu;
-  char * dima;
+static void
+get_installation_paths (PathProcessor pp,
+                       void *pp_cls)
+{
+  const char *p;
   char * path;
+  char * prefix;
+  char * d;
 
-  lpref = get_path_from_ENV_PREFIX();
+  prefix = NULL;
+  p = getenv("LIBEXTRACTOR_PREFIX");
+  if (p != NULL)
+    {
+      d = strdup (p); /* strtok modifies its argument, so work on a copy */
+      prefix = strtok (d, ":");
+      while (NULL != prefix)
+       {
+         pp (pp_cls, prefix);
+         prefix = strtok (NULL, ":");
+       }
+      free (d);
+      return; /* the environment variable overrides all auto-detection below */
+    }
 #if LINUX
-  pexe = get_path_from_proc_exe();
-#else
-  pexe = NULL;
+  if (prefix == NULL)
+    prefix = get_path_from_proc_exe();
 #endif
 #if WINDOWS
-  modu = get_path_from_module_filename();
-#else
-  modu = NULL;
+  if (prefix == NULL)
+    prefix = get_path_from_module_filename();
 #endif
 #if DARWIN
-  dima = get_path_from_dyld_image();
-  path = NULL;
-#else
-  dima = NULL;
-  path = get_path_from_PATH();
+  if (prefix == NULL)
+    prefix = get_path_from_dyld_image();
 #endif
-  n = 1;
-  if (lpref != NULL)
-    n += strlen(lpref) + strlen(PLUGINDIR "/:");
-  if (pexe != NULL)
-    n += strlen(pexe) + strlen(PLUGINDIR "/:");
-  if (modu != NULL)
-    n += strlen(modu) + strlen(PLUGINDIR "/:");
-  if (dima != NULL)
-    n += strlen(dima) + strlen(PLUGINDIR "/:");
-  if (path != NULL)
-    n += strlen(path) + strlen(PLUGINDIR "/:");
-  tmp = malloc(n);
-  tmp[0] = '\0';
-  if (lpref != NULL) {
-    strcat(tmp, lpref);
-    strcat(tmp, PLUGINDIR "/:");
-    free(lpref);
-  }
-  if (pexe != NULL) {
-    strcat(tmp, pexe);
-    strcat(tmp, PLUGINDIR "/:");
-    free(pexe);
-  }
-  if (modu != NULL) {
-    strcat(tmp, modu);
-    strcat(tmp, PLUGINDIR "/:");
-    free(modu);
-  }
-  if (dima != NULL) {
-    strcat(tmp, dima);
-    strcat(tmp, PLUGINDIR "/:");
-    free(dima);
-  }
-  if (path != NULL) {
-    strcat(tmp, path);
-    strcat(tmp, PLUGINDIR "/:");
-    free(path);
-  }
-  if (strlen(tmp) > 0)
-    tmp[strlen(tmp)-1] = '\0';
-  if (strlen(tmp) == 0) {
-    free(tmp);
-    return NULL;
-  }
-  return tmp;
+  if (prefix == NULL)
+    prefix = get_path_from_PATH(); /* last resort, tried on every platform */
+  if (prefix == NULL)
+    return;
+  if (prefix != NULL) /* always true here: the NULL case returned just above */
+    {
+      path = append_to_dir (prefix, PLUGINDIR);
+      pp (pp_cls, path);
+      free (path);
+      free (prefix);
+      return;
+    }
 }
 
 
-/* ************library initialization ***************** */
+struct DefaultLoaderContext /* closure handed to load_plugins_from_dir */
+{
+  struct EXTRACTOR_PluginList *res; /* plugin list built up so far */
+  enum EXTRACTOR_Options flags; /* flags passed to every EXTRACTOR_plugin_add call */
+};
 
-static char * old_dlsearchpath = NULL;
 
-/* using libtool, needs init! */
-void __attribute__ ((constructor)) le_ltdl_init() {
-  int err;
-  const char * opath;
-  char * path;
-  char * cpath;
-
-#if ENABLE_NLS
-  BINDTEXTDOMAIN(PACKAGE, LOCALEDIR);
-  BINDTEXTDOMAIN("iso-639", ISOLOCALEDIR); /* used by wordextractor */
-#endif
-  err = lt_dlinit ();
-  if (err > 0) {
-#if DEBUG
-    fprintf(stderr,
-           _("Initialization of plugin mechanism failed: %s!\n"),
-           lt_dlerror());
-#endif
-    return;
-  }
-  opath = lt_dlgetsearchpath();
-  if (opath != NULL)
-    old_dlsearchpath = strdup(opath);
-  path = os_get_installation_path();
-  if (path != NULL) {
-    if (opath != NULL) {
-      cpath = malloc(strlen(path) + strlen(opath) + 4);
-      strcpy(cpath, opath);
-      strcat(cpath, ":");
-      strcat(cpath, path);
-      lt_dlsetsearchpath(cpath);
-      free(path);
-      free(cpath);
-    } else {
-      lt_dlsetsearchpath(path);
-      free(path);
-    }
-  }
-#ifdef MINGW
-  InitWinEnv();
-#endif
-}
-
-void __attribute__ ((destructor)) le_ltdl_fini() {
-  lt_dlsetsearchpath(old_dlsearchpath);
-  if (old_dlsearchpath != NULL) {
-    free(old_dlsearchpath);
-    old_dlsearchpath = NULL;
-  }
-#ifdef MINGW
-  ShutdownWinEnv();
-#endif
-  lt_dlexit ();
-}
-
 /**
- * Open a file
+ * Try to add every entry of the given directory as a plugin.
+ * Entries starting with '.' and libtool ".la" files are skipped.
+ * @param cls the "struct DefaultLoaderContext*" whose 'res' list is extended
+ * @param path path to a directory with plugins
  */
-static int fileopen(const char *filename, int oflag, ...)
+static void
+load_plugins_from_dir (void *cls,
+                      const char *path)
 {
-  int mode;
-  char *fn;
+  struct DefaultLoaderContext *dlc = cls;
+  DIR *dir;
+  struct dirent *ent;
+  char *fname;
+  const char *la;
 
-#ifdef MINGW
-  char szFile[_MAX_PATH + 1];
-  long lRet;
-
-  if ((lRet = plibc_conv_to_win_path(filename, szFile)) != ERROR_SUCCESS)
-  {
-    errno = ENOENT;
-    SetLastError(lRet);
-
-    return -1;
-  }
-  fn = szFile;
-#else
-  fn = (char *) filename;
-#endif
-
-  if (oflag & O_CREAT)
-  {
-    va_list arg;
-    va_start(arg, oflag);
-    mode = va_arg(arg, int);
-    va_end(arg);
-  }
-  else
-  {
-    mode = 0;
-  }
-
-#ifdef MINGW
-  /* Set binary mode */
-  mode |= O_BINARY;
-#endif
-
-  return open(fn, oflag, mode);
+  dir = opendir (path);
+  if (NULL == dir)
+    return; /* unreadable or missing directory: silently load nothing */
+  while (NULL != (ent = readdir (dir)))
+    {
+      if (ent->d_name[0] == '.')
+       continue; /* skips ".", ".." and hidden files */
+      if ( (NULL != (la = strstr (ent->d_name, ".la"))) &&
+          (la[3] == '\0') )
+       continue; /* skip libtool '.la' files; every other entry is tried */
+      fname = append_to_dir (path, ent->d_name);
+      dlc->res = EXTRACTOR_plugin_add (dlc->res,
+                                      fname,
+                                      NULL,
+                                      dlc->flags);
+      free (fname);
+    }
+  closedir (dir);
 }
 
 
-
 /**
- * Load the default set of libraries. The default set of
- * libraries consists of the libraries that are part of
- * the libextractor distribution (except split and filename
- * extractor) plus the extractors that are specified
- * in the environment variable "LIBEXTRACTOR_LIBRARIES".
+ * Load the default set of plugins. The default can be changed
+ * by setting the LIBEXTRACTOR_LIBRARIES environment variable.
+ * If that variable is set, its value 'env' is used as the config and
+ * this function returns EXTRACTOR_plugin_add_config (NULL, env, flags).
+ * Otherwise, it will load all of the installed plugins and return them.
  *
- * @return the default set of libraries.
+ * @param flags options for all of the plugins loaded
+ * @return the default set of plugins, NULL if no plugins were found
  */
-EXTRACTOR_ExtractorList *
-EXTRACTOR_loadDefaultLibraries ()
+struct EXTRACTOR_PluginList * 
+EXTRACTOR_plugin_add_defaults(enum EXTRACTOR_Options flags)
 {
+  struct DefaultLoaderContext dlc;
   char *env;
-  char *tmp;
-  EXTRACTOR_ExtractorList *res;
 
-
   env = getenv ("LIBEXTRACTOR_LIBRARIES");
-  if (env == NULL)
-    {
-      return EXTRACTOR_loadConfigLibraries (NULL, DEFAULT_LIBRARIES);
-    }
-  tmp = malloc (strlen (env) + strlen (DEFAULT_LIBRARIES) + 2);
-  strcpy (tmp, env);
-  strcat (tmp, ":");
-  strcat (tmp, DEFAULT_LIBRARIES);
-  res = EXTRACTOR_loadConfigLibraries (NULL, tmp);
-  free (tmp);
-  return res;
+  if (env != NULL)
+    return EXTRACTOR_plugin_add_config (NULL, env, flags);
+  dlc.res = NULL;
+  dlc.flags = flags;
+  get_installation_paths (&load_plugins_from_dir,
+                         &dlc); /* the callback appends to dlc.res */
+  return dlc.res;
 }
 
+
 /**
- * Get the textual name of the keyword.
- * @return NULL if the type is not known
+ * Resolve the extraction function 'EXTRACTOR_<name>_extract' of a plugin.
+ * '<name>' is the text after the last '_' in 'prefix', up to the first '.'.
+ *
+ * @param lib_handle library to search for the symbol
+ * @param prefix file name of the plugin (i.e. 'libextractor_foo.so')
+ * @return NULL on error, otherwise pointer to the symbol
  */
-const char *
-EXTRACTOR_getKeywordTypeAsString(const EXTRACTOR_KeywordType type)
+static void *
+get_symbol_with_prefix(void *lib_handle,
+                      const char *prefix)
 {
-  if ((type >= 0) && (type < HIGHEST_TYPE_NUMBER))
-    return keywordTypes[type];
-  else
+  char *name;
+  void *symbol;
+  const char *sym_name;
+  char *sym;
+  char *dot;
+
+  sym_name = strrchr (prefix, '_'); /* last '_': directories in the path may contain '_' too */
+  if (sym_name == NULL)
     return NULL;
-}
-
-static pthread_mutex_t ltdl_lock = PTHREAD_MUTEX_INITIALIZER;
-
-#define LTDL_MUTEX_LOCK                     \
-  if (pthread_mutex_lock (&ltdl_lock) != 0) \
-    abort();
-#define LTDL_MUTEX_UNLOCK                     \
-  if (pthread_mutex_unlock (&ltdl_lock) != 0) \
-    abort();
-
-static void *getSymbolWithPrefix(void *lib_handle,
-                                 const char *lib_name,
-                                 const char *sym_name)
-{
-  size_t name_size
-    = strlen(lib_name)
-    + strlen(sym_name)
-    + 1 /* for the zero delim. */
-    + 1 /* for the optional '_' prefix */;
-  char *name=malloc(name_size),*first_error;
-  void *symbol=NULL;
-
-  snprintf(name,
-          name_size,
-          "_%s%s",
-          lib_name,
-          sym_name);
-
-  LTDL_MUTEX_LOCK
-  symbol=lt_dlsym(lib_handle,name+1 /* skip the '_' */);
-  if (symbol==NULL) {
-    first_error=strdup(lt_dlerror());
-    symbol=lt_dlsym(lib_handle,name /* now try with the '_' */);
+  sym_name++;
+  sym = strdup (sym_name);
+  dot = strstr (sym, ".");
+  if (dot != NULL)
+    *dot = '\0'; /* cut the file extension */
+  name = malloc(strlen(sym) + 32); /* 32 covers "_EXTRACTOR_" + "_extract" + NUL */
+  sprintf(name,
+         "_EXTRACTOR_%s_extract",
+         sym);
+  free (sym);
+  /* try without '_' first */
+  symbol = lt_dlsym(lib_handle, name + 1);
+  if (symbol==NULL) 
+    {
+      /* now try with the '_' */
 #if DEBUG
-    fprintf(stderr,
-           _("Resolving symbol `%s' in library `%s' failed, "
-             "so I tried `%s', but that failed also.  Errors are: "
-             "`%s' and `%s'.\n"),
-             name+1,
-             lib_name,
-             name,
-             first_error,
-             lt_dlerror());
+      char *first_error = strdup (lt_dlerror());
 #endif
-    free(first_error);
-  }
-  LTDL_MUTEX_UNLOCK
+      symbol = lt_dlsym(lib_handle, name);
+#if DEBUG
+      if (NULL == symbol)
+       {
+         fprintf(stderr,
+                 "Resolving symbol `%s' failed, "
+                 "so I tried `%s', but that failed also.  Errors are: "
+                 "`%s' and `%s'.\n",
+                 name+1,
+                 name,
+                 first_error,
+                 lt_dlerror());
+       }
+      free(first_error);
+#endif
+    }
   free(name);
   return symbol;
 }
 
+
 /**
- * Load a dynamic library.
- * @return 1 on success, -1 on error
+ * Load a plugin and look up its extraction function.
+ *
+ * @param name file name of the plugin, handed to lt_dlopenadvise
+ * @param libHandle set to the handle for the plugin
+ * @param method set to the extraction method
+ * @return 0 on success, -1 if opening or the symbol lookup failed
  */
 static int
-loadLibrary (const char *name,
+plugin_load (const char *name,
             void **libHandle,
-            ExtractMethod * method)
+            EXTRACTOR_ExtractMethod * method)
 {
   lt_dladvise advise;
 
-  LTDL_MUTEX_LOCK
-  lt_dladvise_init(&advise);
-  lt_dladvise_ext(&advise);
-  lt_dladvise_local(&advise);
+  lt_dladvise_init (&advise);
+  lt_dladvise_ext (&advise);
+  lt_dladvise_local (&advise);
   *libHandle = lt_dlopenadvise (name, advise);
   lt_dladvise_destroy(&advise);
   if (*libHandle == NULL)
     {
 #if DEBUG
       fprintf (stderr,
-              _("Loading `%s' plugin failed: %s\n"),
+              "Loading `%s' plugin failed: %s\n",
               name,
               lt_dlerror ());
 #endif
-      LTDL_MUTEX_UNLOCK
       return -1;
     }
-  LTDL_MUTEX_UNLOCK
-
-  *method = (ExtractMethod) getSymbolWithPrefix (*libHandle, name, "_extract");
-  if (*method == NULL) {
-    LTDL_MUTEX_LOCK
-    lt_dlclose (*libHandle);
-    LTDL_MUTEX_UNLOCK
-    return -1;
-  }
-  return 1;
+  *method = get_symbol_with_prefix (*libHandle, name);
+  if (*method == NULL) 
+    {
+      lt_dlclose (*libHandle); /* no extract function: do not keep the library open */
+      return -1;
+    }
+  return 0;
 }
 
-/* Internal function that accepts options. */
-static EXTRACTOR_ExtractorList *
-EXTRACTOR_addLibrary2 (EXTRACTOR_ExtractorList * prev,
-                      const char *library, const char *options)
+
+/**
+ * Add a library for keyword extraction at the FRONT of the list.
+ * @param prev the previous list of libraries, may be NULL
+ * @param library the name of the library
+ * @param options options to give to the library, may be NULL
+ * @param flags options to use
+ * @return the new list of libraries, equal to prev iff an error occurred
+ */
+struct EXTRACTOR_PluginList *
+EXTRACTOR_plugin_add (struct EXTRACTOR_PluginList * prev,
+                     const char *library,
+                     const char *options,
+                     enum EXTRACTOR_Options flags)
 {
-  EXTRACTOR_ExtractorList *result;
+  struct EXTRACTOR_PluginList *result;
   void *handle;
-  ExtractMethod method;
+  EXTRACTOR_ExtractMethod method;
 
-  if (-1 == loadLibrary (library, &handle, &method))
+  if (0 != plugin_load (library, &handle, &method))
     return prev;
-  result = malloc (sizeof (EXTRACTOR_ExtractorList));
+  result = malloc (sizeof (struct EXTRACTOR_PluginList)); /* NOTE(review): result is not checked for NULL */
   result->next = prev;
   result->libraryHandle = handle;
   result->extractMethod = method;
   result->libname = strdup (library);
-  if( options )
-    result->options = strdup (options);
+  result->flags = flags; /* NOTE(review): cpid, cpipe_in and cpipe_out are not set here -- confirm they are initialized before use */
+  if (NULL != options)
+    result->plugin_options = strdup (options);
   else
-    result->options = NULL;
+    result->plugin_options = NULL;
   return result;
 }
 
+
 /**
- * Add a library for keyword extraction.
+ * Add a library for keyword extraction at the END of the list.
  * @param prev the previous list of libraries, may be NULL
  * @param library the name of the library
- * @return the new list of libraries, equal to prev iff an error occured
+ * @param options options to give to the library
+ * @param flags options to use
+ * @return the new list of libraries, always equal to prev
+ *         except if prev was NULL and no error occurs
  */
-EXTRACTOR_ExtractorList *
-EXTRACTOR_addLibrary (EXTRACTOR_ExtractorList * prev,
-                     const char *library)
+struct EXTRACTOR_PluginList *
+EXTRACTOR_plugin_add_last(struct EXTRACTOR_PluginList *prev,
+                         const char *library,
+                         const char *options,
+                         enum EXTRACTOR_Options flags)
 {
-  return EXTRACTOR_addLibrary2(prev, library, NULL);
-}
-
-/* Internal function which takes options. */
-static EXTRACTOR_ExtractorList *
-EXTRACTOR_addLibraryLast2 (EXTRACTOR_ExtractorList * prev,
-                          const char *library, const char *options)
-{
-  EXTRACTOR_ExtractorList *result;
-  EXTRACTOR_ExtractorList *pos;
+  struct EXTRACTOR_PluginList *result;
+  struct EXTRACTOR_PluginList *pos;
   void *handle;
-  ExtractMethod method;
+  EXTRACTOR_ExtractMethod method;
 
-  if (-1 == loadLibrary (library, &handle, &method))
+  if (0 != plugin_load (library, &handle, &method))
     return prev;
-  result = malloc (sizeof (EXTRACTOR_ExtractorList));
+  result = malloc (sizeof (struct EXTRACTOR_PluginList));
   result->next = NULL;
   result->libraryHandle = handle;
   result->extractMethod = method;
   result->libname = strdup (library);
   if( options )
-    result->options = strdup (options);
+    result->plugin_options = strdup (options);
   else
-    result->options = NULL;
+    result->plugin_options = NULL;
+  result->flags = flags;
   if (prev == NULL)
     return result;
   pos = prev;
@@ -897,42 +993,33 @@
   return prev;
 }
 
-/**
- * Add a library for keyword extraction at the END of the list.
- * @param prev the previous list of libraries, may be NULL
- * @param library the name of the library
- * @return the new list of libraries, always equal to prev
- *         except if prev was NULL and no error occurs
- */
-EXTRACTOR_ExtractorList *
-EXTRACTOR_addLibraryLast (EXTRACTOR_ExtractorList * prev,
-                         const char *library)
-{
-  return EXTRACTOR_addLibraryLast2(prev, library, NULL);
-}
 
 /**
  * Load multiple libraries as specified by the user.
+ *
  * @param config a string given by the user that defines which
  *        libraries should be loaded. Has the format
- *        "[[-]LIBRARYNAME[:[-]LIBRARYNAME]*]". For example,
- *        libextractor_mp3.so:libextractor_ogg.so loads the
+ *        "[[-]LIBRARYNAME[(options)][:[-]LIBRARYNAME[(options)]]]*".
+ *        For example,
+ *        /usr/lib/libextractor/libextractor_mp3.so:/usr/lib/libextractor/libextractor_ogg.so loads the
  *        mp3 and the ogg library. The '-' before the LIBRARYNAME
  *        indicates that the library should be added to the end
  *        of the library list (addLibraryLast).
  * @param prev the  previous list of libraries, may be NULL
+ * @param flags options to use
  * @return the new list of libraries, equal to prev iff an error occured
  *         or if config was empty (or NULL).
  */
-EXTRACTOR_ExtractorList *
-EXTRACTOR_loadConfigLibraries (EXTRACTOR_ExtractorList * prev,
-                              const char *config)
+struct EXTRACTOR_PluginList *
+EXTRACTOR_plugin_add_config (struct EXTRACTOR_PluginList * prev,
+                            const char *config,
+                            enum EXTRACTOR_Options flags)
 {
   char *cpy;
-  int pos;
-  int last;
-  int lastconf;
-  int len;
+  size_t pos;
+  size_t last;
+  ssize_t lastconf;
+  size_t len;
 
   if (config == NULL)
     return prev;
@@ -966,36 +1053,39 @@
       if (cpy[last] == '-')
        {
          last++;
-         if( lastconf != -1 )
-           prev = EXTRACTOR_addLibraryLast2 (prev, &cpy[last],
-                                             &cpy[lastconf]);
-         else
-           prev = EXTRACTOR_addLibraryLast2 (prev, &cpy[last], NULL);
+         prev = EXTRACTOR_plugin_add_last (prev, 
+                                           &cpy[last],
+                                           (lastconf != -1) ? &cpy[lastconf] : NULL,
+                                           flags);
        }
       else
-       if( lastconf != -1 )
-         prev = EXTRACTOR_addLibrary2 (prev, &cpy[last], &cpy[lastconf]);
-       else
-         prev = EXTRACTOR_addLibrary2 (prev, &cpy[last], NULL);
-
+       {
+         prev = EXTRACTOR_plugin_add (prev, 
+                                      &cpy[last], 
+                                      (lastconf != -1) ? &cpy[lastconf] : NULL,
+                                      flags);
+       }
       last = pos;
     }
   free (cpy);
   return prev;
 }
 
+
 /**
- * Remove a library for keyword extraction.
- * @param prev the current list of libraries
- * @param library the name of the library to remove
- * @return the reduced list, unchanged if the library was not loaded
+ * Remove a plugin from a list.
+ *
+ * @param prev the current list of plugins
+ * @param library the name of the plugin to remove
+ * @return the reduced list, unchanged if the plugin was not loaded
  */
-EXTRACTOR_ExtractorList *
-EXTRACTOR_removeLibrary(EXTRACTOR_ExtractorList * prev,
-                       const char *library)
+struct EXTRACTOR_PluginList *
+EXTRACTOR_plugin_remove(struct EXTRACTOR_PluginList * prev,
+                       const char * library)
 {
-  EXTRACTOR_ExtractorList *pos;
-  EXTRACTOR_ExtractorList *first;
+  struct EXTRACTOR_PluginList *pos;
+  struct EXTRACTOR_PluginList *first;
+
   pos = prev;
   first = prev;
   while ((pos != NULL) && (0 != strcmp (pos->libname, library)))
@@ -1011,57 +1101,462 @@
       else
        prev->next = pos->next;
       /* found */
+      /* FIXME: stop sub-process! */
       free (pos->libname);
-      if( pos->options )
-       free (pos->options);
-      if( pos->libraryHandle ) {
-        LTDL_MUTEX_LOCK
-       lt_dlclose (pos->libraryHandle);
-        LTDL_MUTEX_UNLOCK
-      }
+      free (pos->plugin_options);
+      if (NULL != pos->libraryHandle) 
+       lt_dlclose (pos->libraryHandle);      
       free (pos);
     }
 #if DEBUG
   else
     fprintf(stderr,
-           _("Unloading plugin `%s' failed!\n"),
+           "Unloading plugin `%s' failed!\n",
            library);
 #endif
   return first;
 }
 
+
 /**
- * Remove all extractors.
- * @param libraries the list of extractors
+ * Remove all plugins from the given list (destroys the list).
+ *
+ * @param plugins the list of plugins
  */
-void
-EXTRACTOR_removeAll (EXTRACTOR_ExtractorList * libraries)
+void 
+EXTRACTOR_plugin_remove_all(struct EXTRACTOR_PluginList *plugins)
 {
-  while (libraries != NULL)
-    libraries = EXTRACTOR_removeLibrary (libraries, libraries->libname);
+  while (plugins != NULL)
+    plugins = EXTRACTOR_plugin_remove (plugins, plugins->libname);
 }
 
 
+/**
+ * Write all of the given bytes to a file descriptor.  Short
+ * writes are continued and writes interrupted by a signal
+ * are retried.
+ *
+ * @param fd descriptor to write to
+ * @param buf data to write
+ * @param size number of bytes in buf
+ * @return 0 if all bytes were written, -1 on error
+ */
+static int
+write_all (int fd,
+          const void *buf,
+          size_t size)
+{
+  const char *data = buf;
+  size_t off = 0;
+  ssize_t ret;
+  
+  while (off < size)
+    {
+      ret = write (fd, &data[off], size - off);
+      if ( (-1 == ret) &&
+          (EINTR == errno) )
+       continue; /* interrupted before anything was written: retry */
+      if (ret <= 0)
+       return -1;
+      off += ret;
+    }
+  return 0;
+}
 
+
+/**
+ * Read exactly the given number of bytes from a file descriptor.
+ * Short reads are continued and reads interrupted by a signal
+ * are retried.
+ *
+ * @param fd descriptor to read from
+ * @param buf where to store the data
+ * @param size number of bytes to read into buf
+ * @return 0 if all bytes were read, -1 on error or premature
+ *         end of file
+ */
+static int
+read_all (int fd,
+         void *buf,
+         size_t size)
+{
+  char *data = buf;
+  size_t off = 0;
+  ssize_t ret;
+  
+  while (off < size)
+    {
+      ret = read (fd, &data[off], size - off);
+      if ( (-1 == ret) &&
+          (EINTR == errno) )
+       continue; /* interrupted before anything was read: retry */
+      if (ret <= 0)
+       return -1;
+      off += ret;
+    }
+  return 0;
+}
+
+
 /**
- * How many bytes do we actually try to scan? (from the beginning
- * of the file).  Limit to 1 GB.
+ * Header used for our IPC replies.  A header
+ * with all fields being zero is used to indicate
+ * the end of the stream.
  */
-#define MAX_READ 1024 * 1024 * 1024
+struct IpcHeader
+{
+  enum EXTRACTOR_MetaType type; /* libextractor type of the meta data item */
+  enum EXTRACTOR_MetaFormat format; /* format of the meta data item */
+  size_t data_len; /* number of bytes of meta data sent after the mime type */
+  size_t mime_len; /* number of mime type bytes sent right after this header */
+};
 
+
 /**
- * How many bytes do we actually try to decompress? (from the beginning
- * of the file).  Limit to 16 MB.
+ * Function called by a plugin in a child process.  Transmits
+ * the meta data back to the parent process.
+ *
+ * @param cls closure, "int*" of the FD for transmission
+ * @param plugin_name name of the plugin that produced this value;
+ *        special values can be used (e.g. '<zlib>' for zlib being
+ *        used in the main libextractor library and yielding
+ *        meta data).
+ * @param type libextractor-type describing the meta data
+ * @param format basic format information about data 
+ * @param data_mime_type mime-type of data (not of the original file);
+ *        can be NULL (if mime-type is not known)
+ * @param data actual meta-data found
+ * @param data_len number of bytes in data
+ * @return 0 to continue extracting, 1 to abort (transmission error)
+ */ 
+static int
+transmit_reply (void *cls,
+               const char *plugin_name,
+               enum EXTRACTOR_MetaType type,
+               enum EXTRACTOR_MetaFormat format,
+               const char *data_mime_type,
+               const char *data,
+               size_t data_len)
+{
+  const int *fd = cls;
+  struct IpcHeader reply;
+
+  /* number of mime type bytes to send: the string with its
+     0-terminator, cut off at MAX_MIME_LEN; nothing for NULL */
+  reply.mime_len = (NULL == data_mime_type)
+    ? 0
+    : strlen (data_mime_type) + 1;
+  if (reply.mime_len > MAX_MIME_LEN)
+    reply.mime_len = MAX_MIME_LEN;
+  reply.type = type;
+  reply.format = format;
+  reply.data_len = data_len;
+  /* an all-zero header marks the end of the stream for the
+     receiver, so an item that would look like one is dropped */
+  if ( (0 == reply.type) &&
+       (0 == reply.format) &&
+       (0 == reply.data_len) &&
+       (0 == reply.mime_len) )
+    return 0;
+  /* wire format: header, then mime type, then the data itself */
+  if (0 != write_all (*fd, &reply, sizeof (reply)))
+    return 1;
+  if (0 != write_all (*fd, data_mime_type, reply.mime_len))
+    return 1;
+  if (0 != write_all (*fd, data, data_len))
+    return 1;
+  return 0;
+}
+
+
+
+
+/**
+ * 'main' function of the child process.
+ * Reads shm-filenames from 'in' (line-by-line) and
+ * writes meta data blocks to 'out'.  The meta data
+ * stream is terminated by an empty entry.
+ *
+ * @param plugin extractor plugin to use
+ * @param in stream to read from
+ * @param out stream to write to
  */
-#define MAX_DECOMPRESS 16 * 1024 * 1024
+static void
+process_requests (struct EXTRACTOR_PluginList *plugin,
+                 int in,
+                 int out)
+{
+  char fn[256];
+  FILE *fin;
+  void *ptr;
+  int shmid;
+  int done;
+  size_t len;
+  struct stat sbuf;
+  struct IpcHeader hdr;
+  
+  /* all-zero header, written after each request as end-of-stream marker */
+  memset (&hdr, 0, sizeof (hdr));
+  fin = fdopen (in, "r");
+  if (NULL == fin)
+    {
+      close (in);
+      close (out);
+      return;
+    }
+  done = 0;
+  while ( (! done) &&
+         (NULL != fgets (fn, sizeof(fn), fin)) )
+    {
+      /* fgets keeps the line terminator, which is not part of the name */
+      len = strlen (fn);
+      if ( (len > 0) &&
+          ('\n' == fn[len - 1]) )
+       fn[len - 1] = '\0';
+      ptr = NULL;
+      shmid = shm_open (fn, O_RDONLY, 0);
+      if ( (-1 != shmid) &&
+          (0 == fstat (shmid, &sbuf)) )
+       {
+         /* shm_open returns a file descriptor, which has to be mapped
+            with mmap; shmat only works on System V segment ids */
+         ptr = MMAP (NULL, sbuf.st_size, PROT_READ, MAP_SHARED, shmid, 0);
+         if (ptr == (void *) -1)
+           ptr = NULL;
+       }
+      if ( (NULL != ptr) &&
+          (0 != plugin->extractMethod (ptr,
+                                       sbuf.st_size,
+                                       &transmit_reply,
+                                       &out,
+                                       plugin->plugin_options)) )
+       done = 1;
+      /* terminate the reply even if the data could not be mapped;
+        the reader of 'out' waits for this marker */
+      if ( (! done) &&
+          (0 != write_all (out, &hdr, sizeof(hdr))) )
+       done = 1;
+      if (NULL != ptr)
+       MUNMAP (ptr, sbuf.st_size);
+      if (-1 != shmid)
+       close (shmid);
+    }
+  fclose (fin);
+  close (out);
+}
 
 
-static EXTRACTOR_KeywordList *
-getKeywords (EXTRACTOR_ExtractorList * extractor,
-            const char * filename,
-            const unsigned char * data,
-            size_t size) {
-  EXTRACTOR_KeywordList *result;
+/**
+ * Start the process for the given plugin.  On success, the
+ * process ID of the child and the pipes used to talk to it
+ * are stored in the plugin; on failure 'cpid' is set to -1.
+ *
+ * @param plugin plugin for which a child process should be started
+ */ 
+static void
+start_process (struct EXTRACTOR_PluginList *plugin)
+{
+  int p1[2]; /* requests: written by the parent, read by the child */
+  int p2[2]; /* replies: written by the child, read by the parent */
+  pid_t pid;
+  int status;
+  
+  if (0 != pipe (p1))
+    {
+      plugin->cpid = -1;
+      return;
+    }
+  if (0 != pipe (p2))
+    {
+      close (p1[0]);
+      close (p1[1]);
+      plugin->cpid = -1;
+      return;
+    }
+  pid = fork ();
+  if (pid == -1)
+    {
+      close (p1[0]);
+      close (p1[1]);
+      close (p2[0]);
+      close (p2[1]);
+      plugin->cpid = -1;
+      return;
+    }
+  if (pid == 0)
+    {
+      /* child: keep only the ends we use, serve requests, then exit */
+      close (p1[1]);
+      close (p2[0]);
+      process_requests (plugin, p1[0], p2[1]);
+      _exit (0);
+    }
+  close (p1[0]);
+  close (p2[1]);
+  plugin->cpipe_in = fdopen (p1[1], "w");
+  if (NULL == plugin->cpipe_in)
+    {
+      /* cannot talk to the child, so get rid of it again */
+      close (p1[1]);
+      close (p2[0]);
+      kill (pid, SIGKILL);
+      waitpid (pid, &status, 0);
+      plugin->cpid = -1;
+      return;
+    }
+  plugin->cpipe_out = p2[0];
+  /* remember the real pid; it is what 'stop_process' signals */
+  plugin->cpid = pid;
+}
+
+
+/**
+ * Stop the child process of this plugin and close the pipes
+ * used to communicate with it.  Does nothing if 'cpid' is -1.
+ *
+ * @param plugin plugin whose child process should be stopped
+ */
+static void
+stop_process (struct EXTRACTOR_PluginList *plugin)
+{
+  int status;
+
+  if (plugin->cpid == -1)
+    return;
+  /* never signal pid 0: that would address our own process group */
+  if (plugin->cpid > 0)
+    {
+      kill (plugin->cpid, SIGKILL);
+      waitpid (plugin->cpid, &status, 0);
+    }
+  plugin->cpid = -1;
+  close (plugin->cpipe_out);
+  plugin->cpipe_out = -1;
+  if (NULL != plugin->cpipe_in)
+    {
+      fclose (plugin->cpipe_in);
+      plugin->cpipe_in = NULL;
+    }
+}
+
+
+/**
+ * Extract meta data using the given plugin, running the
+ * actual code of the plugin out-of-process.  Sends the name
+ * of the shared memory object to the child and then reads
+ * meta data items from it until the end-of-stream marker
+ * (an all-zero header) arrives.  The child is stopped if
+ * communication with it fails.
+ *
+ * @param plugin which plugin to call
+ * @param shmfn file name of the shared memory segment
+ * @param proc function to call on the meta data
+ * @param proc_cls cls for proc
+ * @return 0 if proc did not return non-zero
+ */
+static int
+extract_oop (struct EXTRACTOR_PluginList *plugin,
+            const char *shmfn,
+            EXTRACTOR_MetaDataProcessor proc,
+            void *proc_cls)
+{
+  struct IpcHeader hdr;
+  char mimetype[MAX_MIME_LEN + 1];
+  char *data;
+  int aborted;
+
+  if ( (-1 == plugin->cpid) ||
+       (NULL == plugin->cpipe_in) )
+    return 0; /* no child process to talk to */
+  /* the stream is buffered, so the request only reaches the
+     child once it has been flushed */
+  if ( (0 > fprintf (plugin->cpipe_in, "%s\n", shmfn)) ||
+       (0 != fflush (plugin->cpipe_in)) )
+    {
+      stop_process (plugin);
+      return 0;
+    }
+  aborted = 0;
+  while (1)
+    {
+      if (0 != read_all (plugin->cpipe_out,
+                        &hdr,
+                        sizeof(hdr)))
+       {
+         /* child is gone or the stream is broken */
+         stop_process (plugin);
+         return 0;
+       }
+      if  ( (hdr.type == 0) &&
+           (hdr.format == 0) &&
+           (hdr.data_len == 0) &&
+           (hdr.mime_len == 0) )
+       break; /* end-of-stream marker */
+      if (hdr.mime_len > MAX_MIME_LEN)
+       {
+         stop_process (plugin);
+         return 0;
+       }
+      /* malloc(0) may return NULL without being out of memory,
+        so always ask for at least one byte */
+      data = malloc ((0 == hdr.data_len) ? 1 : hdr.data_len);
+      if (data == NULL)
+       {
+         stop_process (plugin);
+         return 1;
+       }
+      if ( (0 != (read_all (plugin->cpipe_out,
+                           mimetype,
+                           hdr.mime_len))) ||
+          (0 != (read_all (plugin->cpipe_out,
+                           data,
+                           hdr.data_len))) )
+       {
+         stop_process (plugin);
+         free (data);
+         return 0;
+       }          
+      mimetype[hdr.mime_len] = '\0';
+      /* once proc asked us to stop, keep draining the replies so
+        the stream stays in sync, but do not call proc again;
+        a mime type of length zero means the sender had none */
+      if ( (! aborted) &&
+          (proc != NULL) &&
+          (0 != proc (proc_cls, 
+                      plugin->libname,
+                      hdr.type,
+                      hdr.format,
+                      (0 == hdr.mime_len) ? NULL : mimetype,
+                      data,
+                      hdr.data_len)) )
+       aborted = 1;
+      free (data);
+    }
+  return aborted;
+}           
+
+
+/**
+ * Extract keywords from a file using the given set of plugins.
+ * Plugins without special options are run in this process.  For
+ * plugins that are to run out-of-process, the data is copied into
+ * a shared memory object and the name of that object is passed to
+ * the child process of the plugin.  If the shared memory object
+ * cannot be set up, all plugins are run in this process.
+ *
+ * @param plugins the list of plugins to use
+ * @param filename the name of the file, can be NULL 
+ * @param data data to process, never NULL
+ * @param size number of bytes in data
+ * @param proc function to call for each meta data item found
+ * @param proc_cls cls argument to proc
+ */
+static void
+extract (struct EXTRACTOR_PluginList *plugins,
+        const char * filename,
+        const char * data,
+        size_t size,
+        EXTRACTOR_MetaDataProcessor proc,
+        void *proc_cls) 
+{
+  struct EXTRACTOR_PluginList *ppos;
+  int shmid;
+  enum EXTRACTOR_Options flags;
+  void *ptr;
+  char fn[255];
+  int want_shm;
+  int done;
+
+  want_shm = 0;
+  shmid = -1;
+  ptr = NULL;
+  /* first pass: make sure the child processes we need are running */
+  ppos = plugins;
+  while (NULL != ppos)
+    {      
+      switch (ppos->flags)
+       {
+       case EXTRACTOR_OPTION_NONE:
+         break;
+       case EXTRACTOR_OPTION_OUT_OF_PROCESS:
+         if (0 == ppos->cpid)
+           start_process (ppos);
+         want_shm = 1;
+         break;
+       case EXTRACTOR_OPTION_AUTO_RESTART:
+         if ( (0 == ppos->cpid) ||
+              (-1 == ppos->cpid) )
+           start_process (ppos);
+         want_shm = 1;
+         break;
+       }      
+      ppos = ppos->next;
+    }
+
+  if (want_shm)
+    {
+      /* only a leading slash is portable in names for shm_open */
+      snprintf (fn,
+               sizeof (fn),
+               "/libextractor-shm-%u-XXXXXX",
+               (unsigned int) getpid());
+      mktemp (fn);
+      shmid = shm_open (fn, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
+      if (shmid != -1)
+       {
+         /* shm_open returns a file descriptor, which has to be
+            mapped with mmap */
+         if (0 == ftruncate (shmid, size))
+           {
+             ptr = MMAP (NULL, size, PROT_READ | PROT_WRITE,
+                         MAP_SHARED, shmid, 0);
+             if (ptr == (void *) -1)
+               ptr = NULL;
+           }
+         if (NULL == ptr)
+           {
+             close (shmid);    
+             shm_unlink (fn);
+             shmid = -1;
+           }
+         else
+           {
+             memcpy (ptr, data, size);
+           }
+       }
+    }
+  /* second pass: run the plugins; stop once one of them asks for it */
+  done = 0;
+  ppos = plugins;
+  while ( (! done) &&
+         (NULL != ppos) )
+    {
+      flags = ppos->flags;
+      if (shmid == -1)
+       flags = EXTRACTOR_OPTION_NONE;
+      switch (flags)
+       {
+       case EXTRACTOR_OPTION_NONE:
+         if (0 != ppos->extractMethod (data, 
+                                       size, 
+                                       proc, 
+                                       proc_cls,
+                                       ppos->plugin_options))
+           done = 1;
+         break;
+       case EXTRACTOR_OPTION_OUT_OF_PROCESS:
+       case EXTRACTOR_OPTION_AUTO_RESTART:
+         /* skip plugins whose child process is not available */
+         if ( (-1 != ppos->cpid) &&
+              (0 != extract_oop (ppos, fn, proc, proc_cls)) )
+           done = 1;
+         break;
+       }      
+      ppos = ppos->next;
+    }
+  /* release the shared memory on every path, including early abort */
+  if (NULL != ptr)
+    MUNMAP (ptr, size);
+  if (shmid != -1)
+    {
+      close (shmid);
+      shm_unlink (fn);
+    }
+}
+
+
+/**
+ * If the given data is compressed using gzip or bzip2, decompress
+ * it.  Run 'extract' on the decompressed contents (or the original
+ * contents if they were not compressed).
+ *
+ * @param plugins the list of plugins to use
+ * @param filename the name of the file, can be NULL 
+ * @param data data to process, never NULL
+ * @param size number of bytes in data
+ * @param proc function to call for each meta data item found
+ * @param proc_cls cls argument to proc
+ */
+static void
+decompress_and_extract (struct EXTRACTOR_PluginList *plugins,
+                       const char * filename,
+                       const unsigned char * data,
+                       size_t size,
+                       EXTRACTOR_MetaDataProcessor proc,
+                       void *proc_cls) {
   unsigned char * buf;
   size_t dsize;
 #if HAVE_ZLIB
@@ -1075,7 +1570,6 @@
   size_t bpos;
 #endif
 
-  result = NULL;
   buf = NULL;
   dsize = 0;
 #if HAVE_ZLIB
@@ -1083,689 +1577,418 @@
   if ( (size >= 12) &&
        (data[0] == 0x1f) &&
        (data[1] == 0x8b) &&
-       (data[2] == 0x08) ) {
-
-    /*
-     * Skip gzip header - we might want to retrieve parts of it as keywords
-     */
-    unsigned gzip_header_length = 10;
-
-    if (data[3] & 0x4) /* FEXTRA  set */
-      gzip_header_length += 2 + (unsigned) (data[10] & 0xff)
-                              + (((unsigned) (data[11] & 0xff)) * 256);
-
-    if(data[3] & 0x8) /* FNAME set */
+       (data[2] == 0x08) ) 
     {
-      const unsigned char * cptr = data + gzip_header_length;
-
+      /* Process gzip header */
+      unsigned int gzip_header_length = 10;
+      
+      if (data[3] & 0x4) /* FEXTRA  set */
+       gzip_header_length += 2 + (unsigned) (data[10] & 0xff)
+         + (((unsigned) (data[11] & 0xff)) * 256);
+      
+      if (data[3] & 0x8) /* FNAME set */
+       {
+         const unsigned char * cptr = data + gzip_header_length;
+         /* stored file name is here */
+         while (cptr < data + size)
+           {
+             if ('\0' == *cptr)
+               break;        
+             cptr++;
+           }
+         if (0 != proc (proc_cls,
+                        "<zlib>",
+                        EXTRACTOR_METATYPE_FILENAME,
+                        EXTRACTOR_METAFORMAT_C_STRING,
+                        "text/plain",
+                        (const char*) (data + gzip_header_length),
+                        cptr - (data + gzip_header_length)))
+           return; /* done */    
+         gzip_header_length = (cptr - data) + 1;
+       }
+      if (data[3] & 0x16) /* FCOMMENT set */
+       {
+         const unsigned char * cptr = data + gzip_header_length;
+         /* stored comment is here */    
+         while (cptr < data + size)
+           {
+             if('\0' == *cptr)
+               break;
+             cptr ++;
+           }   
+         if (0 != proc (proc_cls,
+                        "<zlib>",
+                        EXTRACTOR_METATYPE_COMMENT,
+                        EXTRACTOR_METAFORMAT_C_STRING,
+                        "text/plain",
+                        (const char*) (data + gzip_header_length),
+                        cptr - (data + gzip_header_length)))
+           return; /* done */
+         gzip_header_length = (cptr - data) + 1;
+       }
+      if(data[3] & 0x2) /* FCHRC set */
+       gzip_header_length += 2;
+      memset(&strm,
+            0,
+            sizeof(z_stream));
+#ifdef ZLIB_VERNUM
+      gzip_header_length = 0;
+#endif
+      if (size > gzip_header_length) 
+       {
+         strm.next_in = (Bytef*) data + gzip_header_length;
+         strm.avail_in = size - gzip_header_length;
+       }
+      else
+       {
+         strm.next_in = (Bytef*) data;
+         strm.avail_in = 0;
+       }
+      strm.total_in = 0;
+      strm.zalloc = NULL;
+      strm.zfree = NULL;
+      strm.opaque = NULL;
+      
       /*
-       * stored file name is here
-       * extremely long file names might break the following code.
+       * note: maybe plain inflateInit(&strm) is adequate,
+       * it looks more backward-compatible also ;
+       *
+       * ZLIB_VERNUM isn't defined by zlib version 1.1.4 ;
+       * there might be a better check.
        */
-
-      while(cptr < data + size)
-      {
-        if('\0' == *cptr)
-          break;
-
-        cptr++;
-      }
-      gzip_header_length = (cptr - data) + 1;
-    }
-
-    if(data[3] & 0x16) /* FCOMMENT set */
-    {
-      const unsigned char * cptr = data + gzip_header_length;
-
-      /*
-       * stored comment is here
-       */
-
-      while(cptr < data + size)
-      {
-        if('\0' == *cptr)
-          break;
-
-        cptr ++;
-      }
-
-      gzip_header_length = (cptr - data) + 1;
-    }
-
-    if(data[3] & 0x2) /* FCHRC set */
-      gzip_header_length += 2;
-
-    memset(&strm,
-          0,
-          sizeof(z_stream));
+      if (Z_OK == inflateInit2(&strm,
 #ifdef ZLIB_VERNUM
-    gzip_header_length = 0;
-#endif
-    if (size > gzip_header_length) {
-      strm.next_in = (Bytef*) data + gzip_header_length;
-      strm.avail_in = size - gzip_header_length;
-    } else {
-      strm.next_in = (Bytef*) data;
-      strm.avail_in = 0;
-    }
-    strm.total_in = 0;
-    strm.zalloc = NULL;
-    strm.zfree = NULL;
-    strm.opaque = NULL;
-
-    /*
-     * note: maybe plain inflateInit(&strm) is adequate,
-     * it looks more backward-compatible also ;
-     *
-     * ZLIB_VERNUM isn't defined by zlib version 1.1.4 ;
-     * there might be a better check.
-     */
-#ifdef ZLIB_VERNUM
-    if (Z_OK == inflateInit2(&strm,
-                            15 + 32)) {
+                              15 + 32
 #else
-    if (Z_OK == inflateInit2(&strm,
-                            -MAX_WBITS)) {
+                              -MAX_WBITS
 #endif
-      dsize = 2 * size;
-      if (dsize > MAX_DECOMPRESS)
-       dsize = MAX_DECOMPRESS;
-      buf = malloc(dsize);
-      pos = 0;
-      if (buf == NULL) {
-       inflateEnd(&strm);
-      } else {
-       strm.next_out = (Bytef*) buf;
-       strm.avail_out = dsize;
-       do {
-         ret = inflate(&strm,
-                       Z_SYNC_FLUSH);
-         if (ret == Z_OK) {
-           if (dsize == MAX_DECOMPRESS)
-             break;
-           pos += strm.total_out;
-           strm.total_out = 0;
-           dsize *= 2;
-           if (dsize > MAX_DECOMPRESS)
-             dsize = MAX_DECOMPRESS;
-           buf = realloc(buf, dsize);
-           strm.next_out = (Bytef*) &buf[pos];
-           strm.avail_out = dsize - pos;
-         } else if (ret != Z_STREAM_END) {
-           /* error */
-           free(buf);
-           buf = NULL;
+                              )) {
+       dsize = 2 * size;
+       if (dsize > MAX_DECOMPRESS)
+         dsize = MAX_DECOMPRESS;
+       buf = malloc(dsize);
+       pos = 0;
+       if (buf == NULL) 
+         {
+           inflateEnd(&strm);
+         } 
+       else 
+         {
+           strm.next_out = (Bytef*) buf;
+           strm.avail_out = dsize;
+           do
+             {
+               ret = inflate(&strm,
+                             Z_SYNC_FLUSH);
+               if (ret == Z_OK) 
+                 {
+                   if (dsize == MAX_DECOMPRESS)
+                     break;
+                   pos += strm.total_out;
+                   strm.total_out = 0;
+                   dsize *= 2;
+                   if (dsize > MAX_DECOMPRESS)
+                     dsize = MAX_DECOMPRESS;
+                   buf = realloc(buf, dsize);
+                   strm.next_out = (Bytef*) &buf[pos];
+                   strm.avail_out = dsize - pos;
+                 }
+               else if (ret != Z_STREAM_END) 
+                 {
+                   /* error */
+                   free(buf);
+                   buf = NULL;
+                 }
+             } while ( (buf != NULL) &&                
+                       (ret != Z_STREAM_END) );
+           dsize = pos + strm.total_out;
+           inflateEnd(&strm);
+           if (dsize == 0) {
+             free(buf);
+             buf = NULL;
+           }
          }
-       } while ( (buf != NULL) &&              
-                 (ret != Z_STREAM_END) );
-       dsize = pos + strm.total_out;
-       inflateEnd(&strm);
-       if (dsize == 0) {
-         free(buf);
-         buf = NULL;
-       }
       }
     }
-  }
 #endif
-
+  
 #if HAVE_LIBBZ2
   if ( (size >= 4) &&
        (data[0] == 'B') &&
        (data[1] == 'Z') &&
-       (data[2] == 'h') ) {
-    /* now try bz2 decompression */
-    memset(&bstrm,
-          0,
-          sizeof(bz_stream));
-    bstrm.next_in = (char*) data;
-    bstrm.avail_in = size;
-    bstrm.total_in_lo32 = 0;
-    bstrm.total_in_hi32 = 0;
-    bstrm.bzalloc = NULL;
-    bstrm.bzfree = NULL;
-    bstrm.opaque = NULL;
-    if ( (buf == NULL) &&
-        (BZ_OK == BZ2_bzDecompressInit(&bstrm,
-                                       0,
-                                       0)) ) {
-      dsize = 2 * size;
-      if (dsize > MAX_DECOMPRESS)
-       dsize = MAX_DECOMPRESS;
-      buf = malloc(dsize);
-      bpos = 0;
-      if (buf == NULL) {
-       BZ2_bzDecompressEnd(&bstrm);
-      } else {
-       bstrm.next_out = (char*) buf;
-       bstrm.avail_out = dsize;
-       do {
-         bret = BZ2_bzDecompress(&bstrm);
-         if (bret == Z_OK) {
-           if (dsize == MAX_DECOMPRESS)
-             break;
-           bpos += bstrm.total_out_lo32;
-           bstrm.total_out_lo32 = 0;
-           dsize *= 2;
-           if (dsize > MAX_DECOMPRESS)
-             dsize = MAX_DECOMPRESS;
-           buf = realloc(buf, dsize);
-           bstrm.next_out = (char*) &buf[bpos];
-           bstrm.avail_out = dsize - bpos;
-         } else if (bret != BZ_STREAM_END) {
-           /* error */
-           free(buf);
-           buf = NULL;
-         }
-       } while ( (buf != NULL) &&
-                 (bret != BZ_STREAM_END) );
-       dsize = bpos + bstrm.total_out_lo32;
-       BZ2_bzDecompressEnd(&bstrm);
-       if (dsize == 0) {
-         free(buf);
-         buf = NULL;
+       (data[2] == 'h') ) 
+    {
+      /* now try bz2 decompression */
+      memset(&bstrm,
+            0,
+            sizeof(bz_stream));
+      bstrm.next_in = (char*) data;
+      bstrm.avail_in = size;
+      bstrm.total_in_lo32 = 0;
+      bstrm.total_in_hi32 = 0;
+      bstrm.bzalloc = NULL;
+      bstrm.bzfree = NULL;
+      bstrm.opaque = NULL;
+      if ( (buf == NULL) &&
+          (BZ_OK == BZ2_bzDecompressInit(&bstrm,
+                                         0,
+                                         0)) ) 
+       {
+         dsize = 2 * size;
+         if (dsize > MAX_DECOMPRESS)
+           dsize = MAX_DECOMPRESS;
+         buf = malloc(dsize);
+         bpos = 0;
+         if (buf == NULL) 
+           {
+             BZ2_bzDecompressEnd(&bstrm);
+           }
+         else 
+           {
+             bstrm.next_out = (char*) buf;
+             bstrm.avail_out = dsize;
+             do {
+               bret = BZ2_bzDecompress(&bstrm);
+               if (bret == Z_OK) 
+                 {
+                   if (dsize == MAX_DECOMPRESS)
+                     break;
+                   bpos += bstrm.total_out_lo32;
+                   bstrm.total_out_lo32 = 0;
+                   dsize *= 2;
+                   if (dsize > MAX_DECOMPRESS)
+                     dsize = MAX_DECOMPRESS;
+                   buf = realloc(buf, dsize);
+                   bstrm.next_out = (char*) &buf[bpos];
+                   bstrm.avail_out = dsize - bpos;
+                 } 
+               else if (bret != BZ_STREAM_END) 
+                 {
+                   /* error */
+                   free(buf);
+                   buf = NULL;
+                 }
+             } while ( (buf != NULL) &&
+                       (bret != BZ_STREAM_END) );
+             dsize = bpos + bstrm.total_out_lo32;
+             BZ2_bzDecompressEnd(&bstrm);
+             if (dsize == 0) 
+               {
+                 free(buf);
+                 buf = NULL;
+               }
+           }
        }
-      }
     }
-  }
-#endif
-
-
-  /* finally, call plugins */
-  if (buf != NULL) {
-    data = buf;
-    size = dsize;
-  }
-  while (extractor != NULL) {
-    result = extractor->extractMethod(filename,
-                                     (char*) data,
-                                     size,
-                                     result,
-                                     extractor->options);
-    extractor = extractor->next;
-  }
+#endif  
+  if (buf != NULL) 
+    {
+      data = buf;
+      size = dsize;
+    }
+  extract (plugins,
+          filename,
+          (const char*) data,
+          size,
+          proc,
+          proc_cls);
   if (buf != NULL)
     free(buf);
   errno = 0; /* kill transient errors */
-  return result;
 }
 
+
 /**
- * Extract keywords from a file using the available extractors.
- * @param extractor the list of extractor libraries
- * @param filename the name of the file
- * @return the list of keywords found in the file, NULL if none
- *         were found (or other errors)
+ * Open a file
  */
-EXTRACTOR_KeywordList *
-EXTRACTOR_getKeywords (EXTRACTOR_ExtractorList * extractor,
-                      const char * filename) {
-  EXTRACTOR_KeywordList *result;
-  int file;
-  void * buffer;
-  struct stat fstatbuf;
-  size_t size;
-  int eno, dir;
+static int file_open(const char *filename, int oflag, ...)
+{
+  int mode;
+  const char *fn;
+#ifdef MINGW
+  char szFile[_MAX_PATH + 1];
+  long lRet;
 
-  if (-1 == STAT(filename, &fstatbuf))
-    return NULL;
-
-  if (!S_ISDIR(fstatbuf.st_mode)) {
-    dir = 0;
-      
-#ifdef O_LARGEFILE
-    file = fileopen(filename, O_RDONLY | O_LARGEFILE);
+  if ((lRet = plibc_conv_to_win_path(filename, szFile)) != ERROR_SUCCESS)
+  {
+    errno = ENOENT;
+    SetLastError(lRet);
+    return -1;
+  }
+  fn = szFile;
 #else
-    file = fileopen(filename, O_RDONLY);
+  fn = filename;
 #endif
-    if (-1 == file)
-      return NULL;
-  
-    size = (fstatbuf.st_size > 0xFFFFFFFF) ? 0xFFFFFFFF : fstatbuf.st_size;
-    if (size == 0) {
-      close(file);
-      return NULL;
-    }
-  
-    if (size > MAX_READ)
-      size = MAX_READ; /* do not mmap/read more than 1 GB! */
-    buffer = MMAP(NULL, size, PROT_READ, MAP_PRIVATE, file, 0);
-    if ( (buffer == NULL) || (buffer == (void *) -1) ) {
-      eno = errno;
-      close(file);
-      errno = eno;
-      return NULL;
-    }
-  }
-  else {
-    dir = 1;
-    
-    size = 0;
-    buffer = malloc(1);
-  }
-  
-  result = getKeywords(extractor,
-                      filename,
-                      buffer,
-                      size);
-  
-  if (dir)
-    free(buffer);
-  else {
-    MUNMAP (buffer, size);
-    close(file);
-  }
-  return result;
+  mode = 0;
+#ifdef MINGW
+  /* Set binary mode */
+  mode |= O_BINARY;
+#endif
+  return open(fn, oflag, mode);
 }
 
 
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
 
+
 /**
- * Extract keywords from a buffer in memory
- * using the available extractors.
+ * Extract keywords from a file using the given set of plugins.
+ * If needed, opens the file and loads its data (via mmap).  Then
+ * decompresses it if the data is compressed.  Finally runs the
+ * plugins on the (now possibly decompressed) data.
  *
- * @param extractor the list of extractor libraries
- * @param data the data of the file
- * @param size the number of bytes in data
- * @return the list of keywords found in the file, NULL if none
- *         were found (or other errors)
+ * @param plugins the list of plugins to use
+ * @param filename the name of the file, can be NULL if data is not NULL
+ * @param data data of the file in memory, can be NULL (in which
+ *        case libextractor will open file) if filename is not NULL
+ * @param size number of bytes in data, ignored if data is NULL
+ * @param proc function to call for each meta data item found
+ * @param proc_cls cls argument to proc
  */
-EXTRACTOR_KeywordList *
-EXTRACTOR_getKeywords2(EXTRACTOR_ExtractorList * extractor,
-                      const void * data,
-                      size_t size) {
-  if (data == NULL)
-    return NULL;
-  return getKeywords(extractor,
-                    NULL,
-                    data,
-                    size);
-}
+void
+EXTRACTOR_extract (struct EXTRACTOR_PluginList *plugins,
+                  const char *filename,
+                  const void *data,
+                  size_t size,
+                  EXTRACTOR_MetaDataProcessor proc,
+                  void *proc_cls)
+{
+  int fd;
+  void * buffer;
+  struct stat fstatbuf;
+  size_t fsize;
+  int eno;
 
-static void
-removeKeyword (const char *keyword,
-              const EXTRACTOR_KeywordType type,
-              const unsigned int options,
-              EXTRACTOR_KeywordList ** list,
-              EXTRACTOR_KeywordList * current) {
-  EXTRACTOR_KeywordList *first;
-  EXTRACTOR_KeywordList *pos;
-  EXTRACTOR_KeywordList *prev;
-  EXTRACTOR_KeywordList *next;
-
-  first = *list;
-  pos = first;
-  prev = NULL;
-  while (pos != NULL) {
-    if (pos == current) {
-      prev = pos;
-      pos = current->next;
-    }
-    if (pos == NULL)
-      break;
-    if ( (0 == strcmp (pos->keyword, keyword)) &&
-        ( (pos->keywordType == type) ||
-          ( ((options & EXTRACTOR_DUPLICATES_TYPELESS) > 0) &&
-            ( (pos->keywordType == EXTRACTOR_SPLIT) ||
-              (type != EXTRACTOR_SPLIT)) ) ||
-          ( ((options & EXTRACTOR_DUPLICATES_REMOVE_UNKNOWN) > 0) &&
-            (pos->keywordType == EXTRACTOR_UNKNOWN)) ) ) {
-      /* remove! */
-      if (prev == NULL)
-       first = pos->next;
-      else
-       prev->next = pos->next;
-      next = pos->next;
-      free (pos->keyword);
-      free (pos);
-      pos = next;
-    } else {
-      prev = pos;
-      pos = pos->next;
-    }
-  } /* end while */
-  *list = first;
-}
-
-/**
- * Remove duplicate keywords from the list.
- * @param list the original keyword list (destroyed in the process!)
- * @param options a set of options (DUPLICATES_XXXX)
- * @return a list of keywords without duplicates
- */
-EXTRACTOR_KeywordList *
-EXTRACTOR_removeDuplicateKeywords (EXTRACTOR_KeywordList * list,
-                                  const unsigned int options) {
-  EXTRACTOR_KeywordList *pos;
-
-  pos = list;
-  while (pos != NULL) {
-    removeKeyword(pos->keyword,
-                 pos->keywordType,
-                 options,
-                 &list,
-                 pos);
-    pos = pos->next;
-  }
-  return list;
-}
-
-/**
- * Remove empty (all-whitespace) keywords from the list.
- * @param list the original keyword list (destroyed in the process!)
- * @return a list of keywords without duplicates
- */
-EXTRACTOR_KeywordList *
-EXTRACTOR_removeEmptyKeywords (EXTRACTOR_KeywordList * list) {
-  EXTRACTOR_KeywordList * pos;
-  EXTRACTOR_KeywordList * last;
-
-  last = NULL;
-  pos = list;
-  while (pos != NULL)
-    {
-      int allWhite;
-      int i;
-      allWhite = 1;
-      for (i=strlen(pos->keyword)-1;i>=0;i--)
-       if (! isspace(pos->keyword[i]))
-           {
-               allWhite = 0;
-               break;
-            }
-      if (allWhite)
+  fd = -1;
+  buffer = NULL;
+  if ( (data == NULL) &&
+       (filename != NULL) &&
+       (0 == STAT(filename, &fstatbuf)) &&
+       (!S_ISDIR(fstatbuf.st_mode)) &&
+       (-1 != (fd = file_open (filename,
+                              O_RDONLY | O_LARGEFILE))) )
+    {      
+      fsize = (fstatbuf.st_size > 0xFFFFFFFF) ? 0xFFFFFFFF : fstatbuf.st_size;
+      if (fsize == 0) 
        {
-         EXTRACTOR_KeywordList * next;
-         next = pos->next;
-         if (last == NULL)
-           list = next;
-         else
-           last->next = next;
-         free(pos->keyword);
-         free(pos);
-         pos = next;
+         close(fd);
+         return;
        }
-      else
+      if (fsize > MAX_READ)
+       fsize = MAX_READ;
+      buffer = MMAP(NULL, fsize, PROT_READ, MAP_PRIVATE, fd, 0);
+      if ( (buffer == NULL) || (buffer == (void *) -1) ) 
        {
-         last = pos;
-         pos = pos->next;
+         eno = errno;
+         close(fd);
+         errno = eno;
+         return;
        }
     }
-  return list;
+  if ( (buffer == NULL) &&
+       (data == NULL) )
+    return;
+  decompress_and_extract (plugins,
+                         filename,
+                         buffer != NULL ? buffer : data,
+                         buffer != NULL ? fsize : size,
+                         proc,
+                         proc_cls);
+  if (buffer != NULL)
+    MUNMAP (buffer, size);
+  if (-1 != fd)
+    close(fd);  
 }
 
-/**
- * Remove keywords of a particular type from the list.
- * @param list the original keyword list (altered in the process!)
- * @param type the type to remove
- * @return a list of keywords without entries of given type
- */
-EXTRACTOR_KeywordList *
-EXTRACTOR_removeKeywordsOfType(EXTRACTOR_KeywordList * list,
-                              EXTRACTOR_KeywordType type) {
-  EXTRACTOR_KeywordList * pos;
-  EXTRACTOR_KeywordList * last;
-
-  last = NULL;
-  pos = list;
-  while (pos != NULL) {
-    if (pos->keywordType == type) {
-      EXTRACTOR_KeywordList * next;
-      next = pos->next;
-      if (last == NULL)
-       list = next;
-      else
-       last->next = next;
-      free(pos->keyword);
-      free(pos);
-      pos = next;
-    } else {
-      last = pos;
-      pos = pos->next;
-    }
-  }
-  return list;
-}
-
 #include "iconv.c"
 
 /**
- * Print a keyword list to a file.
- * For debugging.
- * @param handle the file to write to (stdout, stderr), may NOT be NULL
- * @param keywords the list of keywords to print, may be NULL
+ * Simple EXTRACTOR_MetaDataProcessor implementation that simply
+ * prints the extracted meta data to the given file.  Only prints
+ * those keywords that are in UTF-8 format.
+ * 
+ * @param handle the file to write to (stdout, stderr), must NOT be NULL,
+ *               must be of type "FILE *".
+ * @param plugin_name name of the plugin that produced this value
+ * @param type libextractor-type describing the meta data
+ * @param format basic format information about data 
+ * @param data_mime_type mime-type of data (not of the original file);
+ *        can be NULL (if mime-type is not known)
+ * @param data actual meta-data found
+ * @param data_len number of bytes in data
+ * @return non-zero if printing failed, otherwise 0.
  */
-void
-EXTRACTOR_printKeywords(FILE * handle,
-                       EXTRACTOR_KeywordList * keywords)
+int 
+EXTRACTOR_meta_data_print(void * handle,
+                         const char *plugin_name,
+                         enum EXTRACTOR_MetaType type,
+                         enum EXTRACTOR_MetaFormat format,
+                         const char *data_mime_type,
+                         const char *data,
+                         size_t data_len)
 {
   iconv_t cd;
   char * buf;
+  int ret;
 
-  cd = iconv_open(
-    nl_langinfo(CODESET)
-    , "UTF-8");
-  while (keywords != NULL)
-    {
-      if (cd == (iconv_t) -1)
-       buf = strdup(keywords->keyword);
-      else
-       buf = iconvHelper(cd,
-                         keywords->keyword);
-      if (keywords->keywordType == EXTRACTOR_THUMBNAIL_DATA) {
-       fprintf(handle,
-               _("%s - (binary)\n"),
-               _(keywordTypes[keywords->keywordType]));
-      } else {
-       if (keywords->keywordType >= HIGHEST_TYPE_NUMBER)
-         fprintf(handle,
-                 _("INVALID TYPE - %s\n"),
-                 buf);
-       else
-         fprintf(handle,
-                 "%s - %s\n",
-                 _(keywordTypes[keywords->keywordType]),
-                 buf);
-      }
-      free(buf);
-      keywords = keywords->next;
-    }
-  if (cd != (iconv_t) -1)
-    iconv_close(cd);
+  if (format != EXTRACTOR_METAFORMAT_UTF8)
+    return 0;
+  cd = iconv_open(nl_langinfo(CODESET),
+                 "UTF-8");
+  if (cd == (iconv_t) -1)
+    return 1;
+  buf = iconv_helper(cd, data);
+  ret = fprintf(handle,
+               "%s - %s\n",
+               dgettext ("libextractor",
+                         EXTRACTOR_metatype_to_string (type)),
+               buf);
+  free(buf);
+  iconv_close(cd);
+  if (ret < 0)
+    return 1;
+  return 0;
 }
 
-/**
- * Free the memory occupied by the keyword list (and the
- * keyword strings in it!)
- * @param keywords the list to free
- */
-void
-EXTRACTOR_freeKeywords (EXTRACTOR_KeywordList * keywords)
-{
-  EXTRACTOR_KeywordList *prev;
-  while (keywords != NULL)
-    {
-      prev = keywords;
-      keywords = keywords->next;
-      free (prev->keyword);
-      free (prev);
-    }
-}
 
-/**
- * Return the highest type number, exclusive as in [0,highest).
- */
-EXTRACTOR_KeywordType
-EXTRACTOR_getHighestKeywordTypeNumber ()
-{
-  return HIGHEST_TYPE_NUMBER;
-}
 
 /**
- * Extract the last keyword that of the given type from the keyword list.
- * @param type the type of the keyword
- * @param keywords the keyword list
- * @return the last matching keyword, or NULL if none matches
+ * Initialize gettext and libltdl (and W32 if needed).
  */
-const char *
-EXTRACTOR_extractLast (const EXTRACTOR_KeywordType type,
-                      EXTRACTOR_KeywordList * keywords)
-{
-  char *result = NULL;
-  while (keywords != NULL)
-    {
-      if (keywords->keywordType == type)
-       result = keywords->keyword;
-      keywords = keywords->next;
-    }
-  return result;
-}
+void __attribute__ ((constructor)) EXTRACTOR_ltdl_init() {
+  int err;
 
-/**
- * Extract the last keyword of the given string from the keyword list.
- * @param type the string describing the type of the keyword
- * @param keywords the keyword list
- * @return the last matching keyword, or NULL if none matches
- */
-const char *
-EXTRACTOR_extractLastByString (const char * type,
-                              EXTRACTOR_KeywordList * keywords)
-{
-  char * result = NULL;
-
-  if (type == NULL)
-    return NULL;
-  while (keywords != NULL) {
-    if ( (0 == strcmp(_(keywordTypes[keywords->keywordType]), type)) ||
-        (0 == strcmp(keywordTypes[keywords->keywordType], type) ) )
-      result = keywords->keyword;
-    keywords = keywords->next;
+#if ENABLE_NLS
+  BINDTEXTDOMAIN(PACKAGE, LOCALEDIR);
+  BINDTEXTDOMAIN("iso-639", ISOLOCALEDIR); /* used by wordextractor */
+#endif
+  err = lt_dlinit ();
+  if (err > 0) {
+#if DEBUG
+    fprintf(stderr,
+           _("Initialization of plugin mechanism failed: %s!\n"),
+           lt_dlerror());
+#endif
+    return;
   }
-  return result;
+#ifdef MINGW
+  plibc_init("GNU", PACKAGE);
+#endif
 }
 
-/**
- * Count the number of keywords in the keyword list.
- * @param keywords the keyword list
- * @return the number of keywords in the list
- */
-unsigned int
-EXTRACTOR_countKeywords (EXTRACTOR_KeywordList * keywords)
-{
-  int count = 0;
-  while (keywords != NULL)
-    {
-      count++;
-      keywords = keywords->next;
-    }
-  return count;
-}
 
 /**
- * Encode the given binary data object
- * as a 0-terminated C-string according
- * to the LE binary data encoding standard.
- *
- * @return NULL on error, the 0-terminated
- *  encoding otherwise
+ * Deinit.
  */
-char * EXTRACTOR_binaryEncode(const unsigned char * data,
-                             size_t size) {
-
-  char * binary;
-  size_t pos;
-  size_t end;
-  size_t wpos;
-  size_t i;
-  unsigned int markers[8]; /* 256 bits */
-  unsigned char marker;
-
- /* encode! */
-  binary = malloc(2 + size + (size+256) / 254);
-  if (binary == NULL)
-    return NULL;
-
-  pos = 0;
-  wpos = 0;
-  while (pos < size) {
-    /* find unused value between 1 and 255 in
-       the next 254 bytes */
-    end = pos + 254;
-    if (end < pos)
-      break; /* integer overflow! */
-    if (end > size)
-      end = size;
-    memset(markers,
-          0,
-          sizeof(markers));
-    for (i=pos;i<end;i++)
-      markers[data[i]&7] |= 1 << (data[i] >> 3);
-    marker = 1;
-    while (markers[marker&7] & (1 << (marker >> 3))) {
-      marker++;
-      if (marker == 0) {
-       /* assertion failed... */
-       free(binary);
-       return NULL;
-      }
-    }
-    /* recode */
-    binary[wpos++] = marker;
-    for (i=pos;i<end;i++)
-      binary[wpos++] = data[i] == 0 ? marker : data[i];
-    pos = end;
-  }
-  binary[wpos++] = 0; /* 0-termination! */
-  return binary;
+void __attribute__ ((destructor)) EXTRACTOR_ltdl_fini() {
+  /* Marked with the "destructor" attribute, so this is not meant to be
+     called explicitly by library users. */
+#ifdef MINGW
+  /* MinGW builds only: shut down plibc. */
+  plibc_shutdown();
+#endif
+  /* Shut down libltdl. */
+  lt_dlexit ();
+}
 
 
-/**
- * This function can be used to decode the binary data
- * encoded in the libextractor metadata (i.e. for
- * the  thumbnails).
- *
- * @param in 0-terminated string from the meta-data
- * @return 1 on error, 0 on success
- */
-int EXTRACTOR_binaryDecode(const char * in,
-                          unsigned char ** out,
-                          size_t * outSize) {
-  unsigned char * buf;
-  size_t pos;
-  size_t wpos;
-  unsigned char marker;
-  size_t i;
-  size_t end;
-  size_t inSize;
 
-  inSize = strlen(in);
-  if (inSize == 0) {
-    *out = NULL;
-    *outSize = 0;
-    return 0;
-  }
-
-  buf = malloc(inSize); /* slightly more than needed ;-) */
-  if (buf == NULL)
-    return 1; /* error */
-  *out = buf;
-
-  pos = 0;
-  wpos = 0;
-  while (pos < inSize) {
-    end = pos + 255; /* 255 here: count the marker! */
-    if (end > inSize)
-      end = inSize;
-    marker = in[pos++];
-    for (i=pos;i<end;i++)
-      buf[wpos++] = (in[i] == (char) marker) ? 0 : in[i];
-    pos = end;
-  }
-  *outSize = wpos;
-  return 0;
-}
-
-
-
 /* end of extractor.c */

Modified: Extractor/src/main/iconv.c
===================================================================
--- Extractor/src/main/iconv.c  2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/src/main/iconv.c  2009-12-13 23:02:19 UTC (rev 9746)
@@ -22,8 +22,9 @@
  * Convert the given input using the given converter
  * and return as a 0-terminated string.
  */
-static char * iconvHelper(iconv_t cd,
-                         const char * in) {
+static char * 
+iconv_helper(iconv_t cd,
+            const char * in) {
   size_t inSize;
   char * buf;
   char * ibuf;

Deleted: Extractor/src/main/test_binary.c
===================================================================
--- Extractor/src/main/test_binary.c    2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/src/main/test_binary.c    2009-12-13 23:02:19 UTC (rev 9746)
@@ -1,66 +0,0 @@
-/*
-     This file is part of libextractor.
-     (C) 2005 Vidyut Samanta and Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
- */
-
-#include "platform.h"
-#include "extractor.h"
-
-static int test(const char * buf,
-               size_t size) {
-  char * enc;
-  unsigned char * dec;
-  size_t out;
-
-  enc = EXTRACTOR_binaryEncode((const unsigned char*) buf,
-                              size);
-  if (0 != EXTRACTOR_binaryDecode(enc,
-                                 &dec,
-                                 &out)) {
-    free(enc);
-    return 0;
-  }
-  free(enc);
-  if (out != size) {
-    free(dec);
-    return 0;
-  }
-  if (0 != memcmp(buf,
-                 dec,
-                 size)) {
-    free(dec);
-    return 0;
-  }
-  free(dec);
-  return 1;
-}
-
-#define MAX 1024
-
-int main(int argc,
-        char * argv[]) {
-  unsigned int i;
-  char buf[MAX];
-
-  for (i=0;i<MAX;i++) {
-    buf[i] = (char) rand();
-    if (! test(buf, i))
-      return -1;
-  }
-  return 0;
-}

Deleted: Extractor/src/main/winproc.c
===================================================================
--- Extractor/src/main/winproc.c        2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/src/main/winproc.c        2009-12-13 23:02:19 UTC (rev 9746)
@@ -1,51 +0,0 @@
-/*
-     This file is part of GNUnet.
-     (C) 2001, 2002, 2003, 2004, 2005 Christian Grothoff (and other 
contributing authors)
-
-     GNUnet is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     GNUnet is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with GNUnet; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
-*/
-
-/**
- * @file util/win/winproc.c
- * @brief Functions for MS Windows
- * @author Nils Durner
- * @note This file differs from GNUnet's winproc.c
- */
-
-#include "platform.h"
-
-#ifdef MINGW
-
-/**
- * Initialize PlibC and set up Windows environment
- * @return Error code from winerror.h, ERROR_SUCCESS on success
-*/
-void InitWinEnv()
-{
-       plibc_init("GNU", PACKAGE);
-}
-
-/**
- * Clean up Windows environment
- */
-void ShutdownWinEnv()
-{
-       plibc_shutdown();
-}
-
-#endif /* MINGW */
-
-/* end of winproc.c */

Modified: Extractor/src/plugins/Makefile.am
===================================================================
--- Extractor/src/plugins/Makefile.am   2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/src/plugins/Makefile.am   2009-12-13 23:02:19 UTC (rev 9746)
@@ -60,8 +60,8 @@
 endif
 
 # toggle for development
-# SUBDIRS = . 
-SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash $(oledir) 
$(rpm) $(xpdfdir) $(exiv2dir) 
+SUBDIRS = . 
+# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash $(oledir) 
$(rpm) $(xpdfdir) $(exiv2dir) 
 
 
 if HAVE_VORBISFILE
@@ -85,25 +85,46 @@
 oodir = oo
 endif
 
-plugin_LTLIBRARIES = $(pdfplugin) \
+plugin_LTLIBRARIES = \
+  libextractor_html.la \
+  libextractor_it.la \
+  libextractor_mime.la 
+
+libextractor_html_la_SOURCES = \
+  html_extractor.c 
+libextractor_html_la_LDFLAGS = \
+  $(PLUGINFLAGS)
+libextractor_html_la_LIBADD = \
+  $(top_builddir)/src/common/libextractor_common.la
+
+libextractor_it_la_SOURCES = \
+  it_extractor.c 
+libextractor_it_la_LDFLAGS = \
+  $(PLUGINFLAGS)
+
+
+libextractor_mime_la_SOURCES = \
+  mime_extractor.c 
+libextractor_mime_la_LDFLAGS = \
+  $(PLUGINFLAGS)
+
+
+
+OLD_LIBS = \
+  $(pdfplugin) \
   libextractor_applefile.la \
   libextractor_asf.la \
   libextractor_deb.la \
   libextractor_dvi.la \
   libextractor_elf.la \
-  libextractor_filename.la \
   $(extraflac) \
   libextractor_flv.la \
   libextractor_gif.la \
-  libextractor_html.la \
   libextractor_id3v2.la \
   libextractor_id3v24.la \
   libextractor_id3v23.la \
-  libextractor_it.la \
   libextractor_jpeg.la \
-  libextractor_lower.la \
   libextractor_man.la \
-  libextractor_mime.la \
   libextractor_mp3.la \
   $(extrampeg) \
   libextractor_nsf.la \
@@ -116,11 +137,9 @@
   libextractor_riff.la \
   libextractor_s3m.la \
   libextractor_sid.la \
-  libextractor_split.la \
   libextractor_tar.la \
   libextractor_tiff.la \
   $(thumbqt) \
-  libextractor_translit.la \
   libextractor_wav.la \
   libextractor_xm.la \
   libextractor_zip.la 
@@ -205,13 +224,6 @@
 libextractor_id3v24_la_LIBADD = \
   $(top_builddir)/src/common/libextractor_common.la
 
-libextractor_it_la_SOURCES = \
-  itextractor.c 
-libextractor_it_la_LDFLAGS = \
-  $(PLUGINFLAGS)
-libextractor_it_la_LIBADD = \
-  $(top_builddir)/src/main/libextractor.la
-
 libextractor_dvi_la_SOURCES = \
   dviextractor.c 
 libextractor_dvi_la_LDFLAGS = \
@@ -231,11 +243,6 @@
   $(top_builddir)/src/main/libextractor.la -lz
 endif
 
-libextractor_lower_la_SOURCES = \
-  lowerextractor.c
-libextractor_lower_la_LDFLAGS = \
-  $(PLUGINFLAGS)
-
 libextractor_gif_la_SOURCES = \
   gifextractor.c 
 libextractor_gif_la_LDFLAGS = \
@@ -279,14 +286,6 @@
 libextractor_jpeg_la_LIBADD = \
   $(LE_LIBINTL)
 
-libextractor_html_la_SOURCES = \
-  htmlextractor.c 
-libextractor_html_la_LDFLAGS = \
-  $(PLUGINFLAGS)
-libextractor_html_la_LIBADD = \
-  $(top_builddir)/src/main/libextractor.la \
-  $(top_builddir)/src/common/libextractor_common.la
-
 libextractor_flv_la_SOURCES = \
   flvextractor.c
 libextractor_flv_la_LDFLAGS = \
@@ -299,13 +298,6 @@
 libextractor_real_la_LDFLAGS = \
   $(PLUGINFLAGS)
 
-libextractor_mime_la_SOURCES = \
-  mimeextractor.c 
-libextractor_mime_la_LDFLAGS = \
-  $(PLUGINFLAGS)
-libextractor_mime_la_LIBADD = \
-  $(top_builddir)/src/main/libextractor.la
-
 if HAVE_MPEG2
 libextractor_mpeg_la_SOURCES = \
   mpegextractor.c 
@@ -354,15 +346,6 @@
   -lz
 endif
 
-libextractor_filename_la_SOURCES = \
-  filenameextractor.c 
-libextractor_filename_la_LDFLAGS = \
-  $(PLUGINFLAGS)
-libextractor_filename_la_LIBADD = \
-  $(top_builddir)/src/main/libextractor.la \
-  $(top_builddir)/src/common/libextractor_common.la \
-  $(LE_LIBINTL)
-
 libextractor_sid_la_SOURCES = \
   sidextractor.c 
 libextractor_sid_la_LDFLAGS = \
@@ -398,18 +381,8 @@
 libextractor_s3m_la_LIBADD = \
   $(top_builddir)/src/main/libextractor.la
 
-libextractor_split_la_SOURCES = \
-  splitextractor.c 
-libextractor_split_la_LDFLAGS = \
-  $(PLUGINFLAGS)
-
-libextractor_translit_la_SOURCES = \
-  translitextractor.c 
-libextractor_translit_la_LDFLAGS = \
-  $(PLUGINFLAGS)
-
 libextractor_thumbnailqt_la_SOURCES = \
-       thumbnailextractorqt.cc
+  thumbnailextractorqt.cc
 libextractor_thumbnailqt_la_LDFLAGS = \
   $(PLUGINFLAGS)
 libextractor_thumbnailqt_la_LIBADD = \

Deleted: Extractor/src/plugins/filenameextractor.c
===================================================================
--- Extractor/src/plugins/filenameextractor.c   2009-12-12 20:09:31 UTC (rev 
9745)
+++ Extractor/src/plugins/filenameextractor.c   2009-12-13 23:02:19 UTC (rev 
9746)
@@ -1,72 +0,0 @@
-/*
-     This file is part of libextractor.
-     (C) 2002, 2003, 2004 Vidyut Samanta and Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
- */
-
-#include "platform.h"
-#include "extractor.h"
-#include "convert.h"
-
-
-/* "extract" the 'filename' as a keyword */
-struct EXTRACTOR_Keywords *
-libextractor_filename_extract (const char *filename,
-                               char *date,
-                               size_t size, struct EXTRACTOR_Keywords *prev)
-{
-  EXTRACTOR_KeywordList *keyword;
-  struct stat fstatbuf;
-  const char *filenameRoot = filename;
-  int res;
-
-  /* get filename */
-  if (filename == NULL)
-    return prev;    
-  for (res = strlen (filename) - 1; res >= 0; res--)
-    if (filename[res] == DIR_SEPARATOR)
-      {
-       filenameRoot = &filename[res + 1];
-       break;
-      }
-  keyword = malloc (sizeof (EXTRACTOR_KeywordList));
-  keyword->next = prev;
-  keyword->keyword = EXTRACTOR_common_convert_to_utf8 (filenameRoot,
-                                                      strlen (filenameRoot),
-                                                      nl_langinfo (CODESET));
-  keyword->keywordType = EXTRACTOR_FILENAME;
-  prev = keyword;
-  if (-1 == STAT(filename, &fstatbuf))
-    return prev;
-  keyword = malloc (sizeof (EXTRACTOR_KeywordList));
-  keyword->next = prev;
-  keyword->keyword = malloc (14);
-  keyword->keywordType = EXTRACTOR_FILE_SIZE;
-  
-  if (size >= 1000000000)
-    snprintf (keyword->keyword, 14, "%.2f %s", fstatbuf.st_size / 1000000000.0,
-             _("GB"));
-  else if (size >= 1000000)
-    snprintf (keyword->keyword, 14, "%.2f %s", fstatbuf.st_size / 1000000.0, 
_("MB"));
-  else if (size >= 1000)
-    snprintf (keyword->keyword, 14, "%.2f %s", fstatbuf.st_size / 1000.0, 
_("KB"));
-  else
-    snprintf (keyword->keyword, 14, "%.2f %s", (double)  fstatbuf.st_size, 
_("Bytes"));
-  
-  prev = keyword;
-  return prev;
-}

Copied: Extractor/src/plugins/html_extractor.c (from rev 9738, 
Extractor/src/plugins/htmlextractor.c)
===================================================================
--- Extractor/src/plugins/html_extractor.c                              (rev 0)
+++ Extractor/src/plugins/html_extractor.c      2009-12-13 23:02:19 UTC (rev 
9746)
@@ -0,0 +1,403 @@
+/*
+     This file is part of libextractor.
+     (C) 2002, 2003, 2004, 2005 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 2, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+
+ */
+
+#include "platform.h"
+#include "extractor.h"
+#include <string.h>
+#include "convert.h"
+
+/**
+ * Mapping from the value of the "name" attribute of a <meta> tag to
+ * the libextractor meta type under which its "content" is reported.
+ * Names are compared case-insensitively.  The table is terminated by
+ * an entry whose name is NULL.
+ */
+static struct
+{
+  /* value of the "name" attribute to look for */
+  const char *name;
+  /* meta type reported for the corresponding "content" value */
+  enum EXTRACTOR_MetaType type;
+} tagmap[] = {
+  { "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
+  { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME },
+  { "title", EXTRACTOR_METATYPE_TITLE },
+  { "dc.title", EXTRACTOR_METATYPE_TITLE},
+  { "description", EXTRACTOR_METATYPE_DESCRIPTION },
+  { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
+  { "subject", EXTRACTOR_METATYPE_SUBJECT},
+  { "dc.subject", EXTRACTOR_METATYPE_SUBJECT},
+  { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
+  { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE},
+  { "publisher", EXTRACTOR_METATYPE_PUBLISHER },
+  { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER},
+  { "rights", EXTRACTOR_METATYPE_RIGHTS },
+  { "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
+  { "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
+  { "language", EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE },  
+  { "keywords", EXTRACTOR_METATYPE_KEYWORDS },
+  { "abstract", EXTRACTOR_METATYPE_ABSTRACT },
+  { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
+  { "dc.creator", EXTRACTOR_METATYPE_CREATOR},
+  { "dc.identifier", EXTRACTOR_METATYPE_URI },
+  { "dc.format", EXTRACTOR_METATYPE_FORMAT },
+  { NULL, EXTRACTOR_METATYPE_RESERVED }
+};
+
+/**
+ * Names of the tags that the parser in EXTRACTOR_html_extract records
+ * for later inspection; all other tags are skipped.  Names are compared
+ * case-insensitively.  The list is terminated by NULL.
+ */
+static const char *relevantTags[] = {
+  "title",
+  "meta",
+  NULL,
+};
+
+/**
+ * Position of one tag inside the scanned document.  All pointers refer
+ * into the buffer handed to EXTRACTOR_html_extract; nothing is copied.
+ */
+typedef struct TI
+{
+  /* next recorded tag (singly-linked list), NULL at the end */
+  struct TI *next;
+  /* first character of the tag name (just after the '<') */
+  const char *tagStart;
+  /* first character after the letters of the tag name */
+  const char *tagEnd;
+  /* first non-whitespace character after the closing '>' of the tag */
+  const char *dataStart;
+  /* the next '<' found at or after dataStart */
+  const char *dataEnd;
+} TagInfo;
+
+
+
+
+/* ******************** parser helper functions ************** */
+
+/**
+ * Test whether the character range [s, e) spells the given name,
+ * ignoring case.
+ *
+ * @param tag 0-terminated name to compare against
+ * @param s first character of the candidate range
+ * @param e one past the last character of the candidate range
+ * @return 1 if the range has the same length as tag and matches it
+ *         case-insensitively, 0 otherwise
+ */
+static int
+tagMatch (const char *tag, const char *s, const char *e)
+{
+  size_t span = e - s;
+
+  if (span != strlen (tag))
+    return 0;
+  return 0 == strncasecmp (tag, s, span);
+}
+
+/**
+ * Advance *pos to the next occurrence of character c.
+ *
+ * @param c character to search for
+ * @param pos in: offset to start at; out: offset of the match, or size
+ *        if the end of the data was reached (left untouched if a NUL
+ *        byte was hit first)
+ * @param data buffer to search
+ * @param size number of bytes in data
+ * @return 1 if c was found before the end of the data, 0 if the end or
+ *         a NUL byte (other than c itself) was reached first
+ */
+static int
+lookFor (char c, size_t * pos, const char *data, size_t size)
+{
+  size_t idx;
+
+  for (idx = *pos; idx < size; idx++)
+    {
+      if (data[idx] == c)
+        break;
+      if (data[idx] == '\0')
+        return 0;
+    }
+  *pos = idx;
+  return idx < size;
+}
+
+/**
+ * Advance *pos past any run of whitespace characters.
+ *
+ * @param pos in: offset to start at; out: offset of the first
+ *        non-whitespace byte, or size if only whitespace remained
+ * @param data buffer to scan
+ * @param size number of bytes in data
+ * @return 1 if a non-whitespace byte was found before the end of the
+ *         data, 0 otherwise
+ */
+static int
+skipWhitespace (size_t * pos, const char *data, size_t size)
+{
+  size_t p = *pos;
+
+  /* The cast is required: passing a negative plain char (any byte
+     >= 0x80 where char is signed) to isspace() is undefined.  A NUL
+     byte is not whitespace, so it simply ends the run. */
+  while ((p < size) && (isspace ((unsigned char) data[p])))
+    p++;
+  *pos = p;
+  return p < size;
+}
+
+/**
+ * Advance *pos past any run of alphabetic characters.
+ *
+ * @param pos in: offset to start at; out: offset of the first
+ *        non-alphabetic byte, or size if only letters remained
+ * @param data buffer to scan
+ * @param size number of bytes in data
+ * @return 1 if a non-alphabetic byte was found before the end of the
+ *         data, 0 otherwise
+ */
+static int
+skipLetters (size_t * pos, const char *data, size_t size)
+{
+  size_t p = *pos;
+
+  /* The cast is required: passing a negative plain char (any byte
+     >= 0x80 where char is signed) to isalpha() is undefined.  A NUL
+     byte is not a letter, so it simply ends the run. */
+  while ((p < size) && (isalpha ((unsigned char) data[p])))
+    p++;
+  *pos = p;
+  return p < size;
+}
+
+/**
+ * Advance *pos to the next byte that is one of the characters in c.
+ *
+ * @param c 0-terminated set of characters to search for
+ * @param pos in: offset to start at; out: offset of the match, or size
+ *        if the end of the data was reached (left untouched if a NUL
+ *        byte was hit first)
+ * @param data buffer to search
+ * @param size number of bytes in data
+ * @return 1 if one of the characters was found before the end of the
+ *         data, 0 if the end or a NUL byte was reached first
+ */
+static int
+lookForMultiple (const char *c, size_t * pos, const char *data, size_t size)
+{
+  size_t p = *pos;
+
+  while (p < size)
+    {
+      /* Check for NUL before calling strchr(): strchr() treats the
+         terminator of c as part of the string, so a NUL byte in the
+         data would otherwise be reported as a match. */
+      if (data[p] == '\0')
+        return 0;
+      if (strchr (c, data[p]) != NULL)
+        break;
+      p++;
+    }
+  *pos = p;
+  return p < size;
+}
+
+/**
+ * Locate the value of the attribute "key" inside a tag's attribute
+ * text.  Scanning begins one character after start.  A match requires
+ * that key (compared case-sensitively) is immediately followed by '='.
+ * A value that begins with a single or double quote extends to the
+ * matching quote (quotes excluded); any other value extends to the
+ * next whitespace character.  Values never extend beyond end.
+ *
+ * @param key 0-terminated attribute name to look for
+ * @param start beginning of the text to search
+ * @param end one past the last character of the text to search
+ * @param mstart set to the first character of the value, or to NULL
+ *        if key was not found
+ * @param mend set to one past the last character of the value, or to
+ *        NULL if key was not found
+ */
+static void
+findEntry (const char *key,
+           const char *start,
+           const char *end, const char **mstart, const char **mend)
+{
+  size_t len;
+  char quote;
+
+  *mstart = NULL;
+  *mend = NULL;
+  len = strlen (key);
+  /* Compare lengths instead of computing "end - len - 1", which would
+     point before the searched text whenever the text is shorter than
+     the key. */
+  while ((start < end) && ((size_t) (end - start) > len + 1))
+    {
+      start++;
+      if (start[len] != '=')
+        continue;
+      if (0 != strncmp (start, key, len))
+        continue;
+      start += len + 1;         /* skip key and '=' */
+      *mstart = start;
+      if ((start < end) && ((*start == '\"') || (*start == '\'')))
+        {
+          quote = *start;
+          start++;
+          while ((start < end) && (*start != quote))
+            start++;
+          (*mstart)++;          /* skip opening quote */
+        }
+      else
+        {
+          /* cast: isspace() is undefined for negative char values */
+          while ((start < end) && (!isspace ((unsigned char) *start)))
+            start++;
+        }
+      *mend = start;
+      return;
+    }
+}
+
+/**
+ * Search all tags that correspond to "tagname".  Example:
+ * If the tag is <meta name="foo" desc="bar">, and
+ * tagname == "meta", keyname="name", keyvalue="foo",
+ * and searchname="desc", then this function returns a
+ * copy (!) of "bar".
+ *
+ * @param t head of the list of recorded tags, may be NULL
+ * @param tagname tag name to match (case-insensitive)
+ * @param keyname attribute whose value selects the tag
+ * @param keyvalue required value of keyname (case-insensitive)
+ * @param searchname attribute whose value is returned
+ * @return freshly allocated 0-terminated copy of the value (the
+ *         caller must free it); NULL if nothing is found or memory
+ *         could not be allocated
+ */
+static char *
+findInTags (TagInfo * t,
+            const char *tagname,
+            const char *keyname, const char *keyvalue, const char *searchname)
+{
+  const char *pstart;
+  const char *pend;
+  size_t len;
+  char *ret;
+
+  for (; t != NULL; t = t->next)
+    {
+      if (!tagMatch (tagname, t->tagStart, t->tagEnd))
+        continue;
+      /* the attributes live between the tag name and the tag's data */
+      findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
+      if ((pstart == NULL) || (!tagMatch (keyvalue, pstart, pend)))
+        continue;
+      findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
+      if (pstart == NULL)
+        continue;
+      len = pend - pstart;
+      ret = malloc (len + 1);
+      if (ret == NULL)
+        return NULL;            /* out of memory: report "not found" */
+      memcpy (ret, pstart, len);
+      ret[len] = '\0';
+      return ret;
+    }
+  return NULL;
+}
+
+
+/**
+ * Extract meta data from an HTML document (mimetype = text/html).
+ *
+ * Records the <title> and <meta> tags found near the beginning of the
+ * document (parsing stops at the <body> tag, at a NUL byte, or once
+ * the scan position passes the first 32k) and then reports:
+ * - the mime type "text/html" if a <meta http-equiv="content-type">
+ *   tag says so,
+ * - the "content" of every <meta name="..."> tag listed in tagmap,
+ * - the text following each <title> tag.
+ * If the content-type tag names a charset, values are converted from
+ * that charset to UTF-8 before being reported.
+ *
+ * @param data the document
+ * @param size number of bytes in data
+ * @param proc function called for each meta data item found
+ * @param proc_cls closure passed as first argument to proc
+ * @param options unused
+ * @return 0 if every call to proc returned 0 (or nothing was found);
+ *         otherwise the non-zero value returned by proc, after which
+ *         no further items are reported
+ */
+int 
+EXTRACTOR_html_extract (const char *data,
+                       size_t size,
+                       EXTRACTOR_MetaDataProcessor proc,
+                       void *proc_cls,
+                       const char *options)
+{
+  size_t xsize;
+  TagInfo *tags;
+  TagInfo *t;
+  TagInfo tag;
+  size_t pos;
+  size_t tpos;
+  int i;
+  char *charset;
+  char *tmp;
+  char *xtmp;
+  int ret;
+
+  ret = 0;
+  if (size == 0)
+    return 0;
+  /* only start parsing tags within the first 32k */
+  if (size > 1024 * 32)
+    xsize = 1024 * 32;
+  else
+    xsize = size;
+  tags = NULL;
+  tag.next = NULL;
+  pos = 0;
+  while (pos < xsize)
+    {
+      if (!lookFor ('<', &pos, data, size))
+        break;
+      tag.tagStart = &data[++pos];
+      if (!skipLetters (&pos, data, size))
+        break;
+      tag.tagEnd = &data[pos];
+      if (!skipWhitespace (&pos, data, size))
+        break;
+    STEP3:
+      if (!lookForMultiple (">\"\'", &pos, data, size))
+        break;
+      if (data[pos] != '>')
+        {
+          /* find end-quote, ignore escaped quotes (\') */
+          do
+            {
+              tpos = pos;
+              pos++;
+              if (!lookFor (data[tpos], &pos, data, size))
+                break;
+            }
+          while (data[pos - 1] == '\\');
+          pos++;
+          goto STEP3;
+        }
+      pos++;
+      if (!skipWhitespace (&pos, data, size))
+        break;
+      tag.dataStart = &data[pos];
+      if (!lookFor ('<', &pos, data, size))
+        break;
+      tag.dataEnd = &data[pos];
+      /* remember the tag if it is one of the relevant ones */
+      for (i = 0; relevantTags[i] != NULL; i++)
+        {
+          if (!tagMatch (relevantTags[i], tag.tagStart, tag.tagEnd))
+            continue;
+          t = malloc (sizeof (TagInfo));
+          if (t != NULL)
+            {
+              *t = tag;
+              t->next = tags;
+              tags = t;
+            }
+          break;
+        }
+      /* abort early if we hit the body tag */
+      if (tagMatch ("body", tag.tagStart, tag.tagEnd))
+        break;
+    }
+
+  /* fast exit */
+  if (tags == NULL)
+    return 0;
+
+  charset = NULL;
+  /* first, try to determine mime type and/or character set */
+  tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
+  if (tmp != NULL)
+    {
+      /* ideally, tmp == "text/html; charset=ISO-XXXX-Y" or something
+         like that; if text/html is present, we take that as the
+         mime-type; if charset= is present, we try to use that for
+         character set conversion. */
+      if (0 == strncmp (tmp, "text/html", strlen ("text/html")))
+        ret = proc (proc_cls, 
+                   "html",
+                   EXTRACTOR_METATYPE_MIMETYPE,
+                   EXTRACTOR_METAFORMAT_UTF8,
+                   "text/plain",
+                   "text/html",
+                   strlen ("text/html")+1);
+      charset = strstr (tmp, "charset=");
+      if (charset != NULL)
+        charset = strdup (&charset[strlen ("charset=")]);
+      free (tmp);
+    }
+  /* report the <meta name="..." content="..."> entries; stop looking
+     as soon as the processor asked us to abort */
+  for (i = 0; (ret == 0) && (tagmap[i].name != NULL); i++)
+    {
+      tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
+      if (tmp == NULL)
+        continue;
+      if (charset == NULL)
+        {
+          ret = proc (proc_cls,
+                      "html",
+                      tagmap[i].type,
+                      EXTRACTOR_METAFORMAT_C_STRING,
+                      "text/plain",
+                      tmp,
+                      strlen (tmp) + 1);
+        }
+      else
+        {
+          xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
+                                                   strlen (tmp),
+                                                   charset);
+          /* defensive: skip the item should the conversion yield
+             no result */
+          if (xtmp != NULL)
+            {
+              ret = proc (proc_cls,
+                          "html",
+                          tagmap[i].type,
+                          EXTRACTOR_METAFORMAT_UTF8,
+                          "text/plain",
+                          xtmp,
+                          strlen (xtmp) + 1);
+              free (xtmp);
+            }
+        }
+      free (tmp);
+    }
+  /* report the titles and release the tag list */
+  while (tags != NULL) 
+    {
+      t = tags;
+      if ( (ret == 0) &&
+           (tagMatch ("title", t->tagStart, t->tagEnd)) )
+        {
+          if (charset == NULL)
+            {
+              xtmp = malloc (t->dataEnd - t->dataStart + 1);
+              if (xtmp != NULL)
+                {
+                  memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
+                  xtmp[t->dataEnd - t->dataStart] = '\0';
+                  ret = proc (proc_cls,
+                              "html",
+                              EXTRACTOR_METATYPE_TITLE,
+                              EXTRACTOR_METAFORMAT_C_STRING,
+                              "text/plain",
+                              xtmp,
+                              strlen (xtmp) + 1);
+                  free (xtmp);
+                }
+            }
+          else
+            {
+              /* convert the title text itself; "tmp" must not be used
+                 here, it has already been freed (or is NULL) */
+              xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
+                                                       t->dataEnd - t->dataStart,
+                                                       charset);
+              if (xtmp != NULL)
+                {
+                  ret = proc (proc_cls,
+                              "html",
+                              EXTRACTOR_METATYPE_TITLE,
+                              EXTRACTOR_METAFORMAT_UTF8,
+                              "text/plain",
+                              xtmp,
+                              strlen (xtmp) + 1);
+                  free (xtmp);
+                }
+            }
+        }
+      tags = t->next;
+      free (t);
+    }
+  free (charset);
+  return ret;
+}

Deleted: Extractor/src/plugins/htmlextractor.c
===================================================================
--- Extractor/src/plugins/htmlextractor.c       2009-12-12 20:09:31 UTC (rev 
9745)
+++ Extractor/src/plugins/htmlextractor.c       2009-12-13 23:02:19 UTC (rev 
9746)
@@ -1,446 +0,0 @@
-/*
-     This file is part of libextractor.
-     (C) 2002, 2003, 2004, 2005 Vidyut Samanta and Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
-
- */
-
-#include "platform.h"
-#include "extractor.h"
-#include <string.h>
-#include "convert.h"
-
-static struct
-{
-  char *name;
-  EXTRACTOR_KeywordType type;
-} tagmap[] =
-{
-  {
-  "author", EXTRACTOR_AUTHOR},
-  {
-  "title", EXTRACTOR_TITLE},
-  {
-  "description", EXTRACTOR_DESCRIPTION},
-  {
-  "language", EXTRACTOR_LANGUAGE},
-  {
-  "rights", EXTRACTOR_COPYRIGHT},
-  {
-  "publisher", EXTRACTOR_PUBLISHER},
-  {
-  "formatter", EXTRACTOR_SOFTWARE},
-  {
-  "copyright", EXTRACTOR_COPYRIGHT},
-  {
-  "abstract", EXTRACTOR_SUMMARY},
-  {
-  "subject", EXTRACTOR_SUBJECT},
-  {
-  "abstract", EXTRACTOR_SUMMARY},
-  {
-  "date", EXTRACTOR_DATE},
-  {
-  "keywords", EXTRACTOR_KEYWORDS},
-  {
-  "dc.author", EXTRACTOR_AUTHOR},
-  {
-  "dc.title", EXTRACTOR_TITLE},
-  {
-  "dc.description", EXTRACTOR_DESCRIPTION},
-  {
-  "dc.subject", EXTRACTOR_SUBJECT},
-  {
-  "dc.creator", EXTRACTOR_CREATOR},
-  {
-  "dc.publisher", EXTRACTOR_PUBLISHER},
-  {
-  "dc.date", EXTRACTOR_DATE},
-  {
-  "dc.format", EXTRACTOR_FORMAT},
-  {
-  "dc.identifier", EXTRACTOR_RESOURCE_IDENTIFIER},
-  {
-  "dc.rights", EXTRACTOR_COPYRIGHT},
-  {
-NULL, EXTRACTOR_UNKNOWN},};
-
-static char *relevantTags[] = {
-  "title",
-  "meta",
-  NULL,
-};
-
-/* which mime-types should not be subjected to
-   the HTML extractor (no use trying & parsing
-   is expensive!) */
-static char *blacklist[] = {
-  "image/jpeg",
-  "image/gif",
-  "image/png",
-  "image/x-png",
-  "image/xcf",
-  "image/tiff",
-  "application/java",
-  "application/pdf",
-  "application/postscript",
-  "application/elf",
-  "application/gnunet-directory",
-  "application/x-gzip",
-  "application/bz2",
-  "application/x-rpm",
-  "application/x-rar",
-  "application/x-zip",
-  "application/x-arj",
-  "application/x-compress",
-  "application/x-tar",
-  "application/x-lha",
-  "application/x-gtar",
-  "application/x-dpkg",
-  "application/ogg",
-  "audio/real",
-  "audio/x-wav",
-  "audio/avi",
-  "audio/midi",
-  "audio/mpeg",
-  "video/real",
-  "video/asf",
-  "video/quicktime",
-  NULL,
-};
-
-typedef struct TI
-{
-  struct TI *next;
-  const char *tagStart;
-  const char *tagEnd;
-  const char *dataStart;
-  const char *dataEnd;
-} TagInfo;
-
-/**
- * Add a keyword.
- */
-static struct EXTRACTOR_Keywords *
-addKeyword (EXTRACTOR_KeywordType type,
-            char *keyword, struct EXTRACTOR_Keywords *next)
-{
-  EXTRACTOR_KeywordList *result;
-
-  result = malloc (sizeof (EXTRACTOR_KeywordList));
-  result->next = next;
-  result->keyword = keyword;
-  result->keywordType = type;
-  return result;
-}
-
-/* ******************** parser helper functions ************** */
-
-static int
-tagMatch (const char *tag, const char *s, const char *e)
-{
-  return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s)));
-}
-
-static int
-lookFor (char c, size_t * pos, const char *data, size_t size)
-{
-  size_t p = *pos;
-
-  while ((p < size) && (data[p] != c))
-    {
-      if (data[p] == '\0')
-        return 0;
-      p++;
-    }
-  *pos = p;
-  return p < size;
-}
-
-static int
-skipWhitespace (size_t * pos, const char *data, size_t size)
-{
-  size_t p = *pos;
-
-  while ((p < size) && (isspace (data[p])))
-    {
-      if (data[p] == '\0')
-        return 0;
-      p++;
-    }
-  *pos = p;
-  return p < size;
-}
-
-static int
-skipLetters (size_t * pos, const char *data, size_t size)
-{
-  size_t p = *pos;
-
-  while ((p < size) && (isalpha (data[p])))
-    {
-      if (data[p] == '\0')
-        return 0;
-      p++;
-    }
-  *pos = p;
-  return p < size;
-}
-
-static int
-lookForMultiple (const char *c, size_t * pos, const char *data, size_t size)
-{
-  size_t p = *pos;
-
-  while ((p < size) && (strchr (c, data[p]) == NULL))
-    {
-      if (data[p] == '\0')
-        return 0;
-      p++;
-    }
-  *pos = p;
-  return p < size;
-}
-
-static void
-findEntry (const char *key,
-           const char *start,
-           const char *end, const char **mstart, const char **mend)
-{
-  size_t len;
-
-  *mstart = NULL;
-  *mend = NULL;
-  len = strlen (key);
-  while (start < end - len - 1)
-    {
-      start++;
-      if (start[len] != '=')
-        continue;
-      if (0 == strncmp (start, key, len))
-        {
-          start += len + 1;
-          *mstart = start;
-          if ((*start == '\"') || (*start == '\''))
-            {
-              start++;
-              while ((start < end) && (*start != **mstart))
-                start++;
-              (*mstart)++;      /* skip quote */
-            }
-          else
-            {
-              while ((start < end) && (!isspace (*start)))
-                start++;
-            }
-          *mend = start;
-          return;
-        }
-    }
-}
-
-/**
- * Search all tags that correspond to "tagname".  Example:
- * If the tag is <meta name="foo" desc="bar">, and
- * tagname == "meta", keyname="name", keyvalue="foo",
- * and searchname="desc", then this function returns a
- * copy (!) of "bar".  Easy enough?
- *
- * @return NULL if nothing is found
- */
-static char *
-findInTags (TagInfo * t,
-            const char *tagname,
-            const char *keyname, const char *keyvalue, const char *searchname)
-{
-  const char *pstart;
-  const char *pend;
-
-  while (t != NULL)
-    {
-      if (tagMatch (tagname, t->tagStart, t->tagEnd))
-        {
-          findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
-          if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
-            {
-              findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
-              if (pstart != NULL)
-                {
-                  char *ret = malloc (pend - pstart + 1);
-                  memcpy (ret, pstart, pend - pstart);
-                  ret[pend - pstart] = '\0';
-                  return ret;
-                }
-            }
-        }
-      t = t->next;
-    }
-  return NULL;
-}
-
-
-/* mimetype = text/html */
-struct EXTRACTOR_Keywords *
-libextractor_html_extract (const char *filename,
-                           const char *data,
-                           const size_t size, struct EXTRACTOR_Keywords *prev)
-{
-  size_t xsize;
-  const char *mime;
-  TagInfo *tags;
-  TagInfo *t;
-  TagInfo tag;
-  size_t pos;
-  size_t tpos;
-  int i;
-  char *charset;
-  char *tmp;
-
-  if (size == 0)
-    return prev;
-
-  mime = EXTRACTOR_extractLast (EXTRACTOR_MIMETYPE, prev);
-  if (mime != NULL)
-    {
-      int j;
-      j = 0;
-      while (blacklist[j] != NULL)
-        {
-          if (0 == strcmp (blacklist[j], mime))
-            return prev;
-          j++;
-        }
-    }
-
-  /* only scan first 32k */
-  if (size > 1024 * 32)
-    xsize = 1024 * 32;
-  else
-    xsize = size;
-  tags = NULL;
-  tag.next = NULL;
-  pos = 0;
-  while (pos < xsize)
-    {
-      if (!lookFor ('<', &pos, data, size))
-        break;
-      tag.tagStart = &data[++pos];
-      if (!skipLetters (&pos, data, size))
-        break;
-      tag.tagEnd = &data[pos];
-      if (!skipWhitespace (&pos, data, size))
-        break;
-    STEP3:
-      if (!lookForMultiple (">\"\'", &pos, data, size))
-        break;
-      if (data[pos] != '>')
-        {
-          /* find end-quote, ignore escaped quotes (\') */
-          do
-            {
-              tpos = pos;
-              pos++;
-              if (!lookFor (data[tpos], &pos, data, size))
-                break;
-            }
-          while (data[pos - 1] == '\\');
-          pos++;
-          goto STEP3;
-        }
-      pos++;
-      if (!skipWhitespace (&pos, data, size))
-        break;
-      tag.dataStart = &data[pos];
-      if (!lookFor ('<', &pos, data, size))
-        break;
-      tag.dataEnd = &data[pos];
-      i = 0;
-      while (relevantTags[i] != NULL)
-        {
-          if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
-              (0 == strncasecmp (relevantTags[i],
-                                 tag.tagStart, tag.tagEnd - tag.tagStart)))
-            {
-              t = malloc (sizeof (TagInfo));
-              *t = tag;
-              t->next = tags;
-              tags = t;
-              break;
-            }
-          i++;
-        }
-      /* abort early if we hit the body tag */
-      if (tagMatch ("body", tag.tagStart, tag.tagEnd))
-        break;
-    }
-
-  /* fast exit */
-  if (tags == NULL)
-    return prev;
-
-  charset = NULL;
-
-  /* first, try to determine mime type and/or character set */
-  tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
-  if (tmp != NULL)
-    {
-      /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like 
that;
-         if text/html is present, we take that as the mime-type; if charset=
-         is present, we try to use that for character set conversion. */
-      if (0 == strncmp (tmp, "text/html", strlen ("text/html")))
-        prev = addKeyword (EXTRACTOR_MIMETYPE, strdup ("text/html"), prev);
-
-      charset = strstr (tmp, "charset=");
-
-      if (charset != NULL)
-        charset = strdup (&charset[strlen ("charset=")]);
-      free (tmp);
-    }
-  if (charset == NULL)
-    charset = strdup ("ISO-8859-1");    /* try a sensible default */
-
-
-  i = 0;
-  while (tagmap[i].name != NULL)
-    {
-      tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
-      if (tmp != NULL)
-        {
-          prev = addKeyword (tagmap[i].type,
-              EXTRACTOR_common_convert_to_utf8 (tmp,
-                                            strlen (tmp), charset), prev);
-          free (tmp);
-        }
-      i++;
-    }
-
-
-  while (tags != NULL)
-    {
-      t = tags;
-      if (tagMatch ("title", t->tagStart, t->tagEnd))
-        prev = addKeyword (EXTRACTOR_TITLE,
-            EXTRACTOR_common_convert_to_utf8 (t->dataStart,
-                                          t->dataEnd - t->dataStart,
-                                          charset), prev);
-      tags = t->next;
-      free (t);
-    }
-  free (charset);
-
-  return prev;
-}

Copied: Extractor/src/plugins/it_extractor.c (from rev 9738, 
Extractor/src/plugins/itextractor.c)
===================================================================
--- Extractor/src/plugins/it_extractor.c                                (rev 0)
+++ Extractor/src/plugins/it_extractor.c        2009-12-13 23:02:19 UTC (rev 
9746)
@@ -0,0 +1,102 @@
+/*
+ * This file is part of libextractor.
+ * (C) 2008 Toni Ruottu
+ *
+ * libextractor is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2, or (at your
+ * option) any later version.
+ *
+ * libextractor is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with libextractor; see the file COPYING.  If not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#include "platform.h"
+#include "extractor.h"
+
+#define HEADER_SIZE  0xD0
+
+/**
+ * Overlay for the leading bytes of the input buffer; the extractor
+ * below casts the raw data pointer to this layout.  Only magicid,
+ * title and version are read by the code in this file.
+ */
+struct header
+{
+  char magicid[4];              /* compared against "IMPM" */
+  char title[26];               /* copied out and NUL-terminated by the extractor */
+  char hilight[2];
+  char orders[2];
+  char instruments[2];
+  char samples[2];
+  char patterns[2];
+  char version[2];              /* both bytes are formatted into the version string */
+  char compatible[2];
+  char flags[2];
+  char special[2];
+};
+
+/**
+ * "extract" keywords from an Impulse Tracker module.
+ *
+ * ITTECH.TXT as taken from IT 2.14p5 was used,
+ * while this piece of software was originally
+ * written.
+ *
+ * Reports, in this order, the mime type ("audio/x-it"), the tracker
+ * version and the song title through proc.
+ *
+ * NOTE(review): this function lives in it_extractor.c but carries the
+ * symbol name of the mime plugin; it presumably should be called
+ * EXTRACTOR_it_extract -- confirm against the plugin loader before
+ * renaming.
+ *
+ * @param data start of the file contents
+ * @param size number of bytes available at data
+ * @param proc callback invoked once per metadata item
+ * @param proc_cls closure handed to proc as its first argument
+ * @param options not used by this function
+ * @return 0 if the input is shorter than HEADER_SIZE, does not start
+ *         with "IMPM", or every item was reported with proc returning 0;
+ *         1 as soon as proc returns non-zero
+ */
+int
+EXTRACTOR_mime_extract (const char *data,
+                       size_t size,
+                       EXTRACTOR_MetaDataProcessor proc,
+                       void *proc_cls,
+                       const char *options)
+{
+  char title[27];
+  char itversion[8];
+  const struct header *head;
+
+  /* Check header size: the overlay below is only valid if enough
+     bytes are present */
+  if (size < HEADER_SIZE)
+    return 0;
+  /* keep the const qualifier, the input buffer is never written */
+  head = (const struct header *) data;
+  /* Check "magic" id bytes */
+  if (0 != memcmp (head->magicid, "IMPM", 4))
+    return 0;
+  /* Mime-type */
+  if (0 != proc (proc_cls,
+                "it",
+                EXTRACTOR_METATYPE_MIMETYPE,
+                EXTRACTOR_METAFORMAT_UTF8,
+                "text/plain",
+                "audio/x-it",
+                strlen ("audio/x-it") + 1))
+    return 1;
+
+  /* Version of Tracker; bounded write so the fixed-size buffer can
+     never be overrun */
+  snprintf (itversion,
+            sizeof (itversion),
+            "%d.%d",
+            (head->version[0] & 0x01), head->version[1]);
+  if (0 != proc (proc_cls,
+                "it",
+                EXTRACTOR_METATYPE_FORMAT_VERSION,
+                EXTRACTOR_METAFORMAT_C_STRING,
+                "text/plain",
+                itversion,
+                strlen (itversion) + 1))
+    return 1;
+
+  /* Song title: the field in the file is not guaranteed to be
+     NUL-terminated, so copy it and terminate the copy */
+  memcpy (title, head->title, sizeof (head->title));
+  title[sizeof (title) - 1] = '\0';
+  if (0 != proc (proc_cls,
+                "it",
+                EXTRACTOR_METATYPE_TITLE,
+                EXTRACTOR_METAFORMAT_C_STRING,
+                "text/plain",
+                title,
+                strlen (title) + 1))
+    return 1;
+  return 0;
+}

Deleted: Extractor/src/plugins/itextractor.c
===================================================================
--- Extractor/src/plugins/itextractor.c 2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/src/plugins/itextractor.c 2009-12-13 23:02:19 UTC (rev 9746)
@@ -1,107 +0,0 @@
-/*
- * This file is part of libextractor.
- * (C) 2008 Toni Ruottu
- *
- * libextractor is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2, or (at your
- * option) any later version.
- *
- * libextractor is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with libextractor; see the file COPYING.  If not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- *
- */
-
-#include "platform.h"
-#include "extractor.h"
-#include "convert.h"
-
-#define HEADER_SIZE  0xD0
-
-struct header
-{
-  char magicid[4];
-  char title[26];
-  char hilight[2];
-  char orders[2];
-  char instruments[2];
-  char samples[2];
-  char patterns[2];
-  char version[2];
-  char compatible[2];
-  char flags[2];
-  char special[2];
-};
-
-
-static struct EXTRACTOR_Keywords *addkword
-  (EXTRACTOR_KeywordList * oldhead,
-   const char *phrase, EXTRACTOR_KeywordType type)
-{
-  EXTRACTOR_KeywordList *keyword;
-
-  keyword = malloc (sizeof (EXTRACTOR_KeywordList));
-  keyword->next = oldhead;
-  keyword->keyword = strdup (phrase);
-  keyword->keywordType = type;
-  return (keyword);
-}
-
-
-/* "extract" keyword from an Impulse Tracker module
- *
- * ITTECH.TXT as taken from IT 2.14p5 was used,
- * while this piece of software was originally
- * written.
- *
- */
-struct EXTRACTOR_Keywords *libextractor_it_extract
-  (const char *filename,
-   char *data, size_t size, struct EXTRACTOR_Keywords *prev)
-{
-  char title[27];
-  char itversion[8];
-  struct header *head;
-
-  /* Check header size */
-
-  if (size < HEADER_SIZE)
-    {
-      return (prev);
-    }
-
-  head = (struct header *) data;
-
-  /* Check "magic" id bytes */
-
-  if (memcmp (head->magicid, "IMPM", 4))
-    {
-      return (prev);
-    }
-
-  /* Mime-type */
-
-  prev = addkword (prev, "audio/x-it", EXTRACTOR_MIMETYPE);
-
-
-  /* Version of Tracker */
-
-  sprintf (itversion, "%d.%d", (head->version[0]& 0x01),head->version[1]);
-  prev = addkword (prev, itversion, EXTRACTOR_FORMAT_VERSION);
-
-  /* Song title */
-
-  memcpy (&title, head->title, 26);
-  title[26] = '\0';
-  prev = addkword (prev, title, EXTRACTOR_TITLE);
-
-  return (prev);
-
-}

Deleted: Extractor/src/plugins/lowerextractor.c
===================================================================
--- Extractor/src/plugins/lowerextractor.c      2009-12-12 20:09:31 UTC (rev 
9745)
+++ Extractor/src/plugins/lowerextractor.c      2009-12-13 23:02:19 UTC (rev 
9746)
@@ -1,80 +0,0 @@
-/*
-     This file is part of libextractor.
-     (C) 2002, 2003 Vidyut Samanta and Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
- */
-
-#include "platform.h"
-#include "extractor.h"
-
-static void
-addKeyword (struct EXTRACTOR_Keywords **list,
-            const char *keyword, EXTRACTOR_KeywordType type)
-{
-  EXTRACTOR_KeywordList *next;
-  next = malloc (sizeof (EXTRACTOR_KeywordList));
-  next->next = *list;
-  next->keyword = strdup (keyword);
-  next->keywordType = type;
-  *list = next;
-}
-
-/* convert other keywords to lower case */
-struct EXTRACTOR_Keywords *
-libextractor_lower_extract (char *filename,
-                            char *data,
-                            size_t size, struct EXTRACTOR_Keywords *prev)
-{
-  struct EXTRACTOR_Keywords *pos;
-  char *lower;
-  unsigned int mem, needed, i;
-
-  pos = prev;
-  lower = NULL;
-  mem = 0;
-
-  while (pos != NULL)
-    {
-      if (pos->keywordType == EXTRACTOR_FILE_SIZE)
-        {
-          pos = pos->next;
-          continue;
-        }
-
-      needed = strlen (pos->keyword) + 1;
-      if (needed > mem)
-        {
-          lower = (lower == NULL) ? realloc (lower, needed) : malloc (needed);
-          mem = needed;
-        }
-
-      for (i = 0; i < needed; i++)
-        {
-          lower[i] = tolower (pos->keyword[i]);
-        }
-
-      if (strcmp (pos->keyword, lower))
-        {
-          addKeyword (&prev, lower, EXTRACTOR_LOWERCASE);
-        }
-      pos = pos->next;
-    }
-  if (lower != NULL)
-    free (lower);
-
-  return prev;
-}

Copied: Extractor/src/plugins/mime_extractor.c (from rev 9738, 
Extractor/src/plugins/mimeextractor.c)
===================================================================
--- Extractor/src/plugins/mime_extractor.c                              (rev 0)
+++ Extractor/src/plugins/mime_extractor.c      2009-12-13 23:02:19 UTC (rev 
9746)
@@ -0,0 +1,320 @@
+/*
+     This file is part of libextractor.
+     (C) 2002, 2003, 2006 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 2, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+ */
+
+#include "platform.h"
+#include "extractor.h"
+
+
+/**
+ * Signature of a secondary check that decides whether a file is of
+ * a given type; EXTRACTOR_mime_extract calls it after the byte prefix
+ * of a pattern-table entry matched.
+ *
+ * @param data the contents of the file
+ * @param len the length of the file
+ * @param arg closure; the value stored next to the detector in the
+ *        pattern-table entry
+ * @return 0 if the file does not match, 1 if it does
+ **/
+typedef int (*Detector) (const char *data, size_t len, void *arg);
+
+/**
+ * Detector that accepts every input.
+ *
+ * @param data the contents of the file (ignored)
+ * @param len the length of the file (ignored)
+ * @param arg closure (ignored)
+ * @return always 1
+ **/
+static int
+defaultDetector (const char *data, size_t len, void *arg)
+{
+  (void) data;
+  (void) len;
+  (void) arg;
+  return 1;
+}
+
+/**
+ * Detector that rejects every input.
+ *
+ * @param data the contents of the file (ignored)
+ * @param len the length of the file (ignored)
+ * @param arg closure (ignored)
+ * @return always 0
+ **/
+static int
+disableDetector (const char *data, size_t len, void *arg)
+{
+  (void) data;
+  (void) len;
+  (void) arg;
+  return 0;
+}
+
+/**
+ * One additional byte sequence that must be present at a fixed
+ * offset of the file (see xPatternMatcher).
+ **/
+typedef struct ExtraPattern
+{
+  int pos;                      /* offset into the file data at which to compare */
+  int len;                      /* number of bytes to compare */
+  char *pattern;                /* bytes to compare against; NULL ends a group */
+} ExtraPattern;
+
+/**
+ * Define special matching rules for complicated formats...
+ *
+ * The table is a sequence of groups; each group ends with a
+ * {0, 0, NULL} entry.  The macro defined in front of a group holds
+ * the index of the group's first entry and is what gets passed to
+ * XPATTERN().  Indices are hard-coded, so they must be renumbered by
+ * hand whenever an entry is inserted.  Note that the last macro is
+ * spelled CR2_PATTERN, without the X used by the others.
+ **/
+static ExtraPattern xpatterns[] = {
+#define AVI_XPATTERN 0
+  {8, 4, "AVI "},
+  {0, 0, NULL},
+#define WAVE_XPATTERN 2
+  {8, 4, "WAVE"},
+  {0, 0, NULL},
+#define ACE_XPATTERN 4
+  {4, 10, "\x00\x00\x90**ACE**"},
+  {0, 0, NULL},
+#define TAR_XPATTERN 6
+  {257, 6, "ustar\x00"},
+  {0, 0, NULL},
+#define GTAR_XPATTERN 8
+  {257, 8, "ustar\040\040\0"},
+  {0, 0, NULL},
+#define RMID_XPATTERN 10
+  {8, 4, "RMID"},
+  {0, 0, NULL},
+#define ACON_XPATTERN 12
+  {8, 4, "ACON"},
+  {0, 0, NULL},
+#define CR2_PATTERN 14
+  {8, 3, "CR\x02"},
+  {0, 0, NULL},
+};
+
+/**
+ * Check a group of extra patterns against the file.  The group
+ * matches only if every ExtraPattern up to the next {0, 0, NULL}
+ * slot matches ("AND" semantics).  OR-ing patterns can be achieved
+ * by using multiple entries in the main table.
+ *
+ * @param data the contents of the file
+ * @param len the length of the file
+ * @param cls pointer to the first ExtraPattern of the group
+ * @return 1 if all patterns of the group match, 0 otherwise
+ **/
+static int
+xPatternMatcher (const char *data, size_t len, void *cls)
+{
+  const ExtraPattern *xp;
+
+  for (xp = cls; NULL != xp->pattern; xp++)
+    {
+      /* the compared window must lie completely inside the file */
+      if (xp->pos + xp->len > len)
+        return 0;
+      if (0 != memcmp (&data[xp->pos], xp->pattern, xp->len))
+        return 0;
+    }
+  return 1;
+}
+
+/**
+ * Detect SVG.  Scans the data for an XML declaration ("<?xml"
+ * followed by whitespace), then for the closing "?>" and finally for
+ * an "<svg" tag followed by whitespace.  The scan gives up at the
+ * first byte for which isprint() is false, or once fewer bytes remain
+ * than the current state needs for its comparison.
+ *
+ * NOTE(review): isprint() is false for newline and tab in the "C"
+ * locale, so input with a line break before the "<svg" tag is
+ * rejected -- confirm whether that is intended.
+ *
+ * @param data the contents of the file
+ * @param len the length of the file
+ * @param cls closure, not used
+ * @return 1 if the data looks like SVG, 0 otherwise
+ */
+static int
+svgMatcher (const char *data, size_t len, void *cls)
+{
+  enum
+  { XMLSTART, XMLCLOSE, SVGSTART } state;
+  size_t i;
+
+  (void) cls;
+  i = 0;
+  state = XMLSTART;
+
+  while (i < len)
+    {
+      /* the <ctype.h> functions need a value representable as
+         unsigned char; plain char may be negative for bytes >= 0x80 */
+      if (!isprint ((unsigned char) data[i]))
+        return 0;
+      switch (state)
+        {
+        case XMLSTART:
+          if (i + 6 >= len)
+            return 0;
+          else if (memcmp (data + i, "<?xml", 5) == 0
+                   && isspace ((unsigned char) data[i + 5]))
+            state = XMLCLOSE;
+          break;
+        case XMLCLOSE:
+          if (i + 2 >= len)
+            return 0;
+          else if (memcmp (data + i, "?>", 2) == 0)
+            state = SVGSTART;
+          break;
+        case SVGSTART:
+          if (i + 5 >= len)
+            return 0;
+          else if (memcmp (data + i, "<svg", 4) == 0
+                   && isspace ((unsigned char) data[i + 4]))
+            return 1;
+          break;
+        default:
+          /* do nothing */
+          break;
+        }
+      i++;
+    }
+  return 0;
+}
+
+/**
+ * Use this detector, if the simple header-prefix matching is
+ * sufficient.  Expands to two initializers (detector, arg) for an
+ * entry of the pattern table.
+ **/
+#define DEFAULT &defaultDetector, NULL
+
+/**
+ * Use this detector, to disable the mime-type (effectively comment it
+ * out).  Expands to two initializers (detector, arg) for an entry of
+ * the pattern table.
+ **/
+#define DISABLED &disableDetector, NULL
+
+/**
+ * Select an entry in xpatterns for matching.  "a" is the index of the
+ * first ExtraPattern of the group to check; expands to two
+ * initializers (detector, arg) for an entry of the pattern table.
+ **/
+#define XPATTERN(a) &xPatternMatcher, &xpatterns[(a)]
+
+/**
+ * Entry of the main mime-type table: a byte prefix plus a detector
+ * that is consulted once the prefix matched.
+ **/
+typedef struct Pattern
+{
+  char *pattern;                /* bytes expected at the start of the file; NULL ends the table */
+  int size;                     /* number of bytes of pattern to compare */
+  char *mimetype;               /* mime type reported on a match */
+  Detector detector;            /* additional check run after the prefix matched */
+  void *arg;                    /* closure passed to detector */
+} Pattern;
+
+/**
+ * Main table of known file types.  EXTRACTOR_mime_extract walks the
+ * table from the top and reports the first entry whose prefix matches
+ * and whose detector accepts the file, so the order of the entries
+ * matters: more specific entries must come before more general ones
+ * that share a prefix.  The entry with a NULL pattern ends the table.
+ **/
+static Pattern patterns[] = {
+  {"\xFF\xD8", 2, "image/jpeg", DEFAULT},
+  {"\211PNG\r\n\032\n", 8, "image/png", DEFAULT},
+  {"/* XPM */", 9, "image/x-xpm", DEFAULT},
+  {"GIF8", 4, "image/gif", DEFAULT},
+  {"P1", 2, "image/x-portable-bitmap", DEFAULT},
+  {"P2", 2, "image/x-portable-graymap", DEFAULT},
+  {"P3", 2, "image/x-portable-pixmap", DEFAULT},
+  {"P4", 2, "image/x-portable-bitmap", DEFAULT},
+  {"P5", 2, "image/x-portable-graymap", DEFAULT},
+  {"P6", 2, "image/x-portable-pixmap", DEFAULT},
+  {"P7", 2, "image/x-portable-anymap", DEFAULT},
+  {"BM", 2, "image/x-bmp", DEFAULT},
+  {"fLaC", 4, "audio/flac", DEFAULT},
+  /* only reached if the full 8-byte PNG signature above did not match */
+  {"\x89PNG", 4, "image/x-png", DEFAULT},
+  {"id=ImageMagick", 14, "application/x-imagemagick-image", DEFAULT},
+  {"hsi1", 4, "image/x-jpeg-proprietary", DEFAULT},
+  {"FLV", 3, "video/x-flv", DEFAULT},
+  {"FWS", 3, "application/x-shockwave-flash", DEFAULT},
+  {"CWS", 3, "application/x-shockwave-flash", DEFAULT},
+  {"\x2E\x52\x4d\x46", 4, "video/real", DEFAULT},
+  {"\x2e\x72\x61\xfd", 4, "audio/real", DEFAULT},
+  {"\x00\x05\x16\x00", 4, "application/applefile", DEFAULT},
+  {"\x00\x05\x16\x07", 4, "application/applefile", DEFAULT},
+  {"\177ELF", 4, "application/x-executable", DEFAULT},
+  /* FIXME: correct MIME-type for an ELF!? */
+  {"\xca\xfe\xba\xbe", 4, "application/java", DEFAULT},
+  /* FIXME: correct MIME for a class-file? */
+  {"gimp xcf", 8, "image/xcf", DEFAULT},
+  /* must stay in front of the little-endian TIFF entry, which shares
+     its first four bytes */
+  {"II\x2a\x00\x10", 5, "image/x-canon-cr2", XPATTERN (CR2_PATTERN)},
+  {"IIN1", 4, "image/tiff", DEFAULT},
+  {"MM\x00\x2a", 4, "image/tiff", DEFAULT},     /* big-endian */
+  {"II\x2a\x00", 4, "image/tiff", DEFAULT},     /* little-endian */
+  {"%PDF", 4, "application/pdf", DEFAULT},
+  {"%!PS-Adobe-", 11, "application/postscript", DEFAULT},
+  {"\004%!PS-Adobe-", 12, "application/postscript", DEFAULT},
+  {"RIFF", 4, "video/x-msvideo", XPATTERN (AVI_XPATTERN)},
+  {"RIFF", 4, "audio/x-wav", XPATTERN (WAVE_XPATTERN)},
+  {"RIFX", 4, "video/x-msvideo", XPATTERN (AVI_XPATTERN)},
+  {"RIFX", 4, "audio/x-wav", XPATTERN (WAVE_XPATTERN)},
+  {"RIFF", 4, "audio/midi", XPATTERN (RMID_XPATTERN)},
+  {"RIFX", 4, "audio/midi", XPATTERN (RMID_XPATTERN)},
+  {"RIFF", 4, "image/x-animated-cursor", XPATTERN (ACON_XPATTERN)},
+  {"RIFX", 4, "image/x-animated-cursor", XPATTERN (ACON_XPATTERN)},
+  {"\211GND\r\n\032\n", 8, "application/gnunet-directory", DEFAULT},
+  {"{\\rtf", 5, "application/rtf", DEFAULT},
+  {"\xf7\x02", 2, "application/x-dvi", DEFAULT},
+  {"\x1F\x8B\x08\x00", 4, "application/x-gzip", DEFAULT},
+  {"BZh91AY&SY", 10, "application/bz2", DEFAULT},
+  {"\xED\xAB\xEE\xDB", 4, "application/x-rpm", DEFAULT},        /* binary */
+  {"!<arch>\ndebian", 14, "application/x-dpkg", DEFAULT},       /* .deb */
+  {"PK\x03\x04", 4, "application/x-zip", DEFAULT},
+  {"\xea\x60", 2, "application/x-arj", DEFAULT},
+  {"\037\235", 2, "application/x-compress", DEFAULT},
+  {"Rar!", 4, "application/x-rar", DEFAULT},
+  /* empty prefix: these three are decided by the extra pattern alone */
+  {"", 0, "application/x-ace", XPATTERN (ACE_XPATTERN)},
+  {"", 0, "application/x-tar", XPATTERN (TAR_XPATTERN)},
+  {"", 0, "application/x-gtar", XPATTERN (GTAR_XPATTERN)},
+  {"-lh0-", 5, "application/x-lha", DEFAULT},
+  {"-lh1-", 5, "application/x-lha", DEFAULT},
+  {"-lh2-", 5, "application/x-lha", DEFAULT},
+  {"-lh3-", 5, "application/x-lha", DEFAULT},
+  {"-lh4-", 5, "application/x-lha", DEFAULT},
+  {"-lh5-", 5, "application/x-lha", DEFAULT},
+  {"-lh6-", 5, "application/x-lha", DEFAULT},
+  {"-lh7-", 5, "application/x-lha", DEFAULT},
+  {"-lhd-", 5, "application/x-lha", DEFAULT},
+  {"-lh\40-", 5, "application/x-lha", DEFAULT},
+  {"-lz4-", 5, "application/x-lha", DEFAULT},
+  {"-lz5-", 5, "application/x-lha", DEFAULT},
+  {"-lzs-", 5, "application/x-lha", DEFAULT},
+  {"\xFD\x76", 2, "application/x-lzh", DEFAULT},
+  {"\x00\x00\x01\xb3", 4, "video/mpeg", DEFAULT},
+  {"\x00\x00\x01\xba", 4, "video/mpeg", DEFAULT},
+  {"moov", 4, "video/quicktime", DEFAULT},
+  {"mdat", 4, "video/quicktime", DEFAULT},
+  {"\x8aMNG", 4, "video/x-mng", DEFAULT},
+  {"\x30\x26\xb2\x75\x8e\x66", 6, "video/asf", DEFAULT},        /* same as .wmv ? */
+  /* duplicate of the "FWS" entry further up; never reached because
+     the first matching entry wins */
+  {"FWS", 3, "application/x-shockwave-flash", DEFAULT},
+  {"MThd", 4, "audio/midi", DEFAULT},
+  {"ID3", 3, "audio/mpeg", DEFAULT},
+  {"\xFF\xFA", 2, "audio/mpeg", DEFAULT},
+  {"\xFF\xFB", 2, "audio/mpeg", DEFAULT},
+  {"\xFF\xFC", 2, "audio/mpeg", DEFAULT},
+  {"\xFF\xFD", 2, "audio/mpeg", DEFAULT},
+  {"\xFF\xFE", 2, "audio/mpeg", DEFAULT},
+  {"\xFF\xFF", 2, "audio/mpeg", DEFAULT},
+  {"OggS", 4, "application/ogg", DEFAULT},
+  {"#!/bin/sh", 9, "application/x-shellscript", DEFAULT},
+  {"#!/bin/bash", 11, "application/x-shellscript", DEFAULT},
+  {"#!/bin/csh", 10, "application/x-shellscript", DEFAULT},
+  {"#!/bin/tcsh", 11, "application/x-shellscript", DEFAULT},
+  {"#!/bin/perl", 11, "application/x-perl", DEFAULT},
+  {"<?xml", 5, "image/svg+xml", svgMatcher, NULL},
+  {NULL, 0, NULL, DISABLED},
+};
+
+
+/**
+ * Determine the mime type of a file from its contents and report it.
+ * The pattern table is searched from the top; the first entry whose
+ * prefix equals the start of the data and whose detector accepts the
+ * file is reported through proc, and the search stops there.
+ *
+ * @param data the contents of the file
+ * @param size the length of the file
+ * @param proc callback invoked with the detected mime type
+ * @param proc_cls closure handed to proc as its first argument
+ * @param options not used by this function
+ * @return the return value of proc if a type was detected, 0 if no
+ *         entry of the table matched
+ **/
+int
+EXTRACTOR_mime_extract (const char *data,
+                       size_t size,
+                       EXTRACTOR_MetaDataProcessor proc,
+                       void *proc_cls,
+                       const char *options)
+{
+  const Pattern *p;
+
+  for (p = patterns; NULL != p->pattern; p++)
+    {
+      /* file too short to contain this prefix */
+      if (size < p->size)
+        continue;
+      if (0 != memcmp (p->pattern, data, p->size))
+        continue;
+      if (!p->detector (data, size, p->arg))
+        continue;
+      return proc (proc_cls,
+                   "mime",
+                   EXTRACTOR_METATYPE_MIMETYPE,
+                   EXTRACTOR_METAFORMAT_UTF8,
+                   "text/plain",
+                   p->mimetype,
+                   strlen (p->mimetype) + 1);
+    }
+  return 0;
+}

Deleted: Extractor/src/plugins/mimeextractor.c
===================================================================
--- Extractor/src/plugins/mimeextractor.c       2009-12-12 20:09:31 UTC (rev 
9745)
+++ Extractor/src/plugins/mimeextractor.c       2009-12-13 23:02:19 UTC (rev 
9746)
@@ -1,333 +0,0 @@
-/*
-     This file is part of libextractor.
-     (C) 2002, 2003, 2006 Vidyut Samanta and Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
- */
-
-#include "platform.h"
-#include "extractor.h"
-
-
-static EXTRACTOR_KeywordList *
-addKeyword (EXTRACTOR_KeywordType type,
-            char *keyword, EXTRACTOR_KeywordList * next)
-{
-  EXTRACTOR_KeywordList *result;
-
-  if (keyword == NULL)
-    return next;
-  result = malloc (sizeof (EXTRACTOR_KeywordList));
-  result->next = next;
-  result->keyword = keyword;
-  result->keywordType = type;
-  return result;
-}
-
-/**
- * Detect a file-type.
- * @param data the contents of the file
- * @param len the length of the file
- * @param arg closure...
- * @return 0 if the file does not match, 1 if it does
- **/
-typedef int (*Detector) (const char *data, size_t len, void *arg);
-
-/**
- * Detect a file-type.
- * @param data the contents of the file
- * @param len the length of the file
- * @return always 1
- **/
-static int
-defaultDetector (const char *data, size_t len, void *arg)
-{
-  return 1;
-}
-
-/**
- * Detect a file-type.
- * @param data the contents of the file
- * @param len the length of the file
- * @return always 0
- **/
-static int
-disableDetector (const char *data, size_t len, void *arg)
-{
-  return 0;
-}
-
-typedef struct ExtraPattern
-{
-  int pos;
-  int len;
-  char *pattern;
-} ExtraPattern;
-
-/**
- * Define special matching rules for complicated formats...
- **/
-static ExtraPattern xpatterns[] = {
-#define AVI_XPATTERN 0
-  {8, 4, "AVI "},
-  {0, 0, NULL},
-#define WAVE_XPATTERN 2
-  {8, 4, "WAVE"},
-  {0, 0, NULL},
-#define ACE_XPATTERN 4
-  {4, 10, "\x00\x00\x90**ACE**"},
-  {0, 0, NULL},
-#define TAR_XPATTERN 6
-  {257, 6, "ustar\x00"},
-  {0, 0, NULL},
-#define GTAR_XPATTERN 8
-  {257, 8, "ustar\040\040\0"},
-  {0, 0, NULL},
-#define RMID_XPATTERN 10
-  {8, 4, "RMID"},
-  {0, 0, NULL},
-#define ACON_XPATTERN 12
-  {8, 4, "ACON"},
-  {0, 0, NULL},
-#define CR2_PATTERN 14
-  {8, 3, "CR\x02"},
-  {0, 0, NULL},
-};
-
-/**
- * Detect AVI. A pattern matches if all XPatterns until the next {0,
- * 0, NULL} slot match. OR-ing patterns can be achieved using multiple
- * entries in the main table, so this "AND" (all match) semantics are
- * the only reasonable answer.
- **/
-static int
-xPatternMatcher (const char *data, size_t len, void *cls)
-{
-  ExtraPattern *arg = cls;
-
-  while (arg->pattern != NULL)
-    {
-      if (arg->pos + arg->len > len)
-        return 0;
-      if (0 != memcmp (&data[arg->pos], arg->pattern, arg->len))
-        return 0;
-      arg++;
-    }
-  return 1;
-}
-
-/**
- * Detect SVG
- */
-static int
-svgMatcher (const char *data, size_t len, void *cls)
-{
-  enum
-  { XMLSTART, XMLCLOSE, SVGSTART } state;
-  size_t i;
-
-  i = 0;
-  state = XMLSTART;
-
-  while (i < len)
-    {
-      if (!isprint (data[i]))
-        return 0;
-      switch (state)
-        {
-        case XMLSTART:
-          if (i + 6 >= len)
-            return 0;
-          else if (memcmp (data + i, "<?xml", 5) == 0
-                   && isspace (*(data + i + 5)))
-            state = XMLCLOSE;
-          break;
-        case XMLCLOSE:
-          if (i + 2 >= len)
-            return 0;
-          else if (memcmp (data + i, "?>", 2) == 0)
-            state = SVGSTART;
-          break;
-        case SVGSTART:
-          if (i + 5 >= len)
-            return 0;
-          else if (memcmp (data + i, "<svg", 4) == 0
-                   && isspace (*(data + i + 4)))
-            return 1;
-          break;
-        default:
-          /* do nothing */
-          break;
-        }
-      i++;
-    }
-  return 0;
-}
-
-/**
- * Use this detector, if the simple header-prefix matching is
- * sufficient.
- **/
-#define DEFAULT &defaultDetector, NULL
-
-/**
- * Use this detector, to disable the mime-type (effectively comment it
- * out).
- **/
-#define DISABLED &disableDetector, NULL
-
-/**
- * Select an entry in xpatterns for matching
- **/
-#define XPATTERN(a) &xPatternMatcher, &xpatterns[(a)]
-
-typedef struct Pattern
-{
-  char *pattern;
-  int size;
-  char *mimetype;
-  Detector detector;
-  void *arg;
-} Pattern;
-
-static Pattern patterns[] = {
-  {"\xFF\xD8", 2, "image/jpeg", DEFAULT},
-  {"\211PNG\r\n\032\n", 8, "image/png", DEFAULT},
-  {"/* XPM */", 9, "image/x-xpm", DEFAULT},
-  {"GIF8", 4, "image/gif", DEFAULT},
-  {"P1", 2, "image/x-portable-bitmap", DEFAULT},
-  {"P2", 2, "image/x-portable-graymap", DEFAULT},
-  {"P3", 2, "image/x-portable-pixmap", DEFAULT},
-  {"P4", 2, "image/x-portable-bitmap", DEFAULT},
-  {"P5", 2, "image/x-portable-graymap", DEFAULT},
-  {"P6", 2, "image/x-portable-pixmap", DEFAULT},
-  {"P7", 2, "image/x-portable-anymap", DEFAULT},
-  {"BM", 2, "image/x-bmp", DEFAULT},
-  {"fLaC", 4, "audio/flac", DEFAULT},
-  {"\x89PNG", 4, "image/x-png", DEFAULT},
-  {"id=ImageMagick", 14, "application/x-imagemagick-image", DEFAULT},
-  {"hsi1", 4, "image/x-jpeg-proprietary", DEFAULT},
-  {"FLV", 3, "video/x-flv", DEFAULT},
-  {"FWS", 3, "application/x-shockwave-flash", DEFAULT},
-  {"CWS", 3, "application/x-shockwave-flash", DEFAULT},
-  {"\x2E\x52\x4d\x46", 4, "video/real", DEFAULT},
-  {"\x2e\x72\x61\xfd", 4, "audio/real", DEFAULT},
-  {"\x00\x05\x16\x00", 4, "application/applefile", DEFAULT},
-  {"\x00\x05\x16\x07", 4, "application/applefile", DEFAULT},
-  {"\177ELF", 4, "application/x-executable", DEFAULT},
-  /* FIXME: correct MIME-type for an ELF!? */
-  {"\xca\xfe\xba\xbe", 4, "application/java", DEFAULT},
-  /* FIXME: correct MIME for a class-file? */
-  {"gimp xcf", 8, "image/xcf", DEFAULT},
-  {"II\x2a\x00\x10", 5, "image/x-canon-cr2", XPATTERN (CR2_PATTERN)},
-  {"IIN1", 4, "image/tiff", DEFAULT},
-  {"MM\x00\x2a", 4, "image/tiff", DEFAULT},     /* big-endian */
-  {"II\x2a\x00", 4, "image/tiff", DEFAULT},     /* little-endian */
-  {"%PDF", 4, "application/pdf", DEFAULT},
-  {"%!PS-Adobe-", 11, "application/postscript", DEFAULT},
-  {"\004%!PS-Adobe-", 12, "application/postscript", DEFAULT},
-  {"RIFF", 4, "video/x-msvideo", XPATTERN (AVI_XPATTERN)},
-  {"RIFF", 4, "audio/x-wav", XPATTERN (WAVE_XPATTERN)},
-  {"RIFX", 4, "video/x-msvideo", XPATTERN (AVI_XPATTERN)},
-  {"RIFX", 4, "audio/x-wav", XPATTERN (WAVE_XPATTERN)},
-  {"RIFF", 4, "audio/midi", XPATTERN (RMID_XPATTERN)},
-  {"RIFX", 4, "audio/midi", XPATTERN (RMID_XPATTERN)},
-  {"RIFF", 4, "image/x-animated-cursor", XPATTERN (ACON_XPATTERN)},
-  {"RIFX", 4, "image/x-animated-cursor", XPATTERN (ACON_XPATTERN)},
-  {"\211GND\r\n\032\n", 8, "application/gnunet-directory", DEFAULT},
-  {"{\\rtf", 5, "application/rtf", DEFAULT},
-  {"\xf7\x02", 2, "application/x-dvi", DEFAULT},
-  {"\x1F\x8B\x08\x00", 4, "application/x-gzip", DEFAULT},
-  {"BZh91AY&SY", 10, "application/bz2", DEFAULT},
-  {"\xED\xAB\xEE\xDB", 4, "application/x-rpm", DEFAULT},        /* binary */
-  {"!<arch>\ndebian", 14, "application/x-dpkg", DEFAULT},       /* .deb */
-  {"PK\x03\x04", 4, "application/x-zip", DEFAULT},
-  {"\xea\x60", 2, "application/x-arj", DEFAULT},
-  {"\037\235", 2, "application/x-compress", DEFAULT},
-  {"Rar!", 4, "application/x-rar", DEFAULT},
-  {"", 0, "application/x-ace", XPATTERN (ACE_XPATTERN)},
-  {"", 0, "application/x-tar", XPATTERN (TAR_XPATTERN)},
-  {"", 0, "application/x-gtar", XPATTERN (GTAR_XPATTERN)},
-  {"-lh0-", 5, "application/x-lha", DEFAULT},
-  {"-lh1-", 5, "application/x-lha", DEFAULT},
-  {"-lh2-", 5, "application/x-lha", DEFAULT},
-  {"-lh3-", 5, "application/x-lha", DEFAULT},
-  {"-lh4-", 5, "application/x-lha", DEFAULT},
-  {"-lh5-", 5, "application/x-lha", DEFAULT},
-  {"-lh6-", 5, "application/x-lha", DEFAULT},
-  {"-lh7-", 5, "application/x-lha", DEFAULT},
-  {"-lhd-", 5, "application/x-lha", DEFAULT},
-  {"-lh\40-", 5, "application/x-lha", DEFAULT},
-  {"-lz4-", 5, "application/x-lha", DEFAULT},
-  {"-lz5-", 5, "application/x-lha", DEFAULT},
-  {"-lzs-", 5, "application/x-lha", DEFAULT},
-  {"\xFD\x76", 2, "application/x-lzh", DEFAULT},
-  {"\x00\x00\x01\xb3", 4, "video/mpeg", DEFAULT},
-  {"\x00\x00\x01\xba", 4, "video/mpeg", DEFAULT},
-  {"moov", 4, "video/quicktime", DEFAULT},
-  {"mdat", 4, "video/quicktime", DEFAULT},
-  {"\x8aMNG", 4, "video/x-mng", DEFAULT},
-  {"\x30\x26\xb2\x75\x8e\x66", 6, "video/asf", DEFAULT},        /* same as .wmv ? */
-  {"FWS", 3, "application/x-shockwave-flash", DEFAULT},
-  {"MThd", 4, "audio/midi", DEFAULT},
-  {"ID3", 3, "audio/mpeg", DEFAULT},
-  {"\xFF\xFA", 2, "audio/mpeg", DEFAULT},
-  {"\xFF\xFB", 2, "audio/mpeg", DEFAULT},
-  {"\xFF\xFC", 2, "audio/mpeg", DEFAULT},
-  {"\xFF\xFD", 2, "audio/mpeg", DEFAULT},
-  {"\xFF\xFE", 2, "audio/mpeg", DEFAULT},
-  {"\xFF\xFF", 2, "audio/mpeg", DEFAULT},
-  {"OggS", 4, "application/ogg", DEFAULT},
-  {"#!/bin/sh", 9, "application/x-shellscript", DEFAULT},
-  {"#!/bin/bash", 11, "application/x-shellscript", DEFAULT},
-  {"#!/bin/csh", 10, "application/x-shellscript", DEFAULT},
-  {"#!/bin/tcsh", 11, "application/x-shellscript", DEFAULT},
-  {"#!/bin/perl", 11, "application/x-perl", DEFAULT},
-  {"<?xml", 5, "image/svg+xml", svgMatcher, NULL},
-  {NULL, 0, NULL, DISABLED},
-};
-
-struct EXTRACTOR_Keywords *
-libextractor_mime_extract (const char *filename,
-                           const char *data,
-                           size_t size, struct EXTRACTOR_Keywords *prev)
-{
-  int i;
-  const char *mime;
-
-  mime = EXTRACTOR_extractLast (EXTRACTOR_MIMETYPE, prev);
-  if (mime != NULL)
-    return prev;                /* if the mime-type has already
-                                   been determined, there is no need
-                                   to probe again (and potentially be wrong...) */
-  i = 0;
-  while (patterns[i].pattern != NULL)
-    {
-      if (size < patterns[i].size)
-        {
-          i++;
-          continue;
-        }
-      if (0 == memcmp (patterns[i].pattern, data, patterns[i].size))
-        {
-          if (patterns[i].detector (data, size, patterns[i].arg))
-            return addKeyword (EXTRACTOR_MIMETYPE,
-                               strdup (patterns[i].mimetype), prev);
-        }
-      i++;
-    }
-  return prev;
-}

Deleted: Extractor/src/plugins/splitextractor.c
===================================================================
--- Extractor/src/plugins/splitextractor.c      2009-12-12 20:09:31 UTC (rev 9745)
+++ Extractor/src/plugins/splitextractor.c      2009-12-13 23:02:19 UTC (rev 9746)
@@ -1,157 +0,0 @@
-/*
-     This file is part of libextractor.
-     (C) 2002, 2003, 2005, 2006 Vidyut Samanta and Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
- */
-
-#include "platform.h"
-#include "extractor.h"
-
-/**
- * Default split characters.
- */
-static const char *TOKENIZERS = "._ ,address@hidden(){}";
-
-/**
- * Do not use keywords shorter than this minimum
- * length.
- */
-static int MINIMUM_KEYWORD_LENGTH = 3;
-
-static void
-addKeyword (struct EXTRACTOR_Keywords **list, const char *keyword)
-{
-  EXTRACTOR_KeywordList *next;
-  next = malloc (sizeof (EXTRACTOR_KeywordList));
-  next->next = *list;
-  next->keyword = strdup (keyword);
-  next->keywordType = EXTRACTOR_SPLIT;
-  *list = next;
-}
-
-static int
-token (char letter, const char *options)
-{
-  size_t i;
-
-  i = 0;
-  while (options[i] != '\0')
-    {
-      if (letter == options[i])
-        return 1;
-      i++;
-    }
-  return 0;
-}
-
-static void
-splitKeywords (const char *keyword,
-               struct EXTRACTOR_Keywords **list, const char *options)
-{
-  char *dp;
-  size_t pos;
-  size_t last;
-  size_t len;
-
-  dp = strdup (keyword);
-  len = strlen (dp);
-  pos = 0;
-  last = 0;
-  while (pos < len)
-    {
-      while ((0 == token (dp[pos], options)) && (pos < len))
-        pos++;
-      dp[pos++] = '\0';
-      if ((pos - last > MINIMUM_KEYWORD_LENGTH) &&
-          (0 != strcmp (keyword, &dp[last])))
-        addKeyword (list, &dp[last]);
-      while ((pos < len) && (1 == token (dp[pos], options)))
-        pos++;
-      last = pos;
-    }
-  free (dp);
-}
-
-/* split other keywords into multiple keywords */
-struct EXTRACTOR_Keywords *
-libextractor_split_extract (const char *filename,
-                            const char *data,
-                            size_t size,
-                            struct EXTRACTOR_Keywords *prev,
-                            const char *options)
-{
-  struct EXTRACTOR_Keywords *kpos;
-  char *opt;
-  char *pos;
-
-  if (options == NULL)
-    {
-      opt = strdup (TOKENIZERS);
-    }
-  else
-    {
-      opt = strdup (options);
-      pos = opt;
-      while (pos[0] != '\0')
-        {
-          if (pos[0] == '\\')
-            {
-              switch (pos[1])
-                {
-                case 'n':
-                  pos[0] = '\n';
-                  memmove (&pos[1], &pos[2], strlen (&pos[2]));
-                  continue;
-                case 'r':
-                  pos[0] = '\r';
-                  memmove (&pos[1], &pos[2], strlen (&pos[2]));
-                  continue;
-                case 'b':
-                  pos[0] = '\b';
-                  memmove (&pos[1], &pos[2], strlen (&pos[2]));
-                  continue;
-                case 't':
-                  pos[0] = '\t';
-                  memmove (&pos[1], &pos[2], strlen (&pos[2]));
-                  continue;
-                case '\\':
-                  memmove (&pos[1], &pos[2], strlen (&pos[2]));
-                  continue;
-                case '\0':     /* invalid escape, ignore */
-                  pos[0] = '\0';
-                  break;
-                default:       /* invalid escape, skip */
-                  memmove (&pos[0], &pos[2], strlen (&pos[2]));
-                  continue;
-                }
-            }
-          pos++;
-        }
-    }
-  kpos = prev;
-  while (kpos != NULL)
-    {
-      if (kpos->keywordType != EXTRACTOR_FILE_SIZE)
-        splitKeywords (kpos->keyword, &prev, opt);
-
-      kpos = kpos->next;
-    }
-  free (opt);
-  return prev;
-}
-
-/* end of splitextractor.c */
[Prev in Thread]
Current Thread
[Next in Thread]
[GNUnet-SVN] r9746 - in Extractor: . doc src/include src/main src/plugins, gnunet <=
Prev by Date: [GNUnet-SVN] r9745 - in GNUnet/src/applications: dv/module dv_dht/module
Next by Date: [GNUnet-SVN] r9750 - in Extractor/src: include main plugins
Previous by thread: [GNUnet-SVN] r9745 - in GNUnet/src/applications: dv/module dv_dht/module
Next by thread: [GNUnet-SVN] r9750 - in Extractor/src: include main plugins
Index(es):
- Date
- Thread