gnunet-svn
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[GNUnet-SVN] r9769 - in Extractor: . src/include src/main src/plugins


From: gnunet
Subject: [GNUnet-SVN] r9769 - in Extractor: . src/include src/main src/plugins
Date: Tue, 15 Dec 2009 17:40:48 +0100

Author: grothoff
Date: 2009-12-15 17:40:48 +0100 (Tue, 15 Dec 2009)
New Revision: 9769

Added:
   Extractor/src/plugins/pdf_extractor.cc
Removed:
   Extractor/src/plugins/pdf/
Modified:
   Extractor/configure.ac
   Extractor/src/include/extractor.h
   Extractor/src/main/extractor_metatypes.c
   Extractor/src/plugins/Makefile.am
   Extractor/src/plugins/dvi_extractor.c
Log:
pdf

Modified: Extractor/configure.ac
===================================================================
--- Extractor/configure.ac      2009-12-15 16:35:34 UTC (rev 9768)
+++ Extractor/configure.ac      2009-12-15 16:40:48 UTC (rev 9769)
@@ -217,7 +217,14 @@
          AM_CONDITIONAL(HAVE_MPEG2, false))],
          AM_CONDITIONAL(HAVE_MPEG2, false))
 
+AC_CHECK_LIB(poppler, _ZTI9MemStream,
+        [AC_CHECK_HEADERS([poppler/goo/gmem.h],
+           AM_CONDITIONAL(HAVE_POPPLER, true)
+           AC_DEFINE(HAVE_POPPLER,1,[Have libpoppler]),
+         AM_CONDITIONAL(HAVE_POPPLER, false))],
+         AM_CONDITIONAL(HAVE_POPPLER, false))
 
+
 # restore LIBS
 LIBS=$LIBSOLD
 
@@ -309,7 +316,6 @@
 fi
 
 # check for all C++ dependencies...
-xpdf=0
 exiv2=0
 qt=0
 qt4=0
@@ -398,21 +404,6 @@
               EXT_LIB_PATH="-L$with_qt/lib $EXT_LIB_PATH"
               qt_svg=1)))])
 
-AC_MSG_CHECKING([whether to enable xpdf-based extractor])
-AC_ARG_ENABLE(xpdf,
- [AC_HELP_STRING([--enable-xpdf],[Enable xpdf-based extractor])
-  AC_HELP_STRING([--disable-xpdf],[Disable xpdf-based extractor])],
- [case "$enableval" in
-  no)  AC_MSG_RESULT(no)
-        xpdf=0
-       ;;
-  *)    AC_MSG_RESULT(yes)
-        xpdf=1
-       ;;
-  esac],
-  [     AC_MSG_RESULT(no)
-        xpdf=0])
-
 exiv2=1
 AC_MSG_CHECKING([whether to enable exiv2 extractor])
 AC_ARG_ENABLE(exiv2,
@@ -455,8 +446,6 @@
 AM_CONDITIONAL(HAVE_QT_SVG, test x$qt_svg != x0)
 AM_CONDITIONAL(HAVE_QT_SVG4, test x$qt_svg4 != x0)
 
-AM_CONDITIONAL(HAVE_XPDF, test x$xpdf != x0)
-
 AM_CONDITIONAL(HAVE_EXIV2, test x$exiv2 != x0)
 AC_DEFINE_UNQUOTED([HAVE_EXIV2], $exiv2, [We use EXIV2])
 
@@ -569,7 +558,6 @@
 src/plugins/Makefile
 src/plugins/ole2/Makefile
 src/plugins/oo/Makefile
-src/plugins/pdf/Makefile
 src/plugins/printable/Makefile
 src/plugins/hash/Makefile
 src/plugins/thumbnail/Makefile
@@ -627,13 +615,6 @@
 
 AC_OUTPUT
 
-if test "x$xpdf" = "x1"
-then
- AC_MSG_NOTICE([NOTICE: xpdf enabled (xpdf has a bad security record)])
-else 
- AC_MSG_NOTICE([NOTICE: xpdf disabled (result: limited PDF support)])
-fi
-
 if test "x$exiv2" = "x0"
 then
  AC_MSG_NOTICE([NOTICE: exiv2 disabled])
@@ -679,6 +660,11 @@
  AC_MSG_NOTICE([NOTICE: libmpeg2 not found (will not compile mpeg2 plugin)])
 fi
 
+if test "x$HAVE_POPPLER_TRUE" = "x#"
+then
+ AC_MSG_NOTICE([NOTICE: libpoppler not found (will not compile pdf plugin)])
+fi
+
 if test "x$HAVE_CXX" != "xyes"
 then
  AC_MSG_NOTICE([NOTICE: no C++ compiler found (not compiling plugins that require C++)])

Modified: Extractor/src/include/extractor.h
===================================================================
--- Extractor/src/include/extractor.h   2009-12-15 16:35:34 UTC (rev 9768)
+++ Extractor/src/include/extractor.h   2009-12-15 16:40:48 UTC (rev 9769)
@@ -237,6 +237,10 @@
 
     /* image specifics */
     EXTRACTOR_METATYPE_IMAGE_DIMENSIONS = 112, 
+
+
+    EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE = 113, 
+
     
     /* fixme: used up to here! */
     EXTRACTOR_METATYPE_THUMBNAIL_DATA = 70,

Modified: Extractor/src/main/extractor_metatypes.c
===================================================================
--- Extractor/src/main/extractor_metatypes.c    2009-12-15 16:35:34 UTC (rev 9768)
+++ Extractor/src/main/extractor_metatypes.c    2009-12-15 16:40:48 UTC (rev 9769)
@@ -282,10 +282,13 @@
     gettext_noop ("") }, 
   { gettext_noop ("image dimensions"),
     gettext_noop ("") }, 
+  { gettext_noop ("produced by software"),
+    gettext_noop ("") }, /* what is the exact difference between the software
+                           creator and the software producer? PDF and DVI
+                           both have this distinction (i.e., Writer vs.
+                           OpenOffice) */
   { gettext_noop (""),
     gettext_noop ("") }, 
-  { gettext_noop (""),
-    gettext_noop ("") }, 
 #if 0
   
   gettext_noop("author"),

Modified: Extractor/src/plugins/Makefile.am
===================================================================
--- Extractor/src/plugins/Makefile.am   2009-12-15 16:35:34 UTC (rev 9768)
+++ Extractor/src/plugins/Makefile.am   2009-12-15 16:40:48 UTC (rev 9769)
@@ -46,22 +46,19 @@
 if HAVE_EXIV2
  exiv2=libextractor_exiv2.la
 endif
+if HAVE_POPPLER
+ pdf=libextractor_pdf.la
 endif
+endif
 
 
-if HAVE_XPDF
- xpdfdir=pdf
-else
- pdfplugin=libextractor_pdf.la 
-endif
-
 if HAVE_MPEG2
  extrampeg = libextractor_mpeg.la
 endif
 
 # toggle for development
 SUBDIRS = . 
-# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash $(oledir) $(xpdfdir)
+# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash $(oledir)
 
 
 if HAVE_VORBISFILE
@@ -95,6 +92,7 @@
   libextractor_html.la \
   libextractor_it.la \
   libextractor_mime.la \
+  $(pdf) \
   $(rpm)
 
 libextractor_applefile_la_SOURCES = \
@@ -148,6 +146,19 @@
 libextractor_it_la_LDFLAGS = \
   $(PLUGINFLAGS)
 
+libextractor_mime_la_SOURCES = \
+  mime_extractor.c 
+libextractor_mime_la_LDFLAGS = \
+  $(PLUGINFLAGS)
+
+libextractor_pdf_la_SOURCES = \
+  pdf_extractor.cc 
+libextractor_pdf_la_LDFLAGS = \
+  $(XTRA_CPPLIBS) $(PLUGINFLAGS) 
+libextractor_pdf_la_LIBADD = \
+  $(top_builddir)/src/common/libextractor_common.la \
+  -lpoppler
+
 libextractor_rpm_la_SOURCES = \
   rpm_extractor.c 
 libextractor_rpm_la_LDFLAGS = \
@@ -155,10 +166,6 @@
 libextractor_rpm_la_LIBADD = \
   -lrpm
 
-libextractor_mime_la_SOURCES = \
-  mime_extractor.c 
-libextractor_mime_la_LDFLAGS = \
-  $(PLUGINFLAGS)
 
 
 
@@ -236,13 +243,6 @@
 libextractor_wav_la_LIBADD = \
   $(LE_LIBINTL)
 
-libextractor_pdf_la_SOURCES = \
-  pdfextractor.c
-libextractor_pdf_la_LDFLAGS = \
-  $(PLUGINFLAGS)
-libextractor_pdf_la_LIBADD = \
-  $(top_builddir)/src/common/libextractor_common.la
-
 libextractor_mp3_la_SOURCES = \
   mp3extractor.c 
 libextractor_mp3_la_LDFLAGS = \

Modified: Extractor/src/plugins/dvi_extractor.c
===================================================================
--- Extractor/src/plugins/dvi_extractor.c       2009-12-15 16:35:34 UTC (rev 9768)
+++ Extractor/src/plugins/dvi_extractor.c       2009-12-15 16:40:48 UTC (rev 9769)
@@ -33,7 +33,7 @@
   {"/Author (",   EXTRACTOR_METATYPE_AUTHOR_NAME},
   {"/Keywords (", EXTRACTOR_METATYPE_KEYWORDS},
   {"/Creator (",  EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
-  {"/Producer (", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE},
+  {"/Producer (", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE},
   {NULL, 0},
 };
 

Added: Extractor/src/plugins/pdf_extractor.cc
===================================================================
--- Extractor/src/plugins/pdf_extractor.cc                              (rev 0)
+++ Extractor/src/plugins/pdf_extractor.cc      2009-12-15 16:40:48 UTC (rev 9769)
@@ -0,0 +1,216 @@
+/*
+     This file is part of libextractor.
+     (C) 2002, 2003 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 2, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+
+     This code was inspired by pdfinfo and depends heavily
+     on the xpdf code that pdfinfo is a part of. See also
+     the INFO file in this directory.
+ */
+
+#include "platform.h"
+#include "extractor.h"
+#include "convert.h"
+#include <math.h>
+
+#include <poppler/goo/gmem.h>
+#include <poppler/Object.h>
+#include <poppler/Stream.h>
+#include <poppler/Array.h>
+#include <poppler/Dict.h>
+#include <poppler/XRef.h>
+#include <poppler/Catalog.h>
+#include <poppler/Page.h>
+#include <poppler/PDFDoc.h>
+#include <poppler/Error.h>
+#include <poppler/goo/GooString.h>
+
+#define ADD(s, type) do { if (0!=proc(proc_cls, "pdf", type, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) { err = 1; goto EXIT; }} while (0)
+
+static int 
+printInfoString(Dict *infoDict,
+               const char *key,
+               enum EXTRACTOR_MetaType type,
+               EXTRACTOR_MetaDataProcessor proc,
+               void *proc_cls)
+{
+  Object obj;
+  GooString *s1;
+  const char * s;
+  char *ckey = strdup (key);
+  int err = 0;
+  char * result;
+      
+  result = NULL;
+  if (infoDict->lookup(ckey, &obj)->isString()) {
+    s1 = obj.getString();
+    s = s1->getCString();
+    if ((((unsigned char)s[0]) & 0xff) == 0xfe &&
+       (((unsigned char)s[1]) & 0xff) == 0xff) {
+      result = EXTRACTOR_common_convert_to_utf8(&s[2], s1->getLength() - 2, "UTF-16BE");
+      ADD (result, type);
+    } else {
+      size_t len = strlen(s);
+      
+      while(0 < len) {
+        /*
+         * Avoid outputting trailing spaces.
+         *
+         * The following expression might be rewritten as
+         * (! isspace(s[len - 1]) && 0xA0 != s[len - 1]).
+         * There seem to exist isspace() implementations
+         * which do return non-zero from NBSP (maybe locale-dependent).
+         * Remove ISO-8859 non-breaking space (NBSP, hex value 0xA0) from
+         * the expression if it looks suspicious (locale issues for instance).
+         *
+         * Squeezing out all non-printable characters might also be useful.
+         */
+          if ( (' '  != s[len - 1]) && ((char)0xA0 != s[len - 1]) &&
+               ('\r' != s[len - 1]) && ('\n' != s[len - 1]) &&
+               ('\t' != s[len - 1]) && ('\v' != s[len - 1]) &&
+               ('\f' != s[len - 1]) )
+             break;
+
+          else
+            len --;
+        }
+
+        /* there should be a check to truncate preposterously long values. */
+      
+      if (0 < len) {
+       result = EXTRACTOR_common_convert_to_utf8(s, len,
+                                                 "ISO-8859-1");
+       ADD (result, type);
+      }
+    }
+  }
+ EXIT:
+  obj.free();
+  free (result);
+  free (ckey);
+  return err;
+}
+
+static int 
+printInfoDate(Dict *infoDict,
+             const char *key,
+             enum EXTRACTOR_MetaType type,
+             EXTRACTOR_MetaDataProcessor proc,
+             void *proc_cls)
+{
+  Object obj;
+  const char *s;
+  GooString *s1;  
+  char *gkey;
+  char * result;
+  int err;
+  
+  err = 0;
+  result = NULL;
+  gkey = strdup (key);
+  if (infoDict->lookup(gkey, &obj)->isString()) {
+    s1 = obj.getString();
+    s = s1->getCString();
+    
+    if ((s1->getChar(0) & 0xff) == 0xfe &&
+       (s1->getChar(1) & 0xff) == 0xff) {
+      /* isUnicode */
+      
+      result = EXTRACTOR_common_convert_to_utf8((const char*)&s[2], s1->getLength() - 2, "UTF-16BE");
+      ADD (result, type);
+    } else {
+      if (s[0] == 'D' && s[1] == ':') 
+       s += 2;
+      
+      ADD (s, type);
+    }
+    /* printf(fmt, s);*/
+  }
+ EXIT:
+  obj.free();
+  free (result);
+  free (gkey);
+  return err;
+}
+
+#define PIS(s,t) do { if (0 != (err = printInfoString (info.getDict(), s, t, proc, proc_cls))) goto EXIT; } while (0)
+
+#define PID(s,t) do { if (0 != (err = printInfoDate (info.getDict(), s, t, proc, proc_cls))) goto EXIT; } while (0)
+
+extern "C" {
+ 
+
+  int 
+  EXTRACTOR_pdf_extract (const char *data,
+                        size_t size,
+                        EXTRACTOR_MetaDataProcessor proc,
+                        void *proc_cls,
+                        const char *options)
+  {
+    PDFDoc * doc;
+    Object info;
+    Object obj;
+    BaseStream * stream;
+    int err;
+
+    /* errorInit();   -- keep commented out, otherwise errors are printed to stderr for non-pdf files! */
+    obj.initNull();
+    err = 0;
+    stream = new MemStream( (char*) data, 0, size, &obj);
+    doc = new PDFDoc(stream, NULL, NULL);
+    if (! doc->isOk()) {
+      delete doc;
+      return 0;
+    }
+
+    ADD ("application/pdf",
+        EXTRACTOR_METATYPE_MIMETYPE);
+    if ( (NULL != doc->getDocInfo(&info)) &&
+        (info.isDict()) ) {
+      PIS ("Title", EXTRACTOR_METATYPE_TITLE);
+      PIS ("Subject", EXTRACTOR_METATYPE_SUBJECT);
+      PIS ("Keywords", EXTRACTOR_METATYPE_KEYWORDS);
+      PIS ("Author", EXTRACTOR_METATYPE_AUTHOR_NAME);
+      /*
+       * we now believe that Adobe's Creator is not a person nor an
+       * organisation, but just a piece of software.
+       */
+      PIS ("Creator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE);
+      PIS ("Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE);
+      {
+       char pcnt[20];
+       sprintf(pcnt, "%d", doc->getNumPages());
+       ADD (pcnt, EXTRACTOR_METATYPE_PAGE_COUNT);
+      }
+      {
+       char pcnt[64];
+       sprintf(pcnt, "PDF %d.%d", 
+               doc->getPDFMajorVersion(),
+               doc->getPDFMinorVersion());
+       ADD (pcnt, EXTRACTOR_METATYPE_FORMAT);
+      }
+      PID ("CreationDate", EXTRACTOR_METATYPE_CREATION_DATE);
+      PID ("ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE);
+    }
+  EXIT:
+    info.free();
+    delete doc;
+
+    return err;
+  }
+}
+





reply via email to

[Prev in Thread] Current Thread [Next in Thread]