gnunet-svn
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[GNUnet-SVN] r36956 - in Extractor: . src/include src/plugins


From: gnunet
Subject: [GNUnet-SVN] r36956 - in Extractor: . src/include src/plugins
Date: Sat, 26 Mar 2016 16:26:31 +0100

Author: grothoff
Date: 2016-03-26 16:26:31 +0100 (Sat, 26 Mar 2016)
New Revision: 36956

Added:
   Extractor/src/plugins/pdf_extractor.c
Modified:
   Extractor/ChangeLog
   Extractor/src/include/extractor.h
   Extractor/src/plugins/Makefile.am
Log:
simple hack for PDF support

Modified: Extractor/ChangeLog
===================================================================
--- Extractor/ChangeLog 2016-03-26 00:38:26 UTC (rev 36955)
+++ Extractor/ChangeLog 2016-03-26 15:26:31 UTC (rev 36956)
@@ -1,3 +1,7 @@
+Sat Mar 26 16:23:56 CET 2016
+       Adding PDF support using pdfinfo.
+       Likely conflicts with Apparmor. -CG
+
 Mon Aug 31 19:19:17 CEST 2015
        Adding apparmor support. -jmorvan/CG
 

Modified: Extractor/src/include/extractor.h
===================================================================
--- Extractor/src/include/extractor.h   2016-03-26 00:38:26 UTC (rev 36955)
+++ Extractor/src/include/extractor.h   2016-03-26 15:26:31 UTC (rev 36956)
@@ -35,7 +35,7 @@
  * 0.2.6-1 => 0x00020601
  * 4.5.2-0 => 0x04050200
  */
-#define EXTRACTOR_VERSION 0x01030001
+#define EXTRACTOR_VERSION 0x01030002
 
 #include <stdio.h>
 
@@ -383,7 +383,7 @@
     EXTRACTOR_METATYPE_AUDIO_DURATION = 226,
     EXTRACTOR_METATYPE_SUBTITLE_DURATION = 227,
 
-       EXTRACTOR_METATYPE_AUDIO_PREVIEW = 228,
+    EXTRACTOR_METATYPE_AUDIO_PREVIEW = 228,
 
     EXTRACTOR_METATYPE_LAST = 229
   };
@@ -443,13 +443,14 @@
  * @param data_len number of bytes in @a data
  * @return 0 to continue extracting, 1 to abort
  */
-typedef int (*EXTRACTOR_MetaDataProcessor) (void *cls,
-                                           const char *plugin_name,
-                                           enum EXTRACTOR_MetaType type,
-                                           enum EXTRACTOR_MetaFormat format,
-                                           const char *data_mime_type,
-                                           const char *data,
-                                           size_t data_len);
+typedef int
+(*EXTRACTOR_MetaDataProcessor) (void *cls,
+                                const char *plugin_name,
+                                enum EXTRACTOR_MetaType type,
+                                enum EXTRACTOR_MetaFormat format,
+                                const char *data_mime_type,
+                                const char *data,
+                                size_t data_len);
 
 
 /**
@@ -519,7 +520,8 @@
  *
  * @param ec extraction context provided to the plugin
  */
-typedef void (*EXTRACTOR_extract_method) (struct EXTRACTOR_ExtractContext *ec);
+typedef void
+(*EXTRACTOR_extract_method) (struct EXTRACTOR_ExtractContext *ec);
 
 
 /**

Modified: Extractor/src/plugins/Makefile.am
===================================================================
--- Extractor/src/plugins/Makefile.am   2016-03-26 00:38:26 UTC (rev 36955)
+++ Extractor/src/plugins/Makefile.am   2016-03-26 15:26:31 UTC (rev 36956)
@@ -160,6 +160,9 @@
 TEST_OGG=test_ogg
 endif
 
+if ! WINDOWS
+PLUGIN_PDF=libextractor_pdf.la
+endif
 
 if HAVE_ZLIB
 PLUGIN_ZLIB= \
@@ -198,6 +201,7 @@
   $(PLUGIN_MP4) \
   $(PLUGIN_MPEG) \
   $(PLUGIN_OGG) \
+  $(PLUGIN_PDF) \
   $(PLUGIN_PREVIEWOPUS) \
   $(PLUGIN_RPM) \
   $(PLUGIN_TIFF) \
@@ -524,6 +528,14 @@
   $(top_builddir)/src/plugins/libtest.la
 
 
+libextractor_pdf_la_SOURCES = \
+  pdf_extractor.c
+libextractor_pdf_la_LDFLAGS = \
+  $(PLUGINFLAGS)
+libextractor_pdf_la_LIBADD = \
+  $(top_builddir)/src/common/libextractor_common.la $(XLIB)  $(SOCKET_LIBS)
+
+
 libextractor_png_la_SOURCES = \
   png_extractor.c
 libextractor_png_la_LDFLAGS = \

Added: Extractor/src/plugins/pdf_extractor.c
===================================================================
--- Extractor/src/plugins/pdf_extractor.c                               (rev 0)
+++ Extractor/src/plugins/pdf_extractor.c       2016-03-26 15:26:31 UTC (rev 36956)
@@ -0,0 +1,229 @@
+/*
+     This file is part of libextractor.
+     Copyright (C) 2016 Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 3, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+     Boston, MA 02110-1301, USA.
+ */
+/**
+ * @file plugins/pdf_extractor.c
+ * @brief plugin to support PDF files
+ * @author Christian Grothoff
+ *
+ * PDF libraries today are a nightmare (TM).  So instead of doing the
+ * fast thing and calling some library functions to parse the PDF,
+ * we execute 'pdfinfo' and parse the output. Because that's 21st
+ * century plumbing: nobody writes reasonable code anymore.
+ */
+#include "platform.h"
+#include <extractor.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <signal.h>
+#include <unistd.h>
+
+/**
+ * Entry in the mapping from keys in the output of 'pdfinfo' to LE types.
+ */
+struct Matches
+{
+  /**
+   * Key as it appears before the colon in a line of 'pdfinfo' output.
+   */
+  const char *text;
+
+  /**
+   * Corresponding type in LE.
+   */
+  enum EXTRACTOR_MetaType type;
+};
+
+
+/**
+ * Map from the keys printed by 'pdfinfo' to LE types.
+ * The table is terminated by an entry whose text is NULL and
+ * is never modified at runtime, hence const.
+ *
+ * See output of 'pdfinfo'.
+ */
+static const struct Matches tmap[] = {
+  {"Title",        EXTRACTOR_METATYPE_TITLE},
+  {"Subject",      EXTRACTOR_METATYPE_SUBJECT},
+  {"Keywords",     EXTRACTOR_METATYPE_KEYWORDS},
+  {"Author",       EXTRACTOR_METATYPE_AUTHOR_NAME},
+  {"Creator",      EXTRACTOR_METATYPE_CREATOR},
+  {"Producer",     EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE},
+  {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE},
+  {"ModDate",      EXTRACTOR_METATYPE_MODIFICATION_DATE},
+  {"PDF version",  EXTRACTOR_METATYPE_ENCODER_VERSION},
+  {"Pages",        EXTRACTOR_METATYPE_PAGE_COUNT},
+  {NULL, 0}
+};
+
+
+/**
+ * Process the "stdout" stream of pdfinfo.
+ *
+ * Reads the stream line by line.  Each line is expected to have the
+ * form "Key:   value"; the key is looked up (case-insensitively, but
+ * as a whole) in #tmap and, if found, the value is passed to @a proc
+ * as 0-terminated UTF-8 text.  Lines with an empty value are skipped.
+ * Processing stops at end of stream, at the first line without a
+ * colon, or as soon as @a proc returns non-zero.
+ *
+ * @param fout stdout of pdfinfo
+ * @param proc function to call with meta data
+ * @param proc_cls closure for @a proc
+ */
+static void
+process_stdout (FILE *fout,
+                EXTRACTOR_MetaDataProcessor proc,
+                void *proc_cls)
+{
+  char line[1025];
+
+  while (NULL != fgets (line, sizeof (line), fout))
+    {
+      size_t len = strlen (line);
+      size_t key_len;
+      const char *colon;
+      const char *value;
+      unsigned int i;
+
+      if (0 == len)
+        continue;
+      if ('\n' == line[len - 1])
+        {
+          line[--len] = '\0';
+        }
+      else
+        {
+          int c;
+
+          /* Line did not fit into the buffer (or the stream ended
+             without a newline): keep the truncated prefix, but discard
+             the remainder so that it is not mistaken for a new line. */
+          do
+            c = getc (fout);
+          while ( (EOF != c) && ('\n' != c) );
+        }
+      colon = strchr (line, (int) ':');
+      if (NULL == colon)
+        break;
+      key_len = (size_t) (colon - line);
+      value = colon + 1;
+      /* <ctype.h> functions require a value representable as unsigned char */
+      while (isblank ((unsigned char) value[0]))
+        value++;
+      if ('\0' == value[0])
+        continue;
+      for (i = 0; NULL != tmap[i].text; i++)
+        {
+          /* Require the lengths to agree: comparing only 'key_len' bytes
+             would let a key that is merely a prefix of a table entry
+             (including the empty key) match. */
+          if ( (strlen (tmap[i].text) != key_len) ||
+               (0 != strncasecmp (line,
+                                  tmap[i].text,
+                                  key_len)) )
+            continue;
+          if (0 != proc (proc_cls,
+                         "pdf",
+                         tmap[i].type,
+                         EXTRACTOR_METAFORMAT_UTF8,
+                         "text/plain",
+                         value,
+                         strlen (value) + 1))
+            return;
+          break;
+        }
+    }
+}
+
+
+/**
+ * Main entry method for the PDF extraction plugin.
+ *
+ * Checks that the input starts with the "%PDF" magic, then forks a
+ * child running "pdfinfo -" with two pipes attached: the file contents
+ * are written to the child's stdin and the child's stdout is parsed by
+ * process_stdout().  The child is always killed and reaped before
+ * this function returns.
+ *
+ * NOTE(review): the complete input is written to the child before any
+ * of its output is read; this relies on pdfinfo not filling the output
+ * pipe before it has consumed its input -- confirm.
+ * NOTE(review): if the child exits early (e.g. 'pdfinfo' is not
+ * installed), write() on the pipe raises SIGPIPE unless the process
+ * ignores or blocks that signal; the signal disposition is deliberately
+ * not touched here.
+ *
+ * @param ec extraction context provided to the plugin
+ */
+void
+EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec)
+{
+  uint64_t fsize;
+  void *data;
+  pid_t pid;
+  int in[2];
+  int out[2];
+  FILE *fout;
+  uint64_t pos;
+
+  fsize = ec->get_size (ec->cls);
+  if (fsize < 128)
+    return;
+  if (4 !=
+      ec->read (ec->cls, &data, 4))
+    return;
+  if (0 != strncmp ("%PDF", data, 4))
+    return;
+  if (0 !=
+      ec->seek (ec->cls, 0, SEEK_SET))
+    return;
+  if (0 != pipe (in))
+    return;
+  if (0 != pipe (out))
+    {
+      close (in[0]);
+      close (in[1]);
+      return;
+    }
+  pid = fork ();
+  if (-1 == pid)
+    {
+      close (in[0]);
+      close (in[1]);
+      close (out[0]);
+      close (out[1]);
+      return;
+    }
+  if (0 == pid)
+    {
+      char *const args[] = {
+        "pdfinfo",
+        "-",
+        NULL
+      };
+
+      /* am child, exec 'pdfinfo'; dup2() closes the old stdin/stdout
+         itself, so no explicit close (0) / close (1) is needed */
+      if ( (-1 == dup2 (in[0], 0)) ||
+           (-1 == dup2 (out[1], 1)) )
+        _exit (1);
+      close (in[0]);
+      close (in[1]);
+      close (out[0]);
+      close (out[1]);
+      execvp ("pdfinfo", args);
+      /* exec failed: use _exit() so that the atexit handlers and stdio
+         buffers inherited from the parent are not run/flushed here */
+      _exit (1);
+    }
+  /* am parent, send file */
+  close (in[0]);
+  close (out[1]);
+  fout = fdopen (out[0], "r");
+  if (NULL == fout)
+    {
+      /* cannot read the child's output; clean up and give up */
+      close (in[1]);
+      close (out[0]);
+      kill (pid, SIGKILL);
+      waitpid (pid, NULL, 0);
+      return;
+    }
+
+  pos = 0;
+  while (pos < fsize)
+    {
+      ssize_t got;
+      size_t wpos;
+
+      data = NULL;
+      got = ec->read (ec->cls,
+                      &data,
+                      fsize - pos);
+      /* treat "no data" like an error: continuing with got == 0
+         would never advance 'pos' and loop forever */
+      if ( (got <= 0) ||
+           (NULL == data) )
+        break;
+      wpos = 0;
+      while (wpos < (size_t) got)
+        {
+          ssize_t wrote;
+
+          wrote = write (in[1],
+                         (const char *) data + wpos,
+                         (size_t) got - wpos);
+          if (wrote <= 0)
+            break;
+          wpos += (size_t) wrote;
+        }
+      if (wpos < (size_t) got)
+        break;
+      pos += (uint64_t) got;
+    }
+  /* closing the write end signals end-of-file to the child */
+  close (in[1]);
+  process_stdout (fout, ec->proc, ec->cls);
+  fclose (fout);
+  kill (pid, SIGKILL);
+  waitpid (pid, NULL, 0);
+}
+
+/* end of pdf_extractor.c */




reply via email to

[Prev in Thread] Current Thread [Next in Thread]