pspp-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[encodings 2/4] line-reader: New library for reading a file line-by-line


From: Ben Pfaff
Subject: [encodings 2/4] line-reader: New library for reading a file line-by-line.
Date: Tue, 19 Jun 2012 23:11:35 -0700

This library reads a file line-by-line in an arbitrary 8-bit or wider
encoding, without requiring the file to be recoded.  This will be used
in an upcoming commit.
---
 src/libpspp/automake.mk          |    2 +
 src/libpspp/line-reader.c        |  350 ++++++++++++++++++++++++++++++++++++++
 src/libpspp/line-reader.h        |   54 ++++++
 tests/automake.mk                |    5 +
 tests/libpspp/line-reader-test.c |  130 ++++++++++++++
 tests/libpspp/line-reader.at     |   74 ++++++++
 6 files changed, 615 insertions(+), 0 deletions(-)
 create mode 100644 src/libpspp/line-reader.c
 create mode 100644 src/libpspp/line-reader.h
 create mode 100644 tests/libpspp/line-reader-test.c
 create mode 100644 tests/libpspp/line-reader.at

diff --git a/src/libpspp/automake.mk b/src/libpspp/automake.mk
index 244f1d1..2f81243 100644
--- a/src/libpspp/automake.mk
+++ b/src/libpspp/automake.mk
@@ -44,6 +44,8 @@ src_libpspp_liblibpspp_la_SOURCES = \
        src/libpspp/integer-format.h \
        src/libpspp/intern.c \
        src/libpspp/intern.h \
+       src/libpspp/line-reader.c \
+       src/libpspp/line-reader.h \
        src/libpspp/ll.c \
        src/libpspp/ll.h \
        src/libpspp/llx.c \
diff --git a/src/libpspp/line-reader.c b/src/libpspp/line-reader.c
new file mode 100644
index 0000000..6f90b50
--- /dev/null
+++ b/src/libpspp/line-reader.c
@@ -0,0 +1,350 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "line-reader.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "libpspp/assertion.h"
+#include "libpspp/encoding-guesser.h"
+#include "libpspp/i18n.h"
+#include "libpspp/str.h"
+
+#include "gl/minmax.h"
+#include "gl/xalloc.h"
+
+enum line_reader_state          /* Scan mode used by line_reader_read(). */
+  {
+    S_UNIBYTE,                  /* Known stream encoding, 1-byte unit. */
+    S_MULTIBYTE,                /* Known stream encoding, multibyte unit. */
+    S_AUTO                      /* Encoding autodetection in progress. */
+  };
+
+struct line_reader              /* State for reading one fd line by line. */
+  {
+    int fd;                     /* File descriptor that input is read from. */
+    enum line_reader_state state; /* Current scan mode. */
+    struct encoding_info encoding_info; /* Unit size, CR and LF byte forms. */
+
+    char *encoding;             /* Current encoding. */
+    char *auto_encoding;        /* In S_AUTO mode, user-specified encoding. */
+
+    char *buffer;               /* LINE_READER_BUFFER_SIZE bytes of input. */
+    char *head;                 /* First unconsumed byte in BUFFER. */
+    size_t length;              /* Number of unconsumed bytes at HEAD. */
+
+    int error;                  /* errno from a failed read(), else 0. */
+    bool eof;                   /* Set once read() has returned 0. */
+  };
+
+static ssize_t fill_buffer (struct line_reader *); /* Reads more input. */
+
+/* Opens FILENAME, which is encoded in ENCODING, for reading line by line,
+   passing FLAGS to the open() function.  Returns a new line_reader if
+   successful, otherwise returns NULL and sets errno to an appropriate value.
+
+   The accepted forms for ENCODING are listed at the top of
+   encoding-guesser.h. */
+struct line_reader *
+line_reader_for_file (const char *encoding, const char *filename, int flags)
+{
+  struct line_reader *r;
+  int fd;
+
+  assert (!(flags & O_CREAT));  /* open() below is given no mode argument. */
+
+  fd = open (filename, flags);
+  if (fd < 0)
+    return NULL;                /* errno was set by open(). */
+
+  r = line_reader_for_fd (encoding, fd);
+  if (r == NULL)
+    {
+      int save_errno = errno;   /* close() may overwrite errno. */
+      close (fd);
+      errno = save_errno;
+    }
+
+  return r;
+}
+
+/* Creates and returns a new line_reader that reads its input from FD.  Returns
+   a new line_reader if successful, otherwise returns NULL and sets errno to an
+   appropriate value.
+
+   The accepted forms for ENCODING are listed at the top of
+   encoding-guesser.h. */
+struct line_reader *
+line_reader_for_fd (const char *encoding, int fd)
+{
+  struct line_reader *r;
+
+  r = calloc (1, sizeof *r);    /* Zero-filled: error is 0 and eof is false. */
+  if (r == NULL)
+    return NULL;
+
+  r->fd = fd;
+  r->buffer = malloc (LINE_READER_BUFFER_SIZE);
+  if (r->buffer == NULL)
+    goto error;
+  r->head = r->buffer;
+  r->length = 0;
+
+  if (fill_buffer (r) < 0)      /* Read the head that the guesser inspects. */
+    goto error;
+
+  r->encoding = xstrdup (encoding_guess_head_encoding (
+                           encoding, r->buffer, r->length));
+  if (!get_encoding_info (&r->encoding_info, r->encoding))
+    {
+      errno = EINVAL;
+      goto error;
+    }
+
+  if (encoding_guess_encoding_is_auto (encoding)
+      && !strcmp (r->encoding, "ASCII"))
+    {
+      r->state = S_AUTO;        /* Head guessed as ASCII: keep detecting. */
+      r->auto_encoding = xstrdup (encoding);
+    }
+  else
+    r->state = r->encoding_info.unit == 1 ? S_UNIBYTE : S_MULTIBYTE;
+
+  return r;
+
+error:          /* NOTE(review): confirm free() below cannot clobber errno. */
+  line_reader_free (r);         /* Does not close FD; caller still owns it. */
+  return NULL;
+}
+
+/* Closes R and its underlying file descriptor and frees all associated
+   resources.  Returns the return value from close(). */
+int
+line_reader_close (struct line_reader *r)
+{
+  if (r != NULL)
+    {
+      int fd = r->fd;           /* Saved because R is freed before close(). */
+      line_reader_free (r);
+      return close (fd);
+    }
+  return 0;                     /* Null R: nothing to free or close. */
+}
+
+/* Frees R and associated resources, but does not close the underlying file
+   descriptor.  (Thus, the client must close the file descriptor when it is no
+   longer needed.) */
+void
+line_reader_free (struct line_reader *r)
+{
+  if (r != NULL)                /* A null R is accepted and ignored. */
+    {
+      free (r->buffer);
+      free (r->encoding);
+      free (r->auto_encoding);
+      free (r);
+    }
+}
+
+static ssize_t                  /* Returns read()'s result: >0, 0, or <0. */
+fill_buffer (struct line_reader *r) /* Compacts R's buffer, then reads more. */
+{
+  ssize_t n;
+
+  /* Move any unused bytes to the beginning of the input buffer. */
+  if (r->length > 0 && r->buffer != r->head)
+    memmove (r->buffer, r->head, r->length);
+  r->head = r->buffer;
+
+  /* Read more input. */
+  do
+    {
+      n = read (r->fd, r->buffer + r->length,
+                LINE_READER_BUFFER_SIZE - r->length);
+    }
+  while (n < 0 && errno == EINTR);      /* Retry if a signal interrupted. */
+  if (n > 0)
+    r->length += n;
+  else if (n < 0)
+    r->error = errno;
+  else
+    r->eof = true;      /* NOTE(review): also hit if the buffer was full. */
+  return n;
+}
+
+static void                     /* Appends N bytes at R's head to S... */
+output_bytes (struct line_reader *r, struct string *s, size_t n)
+{
+  ds_put_substring (s, ss_buffer (r->head, n));
+  r->head += n;                 /* ...and marks them as consumed. */
+  r->length -= n;
+}
+
+static void                     /* Emits an N-byte line and eats its LF. */
+output_line (struct line_reader *r, struct string *s, size_t n)
+{
+  int unit = r->encoding_info.unit;
+
+  output_bytes (r, s, n);
+
+  r->head += unit;              /* Skip the new-line without copying it. */
+  r->length -= unit;
+
+  ds_chomp (s, ss_buffer (r->encoding_info.cr, unit)); /* Drop trailing CR. */
+}
+
+/* Reads a line of text, but no more than MAX_LENGTH bytes, from R and appends
+   it to S, omitting the final new-line and the carriage return that
+   immediately precedes it, if one is present.  The line is left in its
+   original encoding.
+
+   Returns true if anything was successfully read from the file.  (If an empty
+   line was read, then nothing is appended to S.)  Returns false if end of file
+   was reached or a read error occurred before any text could be read. */
+bool
+line_reader_read (struct line_reader *r, struct string *s, size_t max_length)
+{
+  size_t original_length = ds_length (s);
+  int unit = r->encoding_info.unit;
+
+  do
+    {
+      size_t max_out = max_length - (ds_length (s) - original_length);
+      size_t max_in = r->length;
+      size_t max = MIN (max_in, max_out);       /* Bytes scannable this pass. */
+      size_t n;
+      char *p;
+
+      if (max_out < unit)       /* No room left for another code unit. */
+        break;
+
+      switch (r->state)
+        {
+        case S_UNIBYTE:
+          p = memchr (r->head, r->encoding_info.lf[0], max);
+          if (p != NULL)
+            {
+              output_line (r, s, p - r->head);
+              return true;
+            }
+          n = max;              /* No new-line found: emit everything. */
+          break;
+
+        case S_MULTIBYTE:
+          for (n = 0; n + unit <= max; n += unit) /* Whole code units only. */
+            if (!memcmp (r->head + n, r->encoding_info.lf, unit))
+              {
+                output_line (r, s, n);
+                return true;
+              }
+          break;
+
+        case S_AUTO:
+          for (n = 0; n < max; n++)
+            if (!encoding_guess_is_ascii_text (r->head[n]))
+              {
+                char *encoding; /* First non-ASCII byte: settle encoding. */
+
+                output_bytes (r, s, n);
+                fill_buffer (r);        /* Result is ignored here. */
+                r->state = S_UNIBYTE;
+
+                encoding = xstrdup (encoding_guess_tail_encoding (
+                                      r->auto_encoding, r->head, r->length));
+                free (r->encoding);     /* Old name string becomes invalid. */
+                r->encoding = encoding;
+
+                free (r->auto_encoding);
+                r->auto_encoding = NULL;
+
+                n = 0;          /* The ASCII prefix was emitted above. */
+                break;
+              }
+            else if (r->head[n] == '\n')
+              {
+                output_line (r, s, n);
+                return true;
+              }
+          break;
+
+        default:
+          NOT_REACHED ();
+        }
+
+      output_bytes (r, s, n);
+    }
+  while (r->length >= unit || fill_buffer (r) > 0); /* Refill when drained. */
+
+  return ds_length (s) > original_length;
+}
+
+/* Returns the file descriptor that R reads from; R keeps using it. */
+int
+line_reader_fileno (const struct line_reader *r)
+{
+  return r->fd;
+}
+
+/* Returns the offset in the file of the next byte to be read from R, or -1 on
+   error (e.g. if the file is not seekable). */
+off_t
+line_reader_tell (const struct line_reader *r)
+{
+  off_t pos = lseek (r->fd, 0, SEEK_CUR);       /* Where read() has reached. */
+  if (pos >= 0)         /* NOTE(review): below mixes off_t and size_t. */
+    pos = MAX (0, pos - r->length);     /* Back up over unconsumed bytes. */
+  return pos;
+}
+
+/* Returns true if end of file has been encountered reading R. */
+bool
+line_reader_eof (const struct line_reader *r)
+{
+  return r->eof && !r->length;  /* False while buffered bytes remain. */
+}
+
+/* Returns a nonzero errno value if an error has been encountered reading
+   R, zero otherwise. */
+int
+line_reader_error (const struct line_reader *r)
+{
+  return !r->length ? r->error : 0;     /* Zero while buffered bytes remain. */
+}
+
+/* Returns the encoding of R.  If line_reader_is_auto(R) returns true, the
+   encoding might change as more lines are read. */
+const char *
+line_reader_get_encoding (const struct line_reader *r)
+{
+  return r->encoding;           /* Owned by R; a later read may free it. */
+}
+
+/* Returns true if the encoding of the file being read by R is not yet
+   completely known.  If this function returns true, then the encoding returned
+   by line_reader_get_encoding() might change as more lines are read (and after
+   the change, this function will return false). */
+bool
+line_reader_is_auto (const struct line_reader *r)
+{
+  return r->state == S_AUTO;    /* Left at the first non-ASCII input byte. */
+}
diff --git a/src/libpspp/line-reader.h b/src/libpspp/line-reader.h
new file mode 100644
index 0000000..e9b2f50
--- /dev/null
+++ b/src/libpspp/line-reader.h
@@ -0,0 +1,54 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef LIBPSPP_LINE_READER_H
+#define LIBPSPP_LINE_READER_H 1
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+/* line_reader.
+
+   Reads a text file in an arbitrary encoding one line at a time, with
+   optional automatic encoding detection.
+*/
+
+#define LINE_READER_BUFFER_SIZE 4096 /* Bytes in a reader's input buffer. */
+
+struct string;
+
+struct line_reader *line_reader_for_fd (const char *encoding, int fd);
+struct line_reader *line_reader_for_file (const char *encoding,
+                                          const char *filename, int flags);
+
+int line_reader_close (struct line_reader *);  /* Frees and closes the fd. */
+void line_reader_free (struct line_reader *);  /* Frees; leaves the fd open. */
+
+bool line_reader_read (struct line_reader *, struct string *,
+                       size_t max_length);
+
+int line_reader_fileno (const struct line_reader *);
+off_t line_reader_tell (const struct line_reader *);
+
+bool line_reader_eof (const struct line_reader *);
+int line_reader_error (const struct line_reader *);
+
+const char *line_reader_get_encoding (const struct line_reader *);
+
+bool line_reader_is_auto (const struct line_reader *);
+
+#endif /* libpspp/line-reader.h */
diff --git a/tests/automake.mk b/tests/automake.mk
index 0af3d1e..b8e4c2d 100644
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -14,6 +14,7 @@ check_PROGRAMS += \
        tests/libpspp/hmap-test \
        tests/libpspp/hmapx-test \
        tests/libpspp/i18n-test \
+       tests/libpspp/line-reader-test \
        tests/libpspp/ll-test \
        tests/libpspp/llx-test \
        tests/libpspp/range-map-test \
@@ -43,6 +44,9 @@ tests_data_sack_SOURCES = \
 tests_data_sack_LDADD = src/libpspp-core.la 
 tests_data_sack_CFLAGS = $(AM_CFLAGS)
 
+tests_libpspp_line_reader_test_SOURCES = tests/libpspp/line-reader-test.c
+tests_libpspp_line_reader_test_LDADD = src/libpspp/liblibpspp.la gl/libgl.la
+
 tests_libpspp_ll_test_SOURCES = \
        src/libpspp/ll.c \
        tests/libpspp/ll-test.c
@@ -320,6 +324,7 @@ TESTSUITE_AT = \
        tests/libpspp/hmap.at \
        tests/libpspp/hmapx.at \
        tests/libpspp/i18n.at \
+       tests/libpspp/line-reader.at \
        tests/libpspp/ll.at \
        tests/libpspp/llx.at \
        tests/libpspp/range-map.at \
diff --git a/tests/libpspp/line-reader-test.c b/tests/libpspp/line-reader-test.c
new file mode 100644
index 0000000..fef9eb6
--- /dev/null
+++ b/tests/libpspp/line-reader-test.c
@@ -0,0 +1,130 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "libpspp/line-reader.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "libpspp/i18n.h"
+#include "libpspp/str.h"
+
+#include "gl/error.h"
+#include "gl/progname.h"
+#include "gl/xalloc.h"
+
+static void                     /* Prints the command summary, then exits 0. */
+usage (void)
+{
+  printf ("usage: %s COMMAND [ARG]...\n"
+          "The available commands are:\n"
+          "  help\n"
+          "    print this usage message\n"
+          "  buffer-size\n"
+          "    print the buffer size, in bytes, on stdout\n"
+          "  read FILE ENCODING\n"
+          "    read FILE encoded in ENCODING and print it in UTF-8\n",
+          program_name);
+  exit (0);
+}
+
+static void                     /* Runs the `read FILE ENCODING' command. */
+cmd_read (int argc, char *argv[])
+{
+  struct line_reader *r;
+  const char *filename;
+  struct string line;
+  char *encoding;
+
+  if (argc != 4)
+    error (1, 0, "bad syntax for `%s' command; use `%s help' for help",
+           argv[1], program_name);
+
+  filename = argv[2];
+
+  r = (!strcmp(filename, "-")           /* A lone dash selects stdin. */
+       ? line_reader_for_fd (argv[3], STDIN_FILENO)
+       : line_reader_for_file (argv[3], filename, O_RDONLY));
+  if (r == NULL)
+    error (1, errno, "line_reader_open failed");
+
+  encoding = xstrdup (line_reader_get_encoding (r)); /* R may free its copy. */
+  printf ("encoded in %s", encoding);
+  if (line_reader_is_auto (r))
+    printf (" (auto)");
+  printf ("\n");
+
+  ds_init_empty (&line);
+  while (line_reader_read (r, &line, SIZE_MAX)) /* No length limit. */
+    {
+      const char *new_encoding;
+      char *utf8_line;
+
+      new_encoding = line_reader_get_encoding (r);
+      if (strcmp (encoding, new_encoding))      /* Detection just settled. */
+        {
+          free (encoding);
+          encoding = xstrdup (new_encoding);
+
+          printf ("encoded in %s", encoding);
+          if (line_reader_is_auto (r))
+            printf (" (auto)");
+          printf ("\n");
+        }
+
+      utf8_line = recode_string ("UTF-8", encoding,
+                                 ds_data (&line), ds_length (&line));
+      printf ("\"%s\"\n", utf8_line);
+      free (utf8_line);
+
+      ds_clear (&line);         /* line_reader_read() appends, so reset. */
+    }
+
+  if (!strcmp(filename, "-"))
+    line_reader_free (r);               /* Leave stdin open. */
+  else
+    {
+      if (line_reader_close (r) != 0)
+        error (1, errno, "line_reader_close failed");
+    }
+}       /* NOTE(review): LINE and ENCODING are not released before return. */
+
+int
+main (int argc, char *argv[])   /* Dispatches on the command in argv[1]. */
+{
+  set_program_name (argv[0]);
+  i18n_init ();
+
+  if (argc < 2)
+    error (1, 0, "missing command name; use `%s help' for help", program_name);
+  else if (!strcmp(argv[1], "help") || !strcmp(argv[1], "--help"))
+    usage ();
+  else if (!strcmp(argv[1], "buffer-size"))
+    printf ("%d\n", LINE_READER_BUFFER_SIZE);   /* Used by line-reader.at. */
+  else if (!strcmp(argv[1], "read"))
+    cmd_read (argc, argv);
+  else
+    error (1, 0, "unknown command `%s'; use `%s help' for help",
+           argv[1], program_name);
+
+  return 0;
+}
diff --git a/tests/libpspp/line-reader.at b/tests/libpspp/line-reader.at
new file mode 100644
index 0000000..29cff4d
--- /dev/null
+++ b/tests/libpspp/line-reader.at
@@ -0,0 +1,74 @@
+AT_BANNER([line_reader])
+
+AT_SETUP([read ASCII])
+AT_KEYWORDS([line_reader])
+AT_CHECK([i18n-test supports_encodings ASCII])
+AT_CHECK([echo string | line-reader-test read - ASCII], [0], [dnl
+encoded in ASCII
+"string"
+])
+AT_CLEANUP
+
+AT_SETUP([read UTF-8])
+AT_KEYWORDS([line_reader])
+AT_CHECK([printf '\346\227\245\346\234\254\350\252\236\n' | line-reader-test read - UTF-8], [0], [dnl
+encoded in UTF-8
+"日本語"
+])
+AT_CLEANUP
+
+AT_SETUP([read EUC-JP])
+AT_KEYWORDS([line_reader])
+AT_CHECK([i18n-test supports_encodings EUC-JP])
+AT_CHECK([printf '\244\241 \244\242 \244\243 \244\244 \244\245 \244\246 \244\247 \244\250 \244\251 \244\252\n' | line-reader-test read - EUC-JP], [0], [dnl
+encoded in EUC-JP
+"ぁ あ ぃ い ぅ う ぇ え ぉ お"
+])
+AT_CLEANUP
+
+AT_SETUP([read ASCII as Auto])
+AT_KEYWORDS([line_reader])
+AT_CHECK([echo string | line-reader-test read - Auto], [0], [dnl
+encoded in ASCII (auto)
+"string"
+])
+AT_CLEANUP
+
+AT_SETUP([read UTF-8 as Auto])
+AT_KEYWORDS([line_reader])
+AT_CHECK([printf 'entr\303\251e\n' | line-reader-test read - Auto], [0], [dnl
+encoded in ASCII (auto)
+encoded in UTF-8
+"entrée"
+])
+AT_CLEANUP
+
+AT_SETUP([read ISO-8859-1 as Auto,ISO-8859-1])
+AT_KEYWORDS([line_reader])
+AT_CHECK([i18n-test supports_encodings ISO-8859-1])
+buffer_size=`line-reader-test buffer-size`
+($PERL -e "print 'x' x ($buffer_size - 2)"
+ printf '\none line\ntwo lines\nentr\351e\nfour lines\n') > input
+(printf 'encoded in ASCII (auto)\n\"'
+ $PERL -e "print 'x' x ($buffer_size - 2)"
+ printf '\"\n"one line"\n"two lines"\nencoded in ISO-8859-1\n"entr\303\251e"\n"four lines"\n') > expout
+AT_CHECK([line-reader-test read input Auto,ISO-8859-1], [0], [expout])
+AT_CLEANUP
+
+AT_SETUP([read UTF-16BE as Auto,UTF-16BE])
+AT_KEYWORDS([line_reader])
+AT_CHECK([i18n-test supports_encodings UTF-16BE])
+AT_CHECK([printf '\0e\0n\0t\0r\0\351\0e\0\n' | line-reader-test read - Auto,UTF-16BE],
+  [0], [encoded in UTF-16BE
+"entrée"
+])
+AT_CLEANUP
+
+AT_SETUP([read EUC-JP as Auto,EUC-JP])
+AT_KEYWORDS([line_reader])
+AT_CHECK([i18n-test supports_encodings EUC-JP])
+AT_CHECK([printf 'entr\217\253\261e\n' | line-reader-test read - Auto,EUC-JP],
+  [0], [encoded in EUC-JP
+"entrée"
+])
+AT_CLEANUP
-- 
1.7.2.5




reply via email to

[Prev in Thread] Current Thread [Next in Thread]