bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: join suggestion: auto-output-format


From: Assaf Gordon
Subject: Re: join suggestion: auto-output-format
Date: Mon, 09 Nov 2009 13:05:34 -0500
User-agent: Mozilla-Thunderbird 2.0.0.22 (X11/20090707)

Hello,

Here's an improved version of the 'auto-output-format' feature for join.
Includes code, tests, NEWS and documentation.

The patch is attached below, and is also available here:
http://cancan.cshl.edu/labmembers/gordon/coreutils8/join_auto_format.patch

See email below for a use-case example.

Thanks,  
  -gordon

Assaf Gordon wrote, On 11/04/2009 08:36 PM:
> 
> This feature allows join to automatically guess the output format
> without specifying '-o', allowing easier use (IMHO) of "-e". This is
> mostly a convenience, DWIM kind of feature.
> Here is a simple use case:
> 
> $ cat 1.txt
> 1 alice
> 2 bob
> 4 dave
> 
> $ cat 2.txt
> 1 red
> 2 green
> 3 blue
> 
> Joining with "-a 1 -a 2" will display the third and fourth items without
> proper field 'fillers':
> 
> $ join -j1 -a1 -a2   1.txt 2.txt
> 1 alice red
> 2 bob   green
> 3 blue
> 4 dave
> 
> This behavior is of course by design.
> If one needs the empty columns to be filled, it requires both "-e" and
> "-o", and to use "-o" properly, one needs to know beforehand the columns
> in the input files:
> 
> $ join -j1 -a1 -a2 -e FOO -o 0,1.2,2.2   1.txt   2.txt
> 1 alice red
> 2 bob   green
> 3 FOO   blue
> 4 dave  FOO
> 
> If there are many columns in the input fields, writing the proper "-o"
> format string is cumbersome.
> 
> I suggest a simple feature:
> When adding "--auto-format" argument, join will automatically generate
> an output format (simulating "-o"), by putting the joined field first,
> followed by all the fields from file1, followed by all fields from file2.
> (This feature assumes the number of columns in the first lines
> represents the number of columns in all lines).
> This allows using "-e" without specifying "-o", as so:
> 
> $ join -j1 -a1 -a2 -e FOO --auto-format   1.txt   2.txt
> 1 alice red
> 2 bob   green
> 3 FOO   blue
> 4 dave  FOO
> 

 NEWS               |    3 +++
 doc/coreutils.texi |   10 ++++++++++
 src/join.c         |   36 +++++++++++++++++++++++++++++++++++-
 tests/misc/join    |   21 +++++++++++++++++++++
 4 files changed, 69 insertions(+), 1 deletions(-)

diff --git a/NEWS b/NEWS
index 5b75dbb..8655faa 100644
--- a/NEWS
+++ b/NEWS
@@ -77,6 +77,9 @@ GNU coreutils NEWS                                    -*- outline -*-
   touch now accepts the option --no-dereference (-h), as a means to
   change symlink timestamps on platforms with enough support.
 
+  join now accepts the option --auto-format (-F), to automatically
+  detect the output format without requiring explicit -o.
+
 
 * Noteworthy changes in release 8.0 (2009-10-06) [beta]
 
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 227014c..f692f47 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -5512,6 +5512,16 @@ Do not check that both input files are in sorted order.  This is the default.
 Replace those output fields that are missing in the input with
 @var{string}.
 
+@item -F
+@itemx --auto-format
+@opindex -F
+@opindex --auto-format
+Automatically detects output format based on the number of fields in the
+first line of each input file (as if the user explicitly specified @samp{-o}).
+Allows using @samp{-e} without a-priori knowledge of the fields in the input files.
+The join field is printed first, followed by the remaining fields from the first
+file and the second file.
+
 @item -i
 @itemx --ignore-case
 @opindex -i
diff --git a/src/join.c b/src/join.c
index d734a91..07112eb 100644
--- a/src/join.c
+++ b/src/join.c
@@ -146,6 +146,7 @@ static struct option const longopts[] =
   {"ignore-case", no_argument, NULL, 'i'},
   {"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
   {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
+  {"auto-format", no_argument, NULL, 'F'},
   {GETOPT_HELP_OPTION_DECL},
   {GETOPT_VERSION_OPTION_DECL},
   {NULL, 0, NULL, 0}
@@ -157,6 +158,12 @@ static struct line uni_blank;
 /* If nonzero, ignore case when comparing join fields.  */
 static bool ignore_case;
 
+/* if nonzero, automatically build a specific output field list,
+   based on the first line of each input file */
+static bool auto_output_format;
+
+static void build_output_format(struct line const *line1, struct line const* line2);
+
 void
 usage (int status)
 {
@@ -191,6 +198,8 @@ by whitespace.  When FILE1 or FILE2 (not both) is -, read standard input.\n\
   --check-order     check that the input is correctly sorted, even\n\
                       if all input lines are pairable\n\
   --nocheck-order   do not check that the input is correctly sorted\n\
+  -F, --auto-format  Automatically build output format, based on the first\n\
+                    line of each input file. Allows '-e' without using '-o'.\n\
 "), stdout);
       fputs (HELP_OPTION_DESCRIPTION, stdout);
       fputs (VERSION_OPTION_DESCRIPTION, stdout);
@@ -616,6 +625,9 @@ join (FILE *fp1, FILE *fp2)
   initseq (&seq2);
   getseq (fp2, &seq2, 2);
 
+  if (auto_output_format && seq1.count && seq2.count)
+    build_output_format(seq1.lines[0],seq2.lines[0]);
+
   while (seq1.count && seq2.count)
     {
       size_t i;
@@ -926,6 +938,24 @@ add_file_name (char *name, char *names[2],
     *optc_status = MIGHT_BE_O_ARG;
 }
 
+static void  /* Auto-format helper: queue field 0 of file 0, then each file's fields other than its join field.  */
+build_output_format(struct line const *line1, struct line const* line2)
+{
+  int i ;  /* NOTE(review): signed index compared against join_field_N and nfields; confirm those are not unsigned */
+  if (outlist_head.next)  /* presumably an output list already exists (e.g. built from -o); leave it untouched */
+    return;
+
+  add_field(0,0);  /* presumably the "0" (join field) entry of -o notation -- TODO confirm against add_field */
+  for (i = 0; i < join_field_1 && i < line1->nfields; ++i)  /* file 1: fields before its join field */
+    add_field(1,i);
+  for (i = join_field_1 + 1; i < line1->nfields; ++i)  /* file 1: fields after its join field */
+    add_field(1,i);
+  for (i = 0; i < join_field_2 && i < line2->nfields; ++i)  /* file 2: fields before its join field */
+    add_field(2,i);
+  for (i = join_field_2 + 1; i < line2->nfields; ++i)  /* file 2: fields after its join field */
+    add_field(2,i);
+}
+
 int
 main (int argc, char **argv)
 {
@@ -954,7 +984,7 @@ main (int argc, char **argv)
   issued_disorder_warning[0] = issued_disorder_warning[1] = false;
   check_input_order = CHECK_ORDER_DEFAULT;
 
-  while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
+  while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:F",
                               longopts, NULL))
          != -1)
     {
@@ -1052,6 +1082,10 @@ main (int argc, char **argv)
                          &nfiles, &prev_optc_status, &optc_status);
           break;
 
+        case 'F':
+          auto_output_format = true;
+          break;
+
         case_GETOPT_HELP_CHAR;
 
         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
diff --git a/tests/misc/join b/tests/misc/join
index d1f1677..3f1e590 100755
--- a/tests/misc/join
+++ b/tests/misc/join
@@ -185,6 +185,27 @@ my @tv = (
 # Before 6.10.143, this would mistakenly fail with the diagnostic:
 # join: File 1 is not in sorted order
 ['chkodr-7', '-12', ["2 a\n1 b\n", ""], "", 0],
+
+# Auto-format
+['autoformat-1', '-j1 -a1 -a2 -F -e FOO',
+ ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 a X\n2 b FOO\n3 FOO Y\n", 0],
+
+# Auto-format, with empty filler (no '-e' specified)-
+# should print the column delimiters (space characters), but no filler.
+# This should be equivalent to specifying "-o 0,1.2,2.2" without "-e".
+['autoformat-2', '-j1 -a1 -a2 -F',
+ ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 a X\n2 b \n3  Y\n", 0],
+
+# auto-format sanity check: specify explicit output format without -e,
+# make sure it matches the above test.
+['autoformat-3', '-j1 -a1 -a2 -o 0,1.2,2.2',
+ ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 a X\n2 b \n3  Y\n", 0],
+
+# both auto-format and explicit output format (different format than 'auto'),
+# auto-format should be silently ignored.
+['autoformat-4', '-j1 -a1 -a2 -e FOO -F -o 0,2.2,1.2',
+ ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 X a\n2 FOO b\n3 Y FOO\n", 0],
+
 );
 
 # Convert the above old-style test vectors to the newer




reply via email to

[Prev in Thread] Current Thread [Next in Thread]