Re: Support bytesize comparison in sort

bug-coreutils

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Support bytesize comparison in sort

From:	Mart Somermaa
Subject:	Re: Support bytesize comparison in sort
Date:	Sat, 08 Apr 2006 14:43:22 +0300
User-agent:	Mail/News 1.5 (X11/20060309)

Andreas Schwab wrote:
> Mart Somermaa <address@hidden> writes:
>
>   
>> @@ -295,6 +299,7 @@ Ordering options:\n\
>>  "), stdout);
>>        fputs (_("\
>>    -b, --ignore-leading-blanks  ignore leading blanks\n\
>> +  -B, --size-in-bytes         compare bytesizes (numbers suffixed with K, M, G)\n\
>>     
>
> You forgot to update the help string.
>
> Andreas.

Fixed. Added documentation to coreutils.texi.

diff -burp coreutils-5.94.orig/doc/coreutils.texi coreutils-5.94/doc/coreutils.texi
--- coreutils-5.94.orig/doc/coreutils.texi      2006-02-07 10:31:28.000000000 +0200
+++ coreutils-5.94/doc/coreutils.texi   2006-04-08 13:22:40.000000000 +0300
@@ -3257,6 +3257,23 @@ Use this option only if there is no alte
 @option{--numeric-sort} (@option{-n}) and it can lose information when
 converting to floating point.
 
+@item -h
+@itemx --human-readable-bytesize
+@opindex -h
+@opindex --human-readable-bytesize
+@cindex sort human readable bytesizes
+Behaves like @option{--general-numeric-sort}, but also takes into
+account human-readable bytesize suffixes (K for kilo-, M for mega-,
+G for giga-, T for tera-, P for peta-, E for exa-, Z for zetta-
+and Y for yottabytes).
+
+Note that input numbers are assumed to be properly scaled (i.e. 1M
+should always be used instead of 1024K or 1000K) -- numbers in K will
+always compare less than numbers in M, similarly with M and G, G and T
+etc. This approach makes the routine immune to the 1KB = 1000B versus
+1KB = 1024B problem, sorting correctly the output of other GNU utilities
+that accept either @option{--human-readable} or @option{--si} options.
+
 @item -i
 @itemx --ignore-nonprinting
 @opindex -i
diff -burp coreutils-5.94.orig/src/sort.c coreutils-5.94/src/sort.c
--- coreutils-5.94.orig/src/sort.c      2005-10-07 21:48:28.000000000 +0300
+++ coreutils-5.94/src/sort.c   2006-04-08 13:00:10.000000000 +0300
@@ -26,6 +26,7 @@
 #include <getopt.h>
 #include <sys/types.h>
 #include <signal.h>
+#include <ctype.h>
 #include "system.h"
 #include "error.h"
 #include "hard-locale.h"
@@ -149,6 +150,9 @@ struct keyfield
                                   point, but no exponential notation. */
   bool general_numeric;                /* Flag for general, numeric comparison.
                                   Handle numbers in exponential notation. */
+  bool size_in_bytes;          /* Flag for human-readable bytesize comparison.
+                                  Handle numbers suffixed with K, M, G, T,
+                                  P, E, Z or Y (kilo- to yottabytes). */
   bool month;                  /* Flag for comparison by month name. */
   bool reverse;                        /* Reverse the sense of comparison. */
   struct keyfield *next;       /* Next keyfield to try. */
@@ -300,6 +304,7 @@ Ordering options:\n\
 "), stdout);
       fputs (_("\
   -g, --general-numeric-sort  compare according to general numerical value\n\
+  -h, --human-readable-bytesize compare bytesizes (suffixed with K, M, G etc)\n\
   -i, --ignore-nonprinting    consider only printable characters\n\
   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
   -n, --numeric-sort          compare according to string numerical value\n\
@@ -353,7 +358,7 @@ native byte values.\n\
   exit (status);
 }
 
-static char const short_options[] = "-bcdfgik:mMno:rsS:t:T:uy:z";
+static char const short_options[] = "-bcdfghik:mMno:rsS:t:T:uy:z";
 
 static struct option const long_options[] =
 {
@@ -362,6 +367,7 @@ static struct option const long_options[
   {"dictionary-order", no_argument, NULL, 'd'},
   {"ignore-case", no_argument, NULL, 'f'},
   {"general-numeric-sort", no_argument, NULL, 'g'},
+  {"human-readable-bytesize", no_argument, NULL, 'h'},
   {"ignore-nonprinting", no_argument, NULL, 'i'},
   {"key", required_argument, NULL, 'k'},
   {"merge", no_argument, NULL, 'm'},
@@ -1077,8 +1083,14 @@ numcompare (const char *a, const char *b
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }
 
-static int
-general_numcompare (const char *sa, const char *sb)
+/* If size_in_bytes is true, compare strings A and B as human-readable
+ * positive byte counts (as returned e.g. by df -h) suffixed with
+ * either 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y' for kilobytes,
+ * megabytes, gigabytes, terabytes, petabytes, exabytes, zettabytes,
+ * yottabytes.
+ */
+static int general_numcompare (const char *sa, const char *sb,
+               bool size_in_bytes)
 {
   /* FIXME: add option to warn about failed conversions.  */
   /* FIXME: maybe add option to try expensive FP conversion
@@ -1095,6 +1107,49 @@ general_numcompare (const char *sa, cons
   if (sb == eb)
     return 1;
 
+  if (size_in_bytes && ea && eb)
+    {
+      char ca, cb;
+
+      while (isblank(*ea))
+             ea++;
+      while (isblank(*eb))
+             eb++;
+
+      ca = (char) tolower(*ea);
+      cb = (char) tolower(*eb);
+
+      /* 1) We don't require both operands to have a known suffix. 
+       * 2) If both suffixes are unknown or equal, compare as usual */
+      if (! ( (ca == 'k' || ca == 'm' || ca == 'g' || ca == 't' 
+                     || ca == 'p' || ca == 'e' || ca == 'z' || ca == 'y')
+             ||
+             (cb == 'k' || cb == 'm' || cb == 'g' || cb == 't' 
+                     || cb == 'p' || cb == 'e' || cb == 'z' || cb == 'y') )
+         || ca == cb)
+       goto compare_as_usual;
+
+      /* As ca != cb, if ca in YB => ca bigger, cb in YB => cb bigger,
+       * if neither of these, if ca in ZB => ca bigger etc */
+      return (ca == 'y' ? 1
+             : cb == 'y' ? -1
+             : ca == 'z' ? 1
+             : cb == 'z' ? -1
+             : ca == 'e' ? 1
+             : cb == 'e' ? -1
+             : ca == 'p' ? 1
+             : cb == 'p' ? -1
+             : ca == 't' ? 1
+             : cb == 't' ? -1
+             : ca == 'g' ? 1
+             : cb == 'g' ? -1
+             : ca == 'm' ? 1
+             : cb == 'm' ? -1
+             : ca == 'k' ? 1 /* ca in KB and cb without a known suffix */
+             : -1); /* cb in KB and ca without a known suffix */
+    }
+
+compare_as_usual:
   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
      conversion errors but before numbers; sort them by internal
      bit-pattern, for lack of a more portable alternative.  */
@@ -1179,13 +1234,14 @@ keycompare (const struct line *a, const 
       size_t lenb = limb <= textb ? 0 : limb - textb;
 
       /* Actually compare the fields. */
-      if (key->numeric | key->general_numeric)
+      if (key->numeric | key->general_numeric | key->size_in_bytes)
        {
          char savea = *lima, saveb = *limb;
 
          *lima = *limb = '\0';
-         diff = ((key->numeric ? numcompare : general_numcompare)
-                 (texta, textb));
+         diff = (key->numeric ?
+                   numcompare(texta, textb) :
+                   general_numcompare(texta, textb, key->size_in_bytes));
          *lima = savea, *limb = saveb;
        }
       else if (key->month)
@@ -2069,6 +2125,9 @@ set_ordering (const char *s, struct keyf
        case 'g':
          key->general_numeric = true;
          break;
+       case 'h':
+         key->size_in_bytes = true;
+         break;
        case 'i':
          /* Option order should not matter, so don't let -i override
             -d.  -d implies -i, but -i does not imply -d.  */
@@ -2187,7 +2246,8 @@ main (int argc, char **argv)
   gkey.sword = gkey.eword = SIZE_MAX;
   gkey.ignore = NULL;
   gkey.translate = NULL;
-  gkey.numeric = gkey.general_numeric = gkey.month = gkey.reverse = false;
+  gkey.numeric = gkey.general_numeric = gkey.size_in_bytes = false;
+  gkey.month = gkey.reverse = false;
   gkey.skipsblanks = gkey.skipeblanks = false;
 
   files = xnmalloc (argc, sizeof *files);
@@ -2259,6 +2319,7 @@ main (int argc, char **argv)
        case 'd':
        case 'f':
        case 'g':
+       case 'h':
        case 'i':
        case 'M':
        case 'n':
@@ -2418,7 +2479,7 @@ main (int argc, char **argv)
     if (! (key->ignore || key->translate
           || (key->skipsblanks | key->reverse
               | key->skipeblanks | key->month | key->numeric
-              | key->general_numeric)))
+              | key->general_numeric | key->size_in_bytes)))
       {
        key->ignore = gkey.ignore;
        key->translate = gkey.translate;
@@ -2427,12 +2488,14 @@ main (int argc, char **argv)
        key->month = gkey.month;
        key->numeric = gkey.numeric;
        key->general_numeric = gkey.general_numeric;
+       key->size_in_bytes = gkey.size_in_bytes;
        key->reverse = gkey.reverse;
       }
 
   if (!keylist && (gkey.ignore || gkey.translate
                   || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
-                      | gkey.numeric | gkey.general_numeric)))
+                      | gkey.numeric | gkey.general_numeric
+                      | gkey.size_in_bytes )))
     insertkey (&gkey);
   reverse = gkey.reverse;

[Prev in Thread]

Current Thread

[Next in Thread]

Support bytesize comparison in sort, Mart Somermaa, 2006/04/06
- Re: Support bytesize comparison in sort, James Youngman, 2006/04/06
  - Re: Support bytesize comparison in sort, Andrew D Jewell, 2006/04/06
    - Re: Support bytesize comparison in sort, Mart Somermaa, 2006/04/06
    - Re: Support bytesize comparison in sort, Andreas Schwab, 2006/04/06
    - Re: Support bytesize comparison in sort, Mart Somermaa <=

Prev by Date: Re: Enhancement request for od, dd
Next by Date: Small documentation fix
Previous by thread: Re: Support bytesize comparison in sort
Next by thread: Enhancement request for od, dd
Index(es):
- Date
- Thread