bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Human readable sort


From: Pádraig Brady
Subject: Re: Human readable sort
Date: Sat, 25 Apr 2009 02:05:10 +0100
User-agent: Thunderbird 2.0.0.6 (X11/20071008)

Michael Speer wrote:
> I wrote the following patch to the 7.2 branch of coreutils to allow
> `sort` to sort by human readable byte sizes.  I looked around a bit to
> see what the status of previous attempts to integrate this
> functionality were, but didn't see any very recent activity.  This is
> my first interaction with coreutils, so if I missed something obvious,
> please point me towards it.
> 
> Is the last potential patch (
> http://www.mail-archive.com/address@hidden/msg14080.html )
> moving through?  If not, if I cleaned this up ( tabs, documentation,
> and test cases ) and applied it to the current HEAD on savannah is
> there a chance of getting this functionality into sort?

Thanks for reviving this again.
There was a more recent attempt that petered out unfortunately:
http://www.mail-archive.com/address@hidden/msg14080.html

> 
> Patch assumptions :
>   * that numbers will use the best representation ( never uses 1024b
> instead of 1k, etc )
>   * that the sizes will be specified via suffixes of b, K, M, G, T, P,
> E, Z, Y or their alternately cased variants
> 
> The first assumption results in checking only the suffix when they differ.
> This enables it to match the output of `du -h / du --si`, but possibly
> not other tools that do not conform to these assumptions.

The consensus was that these assumptions are appropriate and useful.

We assume C99 support now for coreutils so I tweaked your patch,
the main change being to greatly shrink the lookup table initialisation.
Note I commented out the lower case letters (except 'k') as I don't
think any coreutils generate those and they could preclude supporting
other suffixes in future. I'm not sure about doing that but I think it's
better to err on the side of too few suffixes than too many?

Something else to consider is to flag when
a mixture of SI and IEC units are used, as
this not being supported might not be obvious
to users and could cause difficult to debug issues for users.
I.E. flag an error if the following input is presented.
  999MB
  998MiB
I added a very quick hack for that to the patch for illustration.

I also noticed that you didn't terminate the fields before
processing as was done for the other numeric sorts?
So I changed that also in the attached patch but didn't
analyze it TBH.

cheers,
Pádraig.

p.s. obviously docs and help and tests need to be written,
but we can do that after we get the implementation done.
diff --git a/src/sort.c b/src/sort.c
index f48d727..a2ed015 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -176,6 +176,7 @@ struct keyfield
   bool random;                 /* Sort by random hash of key.  */
   bool general_numeric;                /* Flag for general, numeric comparison.
                                   Handle numbers in exponential notation. */
+  bool human_numeric;           /* Flag for sorting by common suffixes. */
   bool month;                  /* Flag for comparison by month name. */
   bool reverse;                        /* Reverse the sense of comparison. */
   bool version;                        /* sort by version number */
@@ -426,7 +427,7 @@ enum
   SORT_OPTION
 };
 
-static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z";
+static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
 
 static struct option const long_options[] =
 {
@@ -442,6 +443,7 @@ static struct option const long_options[] =
   {"merge", no_argument, NULL, 'm'},
   {"month-sort", no_argument, NULL, 'M'},
   {"numeric-sort", no_argument, NULL, 'n'},
+  {"human-sort", no_argument, NULL, 'h'},
   {"version-sort", no_argument, NULL, 'V'},
   {"random-sort", no_argument, NULL, 'R'},
   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
@@ -1673,6 +1675,54 @@ numcompare (const char *a, const char *b)
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }
 
+/* error if a mixture of SI and IEC units used.  */
+static void
+check_mixed_SI_IEC (char suffix)
+{
+  static int seen_si = -1;
+  bool si_present = suffix == 'i';
+  if (seen_si != -1 && seen_si != si_present)
+    error (SORT_FAILURE, 0, _("Both SI and IEC suffixes present"));
+  seen_si = si_present;
+}
+
+/* Compare numeric entities ending in human readable size specifiers
+          b < K < M < G < T < P < E < Z < Y
+   We assume that numbers are properly abbreviated.
+   For example, you will never see 500,000,000b, instead of 5M.  */
+
+static int
+human_compare(const char *a, const char *b)
+{
+  static const char weights [] = {
+    ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
+    ['k']=1, /*['m']=2, ['g']=3, ['t']=4, ['p']=5, ['e']=6, ['z']=7, ['y']=8,*/
+  };
+
+  while (blanks[to_uchar (*a)])
+    a++;
+  while (blanks[to_uchar (*b)])
+    b++;
+
+  const char *ar = a;
+  const char *br = b;
+
+  while( ISDIGIT (*ar) || (*ar) == decimal_point || (*ar) == thousands_sep )
+    ar++;
+  while( ISDIGIT (*br) || (*br) == decimal_point || (*br) == thousands_sep )
+    br++;
+
+  check_mixed_SI_IEC (*(ar+1));
+  check_mixed_SI_IEC (*(br+1));
+
+  int aw = weights[to_uchar (*ar)];
+  int bw = weights[to_uchar (*br)];
+
+  return (aw > bw ? 1
+          : aw < bw ? -1
+          : strnumcmp( a , b , decimal_point , thousands_sep));
+}
+
 static int
 general_numcompare (const char *sa, const char *sb)
 {
@@ -1917,13 +1967,14 @@ keycompare (const struct line *a, const struct line *b)
 
       if (key->random)
        diff = compare_random (texta, lena, textb, lenb);
-      else if (key->numeric | key->general_numeric)
+      else if (key->numeric | key->general_numeric | key->human_numeric)
        {
          char savea = *lima, saveb = *limb;
 
          *lima = *limb = '\0';
-         diff = ((key->numeric ? numcompare : general_numcompare)
-                 (texta, textb));
+         diff = ((key->numeric ? numcompare
+                  : key->general_numeric ? general_numcompare
+                  : human_compare) (texta, textb));
          *lima = savea, *limb = saveb;
        }
       else if (key->version)
@@ -2889,7 +2940,7 @@ check_ordering_compatibility (void)
 
   for (key = keylist; key; key = key->next)
     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
-             + key->version + !!key->ignore))
+             + key->version + (!!key->ignore) + key->human_numeric))
        || (key->random && key->translate))
       {
        /* The following is too big, but guaranteed to be "big enough". */
@@ -2901,6 +2952,8 @@ check_ordering_compatibility (void)
          *p++ = 'f';
        if (key->general_numeric)
          *p++ = 'g';
+        if (key->human_numeric)
+          *p++ = 'h';
        if (key->ignore == nonprinting)
          *p++ = 'i';
        if (key->month)
@@ -2992,6 +3045,9 @@ set_ordering (const char *s, struct keyfield *key, enum 
blanktype blanktype)
        case 'g':
          key->general_numeric = true;
          break;
+        case 'h':
+          key->human_numeric = true;
+          break;
        case 'i':
          /* Option order should not matter, so don't let -i override
             -d.  -d implies -i, but -i does not imply -d.  */
@@ -3140,7 +3196,8 @@ main (int argc, char **argv)
   gkey.sword = gkey.eword = SIZE_MAX;
   gkey.ignore = NULL;
   gkey.translate = NULL;
-  gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false;
+  gkey.numeric = gkey.general_numeric = gkey.human_numeric = false;
+  gkey.random = gkey.version = false;
   gkey.month = gkey.reverse = false;
   gkey.skipsblanks = gkey.skipeblanks = false;
 
@@ -3219,6 +3276,7 @@ main (int argc, char **argv)
        case 'd':
        case 'f':
        case 'g':
+        case 'h':
        case 'i':
        case 'M':
        case 'n':
@@ -3471,6 +3529,7 @@ main (int argc, char **argv)
                 | key->numeric
                 | key->version
                 | key->general_numeric
+                 | key->human_numeric
                 | key->random)))
         {
           key->ignore = gkey.ignore;
@@ -3480,6 +3539,7 @@ main (int argc, char **argv)
           key->month = gkey.month;
           key->numeric = gkey.numeric;
           key->general_numeric = gkey.general_numeric;
+          key->human_numeric = gkey.human_numeric;
           key->random = gkey.random;
           key->reverse = gkey.reverse;
           key->version = gkey.version;
@@ -3495,6 +3555,7 @@ main (int argc, char **argv)
                       | gkey.month
                       | gkey.numeric
                       | gkey.general_numeric
+                       | gkey.human_numeric
                       | gkey.random
                       | gkey.version)))
     {

reply via email to

[Prev in Thread] Current Thread [Next in Thread]