bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Human readable sort


From: Pádraig Brady
Subject: Re: Human readable sort
Date: Mon, 27 Apr 2009 10:01:33 +0100
User-agent: Thunderbird 2.0.0.6 (X11/20071008)

Michael Speer wrote:
> 2009/4/25 Pádraig Brady <address@hidden>:
>> I've further modified your latest in the attached.
>> I refactored the suffix finding a bit and also added
>> support for --sort=human-numeric.
> 
> I refactored it again to handle some potential problems with how
> separators and decimals points were handled.  It will still let you
> write something silly like "1,3,4.5.6", but I've stopped scanning on
> "4..4" or "3,,2" or even "5.M".  I'm not sure if that last one is used
> meaningfully anywhere.

This needs another cycle I think.
BTW earlier in this thread I pasted the wrong link to the
previous attempt to include this feature. This is the right one:
http://lists.gnu.org/archive/html/bug-coreutils/2009-01/threads.html#00006

Anyway attached an updated version which supports negative
numbers as it was pretty trivial to add. I also removed the
explicit check for thousands_sep==-1 as I changed to using
unsigned char.

Some performance measurements of this version are
(where "ret 0" is just returning 0 at the top of
find_unit_order() to show the function call overheads.)

seconds to sort 1 million ints:
-----------------------------------
sort option    time         difference
-----------------------------------
-n             2.75
-h (ret 0)     3.10         +13%
-h             3.96         +44%

seconds to sort 1 million sizes (max len = 4):
-----------------------------------
sort option    time         difference
-----------------------------------
-n             2.54
-h (ret 0)     2.70         +6%
-h             3.50         +38%

I haven't really looked at optimizing it yet.

cheers,
Pádraig.
diff --git a/src/sort.c b/src/sort.c
index f48d727..640cf1c 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -176,6 +176,8 @@ struct keyfield
   bool random;                 /* Sort by random hash of key.  */
   bool general_numeric;                /* Flag for general, numeric comparison.
                                   Handle numbers in exponential notation. */
+  bool human_numeric;           /* Flag for sorting by human readable
+                                   units with either SI xor IEC prefixes. */
   bool month;                  /* Flag for comparison by month name. */
   bool reverse;                        /* Reverse the sense of comparison. */
   bool version;                        /* sort by version number */
@@ -336,6 +338,9 @@ Ordering options:\n\
   -i, --ignore-nonprinting    consider only printable characters\n\
   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 "), stdout);
+      fputs(_("\
+  -h, --human-numeric-sort    compare human readable numbers (e.g., 2K 1G)\n\
+"), stdout);
       fputs (_("\
   -n, --numeric-sort          compare according to string numerical value\n\
   -R, --random-sort           sort by random hash of keys\n\
@@ -344,8 +349,8 @@ Ordering options:\n\
 "), stdout);
       fputs (_("\
       --sort=WORD             sort according to WORD:\n\
-                                general-numeric -g, month -M, numeric -n,\n\
-                                random -R, version -V\n\
+                                general-numeric -g, human-numeric -h, month 
-M,\n\
+                                numeric -n, random -R, version -V\n\
   -V, --version-sort          natural sort of (version) numbers within text\n\
 \n\
 "), stdout);
@@ -426,7 +431,7 @@ enum
   SORT_OPTION
 };
 
-static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z";
+static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
 
 static struct option const long_options[] =
 {
@@ -442,6 +447,7 @@ static struct option const long_options[] =
   {"merge", no_argument, NULL, 'm'},
   {"month-sort", no_argument, NULL, 'M'},
   {"numeric-sort", no_argument, NULL, 'n'},
+  {"human-numeric-sort", no_argument, NULL, 'h'},
   {"version-sort", no_argument, NULL, 'V'},
   {"random-sort", no_argument, NULL, 'R'},
   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
@@ -480,6 +486,7 @@ static char const check_types[] =
 
 #define SORT_TABLE \
   _st_("general-numeric", 'g') \
+  _st_("human-numeric",   'h') \
   _st_("month",           'M') \
   _st_("numeric",         'n') \
   _st_("random",          'R') \
@@ -1673,6 +1680,87 @@ numcompare (const char *a, const char *b)
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }
 
+/* Exit with an error if a mixture of SI and IEC units detected.  */
+
+static void
+check_mixed_SI_IEC (char prefix)
+{
+  static int seen_si = -1;
+  bool si_present = prefix == 'i';
+  if (seen_si != -1 && seen_si != si_present)
+    error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units"));
+  seen_si = si_present;
+}
+
+/* Return an integer which represents the order of magnitude of
+   the unit following the number.  NUMBER can contain thousands separators
+   or a decimal point, but not have preceeding blanks.
+   Negative numbers return a negative unit order.  */
+
+static int
+find_unit_order (const char* number)
+{
+  static const char orders [UCHAR_LIM] = {
+    ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
+    ['k']=1,
+  };
+
+  const unsigned char *p = number;
+
+  int sign = 1;
+
+  if (*p == '-')
+    {
+      sign = -1;
+      p++;
+    }
+
+  /* Scan to end of number.
+     Decimals or separators not followed by digits stop the scan.
+     Numbers ending in decimals or separators are thus considered
+     to be lacking in units.
+     FIXME: add support for multibyte thousands_sep and decimal_point.  */
+
+  while (ISDIGIT (*p))
+    {
+      p++;
+
+      if (*p == decimal_point && ISDIGIT (*(p+1)))
+       p += 2;
+      else if (*p == thousands_sep && ISDIGIT (*(p+1)))
+       p += 2;
+    }
+
+  int order = orders[to_uchar (*p)];
+
+  /* For valid units check for MiB vs MB etc.  */
+  if (order)
+    check_mixed_SI_IEC (*(p+1));
+
+  return sign * order;
+}
+
+/* Compare numbers ending in units with SI xor IEC prefixes
+          <none/unknown> < K < M < G < T < P < E < Z < Y
+   Assume that numbers are properly abbreviated.
+   i.e. input will never have 5000K instead of 5M.  */
+
+static int
+human_numcompare (const char *a, const char *b)
+{
+  while (blanks[to_uchar (*a)])
+    a++;
+  while (blanks[to_uchar (*b)])
+    b++;
+
+  int aw = find_unit_order (a);
+  int bw = find_unit_order (b);
+
+  return (aw > bw ? 1
+          : aw < bw ? -1
+          : strnumcmp (a , b , decimal_point , thousands_sep));
+}
+
 static int
 general_numcompare (const char *sa, const char *sb)
 {
@@ -1917,13 +2005,14 @@ keycompare (const struct line *a, const struct line *b)
 
       if (key->random)
        diff = compare_random (texta, lena, textb, lenb);
-      else if (key->numeric | key->general_numeric)
+      else if (key->numeric | key->general_numeric | key->human_numeric)
        {
          char savea = *lima, saveb = *limb;
 
          *lima = *limb = '\0';
-         diff = ((key->numeric ? numcompare : general_numcompare)
-                 (texta, textb));
+         diff = ((key->numeric ? numcompare
+                  : key->general_numeric ? general_numcompare
+                  : human_numcompare) (texta, textb));
          *lima = savea, *limb = saveb;
        }
       else if (key->version)
@@ -2889,7 +2978,7 @@ check_ordering_compatibility (void)
 
   for (key = keylist; key; key = key->next)
     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
-             + key->version + !!key->ignore))
+             + key->version + (!!key->ignore) + key->human_numeric))
        || (key->random && key->translate))
       {
        /* The following is too big, but guaranteed to be "big enough". */
@@ -2901,6 +2990,8 @@ check_ordering_compatibility (void)
          *p++ = 'f';
        if (key->general_numeric)
          *p++ = 'g';
+        if (key->human_numeric)
+          *p++ = 'h';
        if (key->ignore == nonprinting)
          *p++ = 'i';
        if (key->month)
@@ -2992,6 +3083,9 @@ set_ordering (const char *s, struct keyfield *key, enum 
blanktype blanktype)
        case 'g':
          key->general_numeric = true;
          break;
+        case 'h':
+          key->human_numeric = true;
+          break;
        case 'i':
          /* Option order should not matter, so don't let -i override
             -d.  -d implies -i, but -i does not imply -d.  */
@@ -3140,7 +3234,8 @@ main (int argc, char **argv)
   gkey.sword = gkey.eword = SIZE_MAX;
   gkey.ignore = NULL;
   gkey.translate = NULL;
-  gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false;
+  gkey.numeric = gkey.general_numeric = gkey.human_numeric = false;
+  gkey.random = gkey.version = false;
   gkey.month = gkey.reverse = false;
   gkey.skipsblanks = gkey.skipeblanks = false;
 
@@ -3219,6 +3314,7 @@ main (int argc, char **argv)
        case 'd':
        case 'f':
        case 'g':
+        case 'h':
        case 'i':
        case 'M':
        case 'n':
@@ -3471,6 +3567,7 @@ main (int argc, char **argv)
                 | key->numeric
                 | key->version
                 | key->general_numeric
+                 | key->human_numeric
                 | key->random)))
         {
           key->ignore = gkey.ignore;
@@ -3480,6 +3577,7 @@ main (int argc, char **argv)
           key->month = gkey.month;
           key->numeric = gkey.numeric;
           key->general_numeric = gkey.general_numeric;
+          key->human_numeric = gkey.human_numeric;
           key->random = gkey.random;
           key->reverse = gkey.reverse;
           key->version = gkey.version;
@@ -3495,6 +3593,7 @@ main (int argc, char **argv)
                       | gkey.month
                       | gkey.numeric
                       | gkey.general_numeric
+                       | gkey.human_numeric
                       | gkey.random
                       | gkey.version)))
     {

reply via email to

[Prev in Thread] Current Thread [Next in Thread]