[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: Human readable sort
From: |
Pádraig Brady |
Subject: |
Re: Human readable sort |
Date: |
Mon, 27 Apr 2009 17:18:19 +0100 |
User-agent: |
Thunderbird 2.0.0.6 (X11/20071008) |
address@hidden wrote:
> On Apr 27, 2009 11:41am, Ondřej Vašík <address@hidden> wrote:
>> Pádraig Brady wrote:
>>
>> > Pádraig Brady wrote:
>>
>> > Attached is the full patch, which hopefully we can push soon.
>>
>>
>>
>> I'm not objecting anything relevant in that patch, just the tab/spaces
>>
>> mixing looks inconsistent with the rest of the code in added lines in
>>
>> sort.c . Only cosmetic thing ... but is this intentional?
>>
>>
>>
>> Greetings,
>>
>> Ondřej Vašík
>>
>
> That is my fault. My home and work emacs are configured with tabs
> disabled for interaction with my office and home repos. I'll do cleanup
> against the patch this afternoon when I can look at it.
Thanks Ondřej.
This is fixed up in attached I think.
cheers,
Pádraig.
p.s. Micheal, which email address do you want in THANKS?
>From 2e1ee46e97858717535210729f5e0181c08bf6d1 Mon Sep 17 00:00:00 2001
From: Michael Speer <address@hidden>
Date: Mon, 27 Apr 2009 14:51:29 +0100
Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc.
* NEWS: document the new option
* doc/coreutils.texi: ditto
* src/sort.c (main): handle the new -human-numeric-sort option (-h).
(human_numcompare): A new function to compare SI and IEC suffixes
before falling back to the standard --numeric comparision.
(find_unit_order): A new helper function to find the order
of magnitude of a number string as determined by its suffix.
(check_mixed_SI_IEC): A new helper function to exit with error
if both SI and IEC suffixes are presented.
* tests/misc/sort: Add 6 tests to test the new functionality.
* THANKS: Update
---
NEWS | 5 ++
THANKS | 1 +
doc/coreutils.texi | 14 ++++++
src/sort.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++----
tests/misc/sort | 13 ++++++
5 files changed, 140 insertions(+), 8 deletions(-)
diff --git a/NEWS b/NEWS
index 8cb17cc..bfebe1d 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU coreutils NEWS -*-
outline -*-
* Noteworthy changes in release 7.3 (????-??-??) [?]
+** New features
+
+ sort accepts a new option, --human-numeric-sort (-h): sort numbers
+ while honouring human readable suffixes like KiB and MB etc.
+
** Bug fixes
sort -m no longer segfaults when its output file is also an input file.
diff --git a/THANKS b/THANKS
index 876a6b6..da48d6d 100644
--- a/THANKS
+++ b/THANKS
@@ -395,6 +395,7 @@ Michael J. Croghan address@hidden
Michael McFarland address@hidden
Michael McLagan address@hidden
Michael Piefel address@hidden
+Michael Speer address@hidden
Michael Steffens address@hidden
Michael Stone address@hidden
Michael Stutz address@hidden
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 918f44e..8f73419 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3785,6 +3785,20 @@ Use this option only if there is no alternative; it is
much slower than
@option{--numeric-sort} (@option{-n}) and it can lose information when
converting to floating point.
address@hidden -h
address@hidden --human-numeric-sort
address@hidden --sort=human-numeric
address@hidden -h
address@hidden --human-numeric-sort
address@hidden --sort
address@hidden human numeric sort
address@hidden LC_NUMERIC
+Sort numerically, as per the @option{--numeric-sort} option,
+and in addition handle IEC or SI suffixes like MiB, MB etc.
+Note a mixture of these suffixes is not supported and will
+be flagged as an error. Also the numbers must be abbreviated unambiguously.
+I.E. 5000K and 6M will be sorted incorrectly for example.
+
@item -i
@itemx --ignore-nonprinting
@opindex -i
diff --git a/src/sort.c b/src/sort.c
index f48d727..8a85d54 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -176,6 +176,8 @@ struct keyfield
bool random; /* Sort by random hash of key. */
bool general_numeric; /* Flag for general, numeric comparison.
Handle numbers in exponential notation. */
+ bool human_numeric; /* Flag for sorting by human readable
+ units with either SI xor IEC prefixes. */
bool month; /* Flag for comparison by month name. */
bool reverse; /* Reverse the sense of comparison. */
bool version; /* sort by version number */
@@ -337,6 +339,9 @@ Ordering options:\n\
-M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
"), stdout);
fputs (_("\
+ -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
+"), stdout);
+ fputs (_("\
-n, --numeric-sort compare according to string numerical value\n\
-R, --random-sort sort by random hash of keys\n\
--random-source=FILE get random bytes from FILE\n\
@@ -344,8 +349,8 @@ Ordering options:\n\
"), stdout);
fputs (_("\
--sort=WORD sort according to WORD:\n\
- general-numeric -g, month -M, numeric -n,\n\
- random -R, version -V\n\
+ general-numeric -g, human-numeric -h, month
-M,\n\
+ numeric -n, random -R, version -V\n\
-V, --version-sort natural sort of (version) numbers within text\n\
\n\
"), stdout);
@@ -426,7 +431,7 @@ enum
SORT_OPTION
};
-static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z";
+static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
static struct option const long_options[] =
{
@@ -442,6 +447,7 @@ static struct option const long_options[] =
{"merge", no_argument, NULL, 'm'},
{"month-sort", no_argument, NULL, 'M'},
{"numeric-sort", no_argument, NULL, 'n'},
+ {"human-numeric-sort", no_argument, NULL, 'h'},
{"version-sort", no_argument, NULL, 'V'},
{"random-sort", no_argument, NULL, 'R'},
{"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
@@ -480,6 +486,7 @@ static char const check_types[] =
#define SORT_TABLE \
_st_("general-numeric", 'g') \
+ _st_("human-numeric", 'h') \
_st_("month", 'M') \
_st_("numeric", 'n') \
_st_("random", 'R') \
@@ -1673,6 +1680,87 @@ numcompare (const char *a, const char *b)
return strnumcmp (a, b, decimal_point, thousands_sep);
}
+/* Exit with an error if a mixture of SI and IEC units detected. */
+
+static void
+check_mixed_SI_IEC (char prefix)
+{
+ static int seen_si = -1;
+ bool si_present = prefix == 'i';
+ if (seen_si != -1 && seen_si != si_present)
+ error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units"));
+ seen_si = si_present;
+}
+
+/* Return an integer which represents the order of magnitude of
+ the unit following the number. NUMBER can contain thousands separators
+ or a decimal point, but not have preceeding blanks.
+ Negative numbers return a negative unit order. */
+
+static int
+find_unit_order (const char *number)
+{
+ static const char orders [UCHAR_LIM] = {
+ ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
+ ['k']=1,
+ };
+
+ const unsigned char *p = number;
+
+ int sign = 1;
+
+ if (*p == '-')
+ {
+ sign = -1;
+ p++;
+ }
+
+ /* Scan to end of number.
+ Decimals or separators not followed by digits stop the scan.
+ Numbers ending in decimals or separators are thus considered
+ to be lacking in units.
+ FIXME: add support for multibyte thousands_sep and decimal_point. */
+
+ while (ISDIGIT (*p))
+ {
+ p++;
+
+ if (*p == decimal_point && ISDIGIT (*(p + 1)))
+ p += 2;
+ else if (*p == thousands_sep && ISDIGIT (*(p + 1)))
+ p += 2;
+ }
+
+ int order = orders[*p];
+
+ /* For valid units check for MiB vs MB etc. */
+ if (order)
+ check_mixed_SI_IEC (*(p + 1));
+
+ return sign * order;
+}
+
+/* Compare numbers ending in units with SI xor IEC prefixes
+ <none/unknown> < K/k < M < G < T < P < E < Z < Y
+ Assume that numbers are properly abbreviated.
+ i.e. input will never have both 5000K and 6M. */
+
+static int
+human_numcompare (const char *a, const char *b)
+{
+ while (blanks[to_uchar (*a)])
+ a++;
+ while (blanks[to_uchar (*b)])
+ b++;
+
+ int order_a = find_unit_order (a);
+ int order_b = find_unit_order (b);
+
+ return (order_a > order_b ? 1
+ : order_a < order_b ? -1
+ : strnumcmp (a, b, decimal_point , thousands_sep));
+}
+
static int
general_numcompare (const char *sa, const char *sb)
{
@@ -1917,13 +2005,14 @@ keycompare (const struct line *a, const struct line *b)
if (key->random)
diff = compare_random (texta, lena, textb, lenb);
- else if (key->numeric | key->general_numeric)
+ else if (key->numeric | key->general_numeric | key->human_numeric)
{
char savea = *lima, saveb = *limb;
*lima = *limb = '\0';
- diff = ((key->numeric ? numcompare : general_numcompare)
- (texta, textb));
+ diff = ((key->numeric ? numcompare
+ : key->general_numeric ? general_numcompare
+ : human_numcompare) (texta, textb));
*lima = savea, *limb = saveb;
}
else if (key->version)
@@ -2889,7 +2978,7 @@ check_ordering_compatibility (void)
for (key = keylist; key; key = key->next)
if ((1 < (key->random + key->numeric + key->general_numeric + key->month
- + key->version + !!key->ignore))
+ + key->version + !!key->ignore + key->human_numeric))
|| (key->random && key->translate))
{
/* The following is too big, but guaranteed to be "big enough". */
@@ -2901,6 +2990,8 @@ check_ordering_compatibility (void)
*p++ = 'f';
if (key->general_numeric)
*p++ = 'g';
+ if (key->human_numeric)
+ *p++ = 'h';
if (key->ignore == nonprinting)
*p++ = 'i';
if (key->month)
@@ -2992,6 +3083,9 @@ set_ordering (const char *s, struct keyfield *key, enum
blanktype blanktype)
case 'g':
key->general_numeric = true;
break;
+ case 'h':
+ key->human_numeric = true;
+ break;
case 'i':
/* Option order should not matter, so don't let -i override
-d. -d implies -i, but -i does not imply -d. */
@@ -3140,7 +3234,8 @@ main (int argc, char **argv)
gkey.sword = gkey.eword = SIZE_MAX;
gkey.ignore = NULL;
gkey.translate = NULL;
- gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false;
+ gkey.numeric = gkey.general_numeric = gkey.human_numeric = false;
+ gkey.random = gkey.version = false;
gkey.month = gkey.reverse = false;
gkey.skipsblanks = gkey.skipeblanks = false;
@@ -3219,6 +3314,7 @@ main (int argc, char **argv)
case 'd':
case 'f':
case 'g':
+ case 'h':
case 'i':
case 'M':
case 'n':
@@ -3471,6 +3567,7 @@ main (int argc, char **argv)
| key->numeric
| key->version
| key->general_numeric
+ | key->human_numeric
| key->random)))
{
key->ignore = gkey.ignore;
@@ -3480,6 +3577,7 @@ main (int argc, char **argv)
key->month = gkey.month;
key->numeric = gkey.numeric;
key->general_numeric = gkey.general_numeric;
+ key->human_numeric = gkey.human_numeric;
key->random = gkey.random;
key->reverse = gkey.reverse;
key->version = gkey.version;
@@ -3495,6 +3593,7 @@ main (int argc, char **argv)
| gkey.month
| gkey.numeric
| gkey.general_numeric
+ | gkey.human_numeric
| gkey.random
| gkey.version)))
{
diff --git a/tests/misc/sort b/tests/misc/sort
index c5a2d74..76b271d 100755
--- a/tests/misc/sort
+++ b/tests/misc/sort
@@ -54,6 +54,19 @@ my @Tests =
["n11a", '-s -n -k1,1', {IN=>".01a\n.010\n"}, {OUT=>".01a\n.010\n"}],
["n11b", '-s -n -k1,1', {IN=>".010\n.01a\n"}, {OUT=>".010\n.01a\n"}],
+#human readable suffixes
+["h1", '-h', {IN=>"Y\nZ\nE\nP\nT\nG\nM\nK\n"},
+ {OUT=>"K\nM\nG\nT\nP\nE\nZ\nY\n"}],
+["h2", '-h', {IN=>"1M\n-2G\n-3K"}, {OUT=>"-2G\n-3K\n1M\n"}],
+["h3", '-h', {IN=>"1Mi\n1M\n"}, {OUT=>""}, {EXIT=>2},
+ {ERR=>"$prog: both SI and IEC prefixes present on units\n"}],
+# decimal at end => ignore suffix
+["h4", '-h', {IN=>"1.E\n2.M\n"}, {OUT=>"1.E\n2.M\n"}],
+# double decimal => ignore suffix
+["h5", '-h', {IN=>"1..2E\n2..2M\n"}, {OUT=>"1..2E\n2..2M\n"}],
+# illustrate misordering of ambiguous abbreviations
+["h6", '-h', {IN=>"1GiB\n1030MiB\n"}, {OUT=>"1030MiB\n1GiB\n"}],
+
["01a", '', {IN=>"A\nB\nC\n"}, {OUT=>"A\nB\nC\n"}],
#
["02a", '-c', {IN=>"A\nB\nC\n"}, {OUT=>''}],
--
1.5.3.6
- Human readable sort, Michael Speer, 2009/04/24
- Re: Human readable sort, Pádraig Brady, 2009/04/24
- Re: Human readable sort, Michael Speer, 2009/04/25
- Re: Human readable sort, Pádraig Brady, 2009/04/25
- Re: Human readable sort, Michael Speer, 2009/04/26
- Re: Human readable sort, Pádraig Brady, 2009/04/26
- Re: Human readable sort, Pádraig Brady, 2009/04/27
- Re: Human readable sort, Pádraig Brady, 2009/04/27
- Re: Human readable sort, Ondřej Vašík, 2009/04/27
- Re: Re: Human readable sort, knome . net, 2009/04/27
- Re: Human readable sort,
Pádraig Brady <=