From 70b1e98b5bbf119e9b14b63780c4d02769cdb8ea Mon Sep 17 00:00:00 2001 From: Bo Borgerson Date: Mon, 31 Mar 2008 16:58:21 -0400 Subject: [PATCH] sort: added --merge-batch-size=NMERGE option. * src/sort.c: Replace constant NMERGE with static int nmerge. Validate and apply nmerge command-line settings. Replace some (now variable-length) arrays with pointers to xnmalloc'd storage. * tests/misc/sort-merge: Test new option * doc/coreutils.texi: Describe new option * NEWS: Advertise new option Signed-off-by: Bo Borgerson --- NEWS | 4 ++ doc/coreutils.texi | 16 +++++++++ src/sort.c | 85 +++++++++++++++++++++++++++++++++++++++---------- tests/misc/sort-merge | 28 +++++++++++++++- 4 files changed, 114 insertions(+), 19 deletions(-) diff --git a/NEWS b/NEWS index c05e0ad..92bdea1 100644 --- a/NEWS +++ b/NEWS @@ -50,6 +50,10 @@ GNU coreutils NEWS -*- outline -*- options --general-numeric-sort/-g, --month-sort/-M, --numeric-sort/-n and --random-sort/-R, resp. + sort accepts a new option --merge-batch-size=NMERGE, where NMERGE + represents the maximum number of inputs that will be merged at once. + When more than NMERGE inputs are present temporary files are used. + ** Improvements id and groups work around an AFS-related bug whereby those programs diff --git a/doc/coreutils.texi b/doc/coreutils.texi index f161c4d..7ccbb8a 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3689,6 +3689,22 @@ multiple fields. Example: To sort on the second field, use @option{--key=2,2} (@option{-k 2,2}). See below for more examples. address@hidden address@hidden address@hidden --merge-batch-size address@hidden number of inputs to merge, nmerge +Merge at most @var{nmerge} inputs at once. + +If more than @var{nmerge} inputs are to be merged, then temporary files +will be used. + +A large value of @var{nmerge} may improve merge performance and decrease +temporary storage utilization at the expense of increased memory usage +an I/0. Conversely a small value of @var{nmerge} may reduce memory +requirements and I/0 at the expense of temporary storage consumption and +merge performance. + +The value of @var{nmerge} must be at least 2. + @item -o @var{output-file} @itemx address@hidden @opindex -o diff --git a/src/sort.c b/src/sort.c index 8b2eec5..3d4f5a6 100644 --- a/src/sort.c +++ b/src/sort.c @@ -223,13 +223,13 @@ static struct month monthtab[] = }; /* During the merge phase, the number of files to merge at once. */ -#define NMERGE 16 +#define NMERGE_DEFAULT 16 /* Minimum size for a merge or check buffer. */ #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line)) /* Minimum sort size; the code might not work with smaller sizes. */ -#define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE) +#define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE) /* The number of bytes needed for a merge or check buffer, which can function relatively efficiently even if it holds only one line. If @@ -281,6 +281,10 @@ static struct keyfield *keylist; /* Program used to (de)compress temp files. Must accept -d. */ static char const *compress_program; +/* Maximum number of files to merge in one go. If more than this + number are present, temp files will be used. */ +static int nmerge = NMERGE_DEFAULT; + static void sortlines_temp (struct line *, size_t, struct line *); /* Report MESSAGE for FILE, then clean up and exit. @@ -344,6 +348,8 @@ Other options:\n\ decompress them with PROG -d\n\ -k, --key=POS1[,POS2] start a key at POS1, end it at POS2 (origin 1)\n\ -m, --merge merge already sorted files; do not sort\n\ + --merge-batch-size=NMERGE merge at most NMERGE inputs at once;\n\ + for more use temp files\n\ "), stdout); fputs (_("\ -o, --output=FILE write result to FILE instead of standard output\n\ @@ -395,7 +401,8 @@ enum CHECK_OPTION = CHAR_MAX + 1, COMPRESS_PROGRAM_OPTION, RANDOM_SOURCE_OPTION, - SORT_OPTION + SORT_OPTION, + NMERGE_OPTION }; static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uy:z"; @@ -419,6 +426,7 @@ static struct option const long_options[] = {"output", required_argument, NULL, 'o'}, {"reverse", no_argument, NULL, 'r'}, {"stable", no_argument, NULL, 's'}, + {"merge-batch-size", required_argument, NULL, NMERGE_OPTION}, {"buffer-size", required_argument, NULL, 'S'}, {"field-separator", required_argument, NULL, 't'}, {"temporary-directory", required_argument, NULL, 'T'}, @@ -1045,6 +1053,35 @@ inittables (void) #endif } +/* Specify how many inputs may be merged at once. */ +static void +specify_nmerge (int oi, char c, char const *s) +{ + uintmax_t n; + enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL); + + if (e == LONGINT_OK) + { + nmerge = n; + if (nmerge == n) + { + if (nmerge >= 2) + { + /* Need to re-check that we meet the minimum + requirement for memory usage with the new, + potentially larger, nmerge */ + sort_size = MAX (sort_size, MIN_SORT_SIZE); + return; + } + e = LONGINT_INVALID; + } + else + e = LONGINT_OVERFLOW; + } + + xstrtol_fatal (e, oi, c, long_options, s); +} + /* Specify the amount of main memory to use when sorting. */ static void specify_sort_size (int oi, char c, char const *s) @@ -2029,15 +2066,20 @@ static void mergefps (struct sortfile *files, size_t ntemps, size_t nfiles, FILE *ofp, char const *output_file) { - FILE *fps[NMERGE]; /* Input streams for each file. */ - struct buffer buffer[NMERGE]; /* Input buffers for each file. */ + FILE **fps = xnmalloc(nmerge, sizeof *fps); + /* Input streams for each file. */ + struct buffer *buffer = xnmalloc(nmerge, sizeof *buffer); + /* Input buffers for each file. */ struct line saved; /* Saved line storage for unique check. */ struct line const *savedline = NULL; /* &saved if there is a saved line. */ size_t savealloc = 0; /* Size allocated for the saved line. */ - struct line const *cur[NMERGE]; /* Current line in each line table. */ - struct line const *base[NMERGE]; /* Base of each line table. */ - size_t ord[NMERGE]; /* Table representing a permutation of fps, + struct line const **cur = xnmalloc(nmerge, sizeof *cur); + /* Current line in each line table. */ + struct line const **base = xnmalloc(nmerge, sizeof *base); + /* Base of each line table. */ + size_t *ord = xnmalloc(nmerge, sizeof ord); + /* Table representing a permutation of fps, such that cur[ord[0]] is the smallest line and will be next output. */ size_t i; @@ -2210,6 +2252,11 @@ mergefps (struct sortfile *files, size_t ntemps, size_t nfiles, } xfclose (ofp, output_file); + free(fps); + free(buffer); + free(ord); + free(base); + free(cur); } /* Merge into T the two sorted arrays of lines LO (with NLO members) @@ -2397,7 +2444,7 @@ static void merge (struct sortfile *files, size_t ntemps, size_t nfiles, char const *output_file) { - while (NMERGE < nfiles) + while (nmerge < nfiles) { /* Number of input files processed so far. */ size_t in; @@ -2405,33 +2452,33 @@ merge (struct sortfile *files, size_t ntemps, size_t nfiles, /* Number of output files generated so far. */ size_t out; - /* nfiles % NMERGE; this counts input files that are left over + /* nfiles % nmerge; this counts input files that are left over after all full-sized merges have been done. */ size_t remainder; /* Number of easily-available slots at the next loop iteration. */ size_t cheap_slots; - /* Do as many NMERGE-size merges as possible. */ - for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE) + /* Do as many nmerge-size merges as possible. */ + for (out = in = 0; out < nfiles / nmerge; out++, in += nmerge) { FILE *tfp; pid_t pid; char *temp = create_temp (&tfp, &pid); - size_t nt = MIN (ntemps, NMERGE); + size_t nt = MIN (ntemps, nmerge); ntemps -= nt; - mergefps (&files[in], nt, NMERGE, tfp, temp); + mergefps (&files[in], nt, nmerge, tfp, temp); files[out].name = temp; files[out].pid = pid; } remainder = nfiles - in; - cheap_slots = NMERGE - out % NMERGE; + cheap_slots = nmerge - out % nmerge; if (cheap_slots < remainder) { /* So many files remain that they can't all be put into the last - NMERGE-sized output window. Do one more merge. Merge as few + nmerge-sized output window. Do one more merge. Merge as few files as possible, to avoid needless I/O. */ size_t nshortmerge = remainder - cheap_slots + 1; FILE *tfp; @@ -2445,7 +2492,7 @@ merge (struct sortfile *files, size_t ntemps, size_t nfiles, in += nshortmerge; } - /* Put the remaining input files into the last NMERGE-sized output + /* Put the remaining input files into the last nmerge-sized output window, so they will be merged in the next pass. */ memmove(&files[out], &files[in], (nfiles - in) * sizeof *files); ntemps += out; @@ -3013,6 +3060,10 @@ main (int argc, char **argv) mergeonly = true; break; + case NMERGE_OPTION: + specify_nmerge (oi, c, optarg); + break; + case 'o': if (outfile && !STREQ (outfile, optarg)) error (SORT_FAILURE, 0, _("multiple output files specified")); diff --git a/tests/misc/sort-merge b/tests/misc/sort-merge index 7ac9f84..544d367 100755 --- a/tests/misc/sort-merge +++ b/tests/misc/sort-merge @@ -25,19 +25,43 @@ require 5.003; use strict; (my $program_name = $0) =~ s|.*/||; +my $prog = 'sort'; + # Turn off localisation of executable's ouput. @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; +my $mbs = '--merge-batch-size'; + +# three empty files and one that says 'foo' +my @inputs = (+(map{{IN=> {"empty$_"=> ''}}}1..3), {IN=> {foo=> "foo\n"}}); + +my $badtmp = 'does/not/exist'; +-d $badtmp and die "$badtmp exists"; + my @Tests = ( - ['m1', '-m', {IN=> {empty=> ''}}, {IN=> {f=> "foo\n"}}, {OUT=>"foo\n"}], + ['m1', '-m', @inputs, {OUT=>"foo\n"}], + + # check bounds on --merge-batch-size + ['m2', "-m $mbs=1", @inputs, + {ERR=>"$prog: invalid $mbs argument `1'\n"}, {EXIT=>2}], + ['m3', "-m $mbs=2147483649", @inputs, + {ERR=>"$prog: $mbs argument `2147483649' too large\n"}, {EXIT=>2}], + + # This should work since nmerge >= the number of input files + ['m4', "-m $mbs=4 -T$badtmp", @inputs, {OUT=>"foo\n"}], + + # this should fail since nmerge < # of input files, so + # temp files are needed + ['m5', "-m $mbs=2 -T$badtmp", @inputs, + {ERR_SUBST=>"s|: $badtmp/sort.+||"}, + {ERR=>"$prog: cannot create temporary file\n"}, {EXIT=>2}], ); my $save_temps = $ENV{DEBUG}; my $verbose = $ENV{VERBOSE}; -my $prog = 'sort'; my $fail = run_tests ($program_name, $prog, address@hidden, $save_temps, $verbose); exit $fail; EOF -- 1.5.2.5