bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

rm -rf: avoid ext3/4 O(N^2) performance hit, further librarify remove.c


From: Jim Meyering
Subject: rm -rf: avoid ext3/4 O(N^2) performance hit, further librarify remove.c
Date: Thu, 25 Sep 2008 23:41:56 +0200

Here are three patches:
The first relaxes "make distcheck" so it'll pass even after
the 2nd goes in (it introduces several c99 stmt-after-decl).

The 2nd patch does the same thing for rm that I did for fts:

  http://thread.gmane.org/gmane.comp.lib.gnulib.bugs/14808

i.e., this change makes rm process dir entry names in sorted
inode order, when that's sensible.

Adding the test to exercise/exhibit the fix was interesting.
I classified it as "expensive", so it doesn't run by default:
it creates and removes 400,000 files.  The removal takes about
10s on a reasonably modern system.  If it takes more than 1 minute,
that's deemed a failure.  Without the patch, the removal takes almost
6 *minutes* with a fast system.  If you want to run just
that one test, do this:

  make -C tests TESTS=rm/ext3-perf RUN_EXPENSIVE_TESTS=yes VERBOSE=yes

The 3rd patch fixes remove.c to be closer to library-ready,
by removing uses of xmalloc and by adjusting the existing obstack-
using code so that obstack allocation doesn't provoke an exit.

Comments welcome.

>From 5b90ccb2972b799a4b902b756f9c78b4c1cc520b Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Wed, 24 Sep 2008 15:02:19 +0200
Subject: [PATCH] maint: skip the -Wdeclaration-after-statement check

* cfg.mk (local-checks-to-skip): Add patch-check.
With the recent changes to remove.c, I no longer wish
to maintain the c99-to-c89 patch set.
---
 cfg.mk |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/cfg.mk b/cfg.mk
index df172a6..6924e0e 100644
--- a/cfg.mk
+++ b/cfg.mk
@@ -33,6 +33,8 @@ gpg_key_ID = B9AB9A16
 # at the top of the file for each `make distcheck' run.
 local-checks-to-skip = changelog-check strftime-check

+local-checks-to-skip += patch-check
+
 # The local directory containing the checked-out copy of gnulib used in this
 # release.  Used solely to get gnulib's SHA1 for the "announcement" target.
 gnulib_dir = /gnulib
--
1.6.0.2.307.gc427


>From 0cec96e137a53f59b5e88fabd12b9130f313b570 Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Mon, 22 Sep 2008 22:42:12 +0200
Subject: [PATCH] rm -r: avoid O(n^2) performance for a directory with very many 
entries

This enhancement works around a problem that is specific to at least
ext3 and ext4 file systems.  With them, it would take hours to remove
a two-million-entry directory.  RAM-backed file systems (tmpfs) are
not affected, since there is no seek penalty.
* remove.c (rm_malloc, rm_free, compare_ino): New functions.
(dirent_count, preprocess_dir): New function.
[struct readdir_data]: New struct.
(remove_cwd_entries): Call preprocess_dir.
* tests/rm/ext3-perf: New file.  Test for the performance fix.
* NEWS: mention the new feature
---
 NEWS               |    7 ++
 src/remove.c       |  162 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/Makefile.am  |    1 +
 tests/rm/ext3-perf |   76 ++++++++++++++++++++++++
 4 files changed, 246 insertions(+), 0 deletions(-)
 create mode 100755 tests/rm/ext3-perf

diff --git a/NEWS b/NEWS
index 2549d41..b5acbba 100644
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,13 @@ GNU coreutils NEWS                                    -*- 
outline -*-

 ** New features

+  chgrp, chmod, chown, chcon, du, rm: now all display linear performance,
+  even when operating on million-entry directories on ext3 and ext4 file
+  systems.  Before, they would exhibit O(N^2) performance, due to linear
+  per-entry seek time cost when operating on entries in readdir order.
+  Rm was improved directly, while the others inherit the improvement
+  from the newer version of fts in gnulib.
+
   comm now verifies that the inputs are in sorted order.  This check can
   be turned off with the --nocheck-order option.

diff --git a/src/remove.c b/src/remove.c
index 7c63dfe..d1fb11f 100644
--- a/src/remove.c
+++ b/src/remove.c
@@ -63,6 +63,14 @@ enum
     CONSECUTIVE_READDIR_UNLINK_THRESHOLD = 10
   };

+/* If the heuristics in preprocess_dir suggest that there
+   are fewer than this many entries in a directory, then it
+   skips the preprocessing altogether.  */
+enum
+{
+  INODE_SORT_DIR_ENTRIES_THRESHOLD = 10000
+};
+
 /* FIXME: in 2009, or whenever Darwin 7.9.0 (aka MacOS X 10.3.9) is no
    longer relevant, remove this work-around code.  Then, there will be
    no need to perform the extra rewinddir call, ever.  */
@@ -220,6 +228,24 @@ hash_compare_strings (void const *x, void const *y)
   return STREQ (x, y) ? true : false;
 }

+/* Obstack allocator: longjump on failure.  */
+static void *
+rm_malloc (void *v_jumpbuf, long size)
+{
+  jmp_buf *jumpbuf = v_jumpbuf;
+  void *p = malloc (size);
+  if (p)
+    return p;
+  longjmp (*jumpbuf, 1);
+}
+
+/* With the 2-arg allocator, we must also provide a two-argument freer.  */
+static void
+rm_free (void *v_jumpbuf ATTRIBUTE_UNUSED, void *ptr)
+{
+  free (ptr);
+}
+
 static inline void
 push_dir (Dirstack_state *ds, const char *dir_name)
 {
@@ -1209,6 +1235,138 @@ fd_to_subdirp (int fd_cwd, char const *f,
   return NULL;
 }

+struct readdir_data
+{
+  ino_t ino;
+  char name[FLEXIBLE_ARRAY_MEMBER];
+};
+
+/* A comparison function to sort on increasing inode number.  */
+static int
+compare_ino (void const *av, void const *bv)
+{
+  struct readdir_data const *const *a = av;
+  struct readdir_data const *const *b = bv;
+  return a[0]->ino - b[0]->ino;
+}
+
+/* Return an approximation of the maximum number of dirent entries
+   in a directory with stat data *ST.  */
+static size_t
+dirent_count (struct stat const *st)
+{
+  return st->st_size / 16;
+}
+
+/* When a directory contains very many entries, operating on N entries in
+   readdir order can be very seek-intensive (be it to unlink or even to
+   merely stat each entry), to the point that it results in O(N^2) work.
+   This is file-system-specific: ext3 and ext4 (as of 2008) are susceptible,
+   but tmpfs is not.  The general solution is to process entries in inode
+   order.  That means reading all entries, and sorting them before operating
+   on any.  As such, it is useful only on systems with useful dirent.d_ino.
+   Since 'rm -r's removal process must traverse into directories and since
+   this preprocessing phase can allocate O(N) storage, here we store and
+   sort only non-directory entries, and then remove all of those, so that we
+   can free all allocated storage before traversing into any subdirectory.
+   Perform this optimization only when not interactive and not in verbose
+   mode, to keep the implementation simple and to minimize duplication.
+   Upon failure, simply free any resources and return.  */
+static void
+preprocess_dir (DIR **dirp, struct rm_options const *x)
+{
+#if HAVE_STRUCT_DIRENT_D_TYPE
+
+  struct stat st;
+  /* There are many reasons to return right away, skipping this
+     optimization.  The penalty for being wrong is that we will
+     perform a small amount of extra work.
+
+     Skip this optimization if... */
+
+  if (/* - there is a chance of interactivity */
+      x->interactive != RMI_NEVER
+
+      /* - we're in verbose mode */
+      || x->verbose
+
+      /* - privileged users can unlink nonempty directories.
+        Otherwise, there'd be a race condition between the readdir
+        call (in which we learn dirent.d_type) and the unlink, by
+        which time the non-directory may be replaced with a directory. */
+      || ! cannot_unlink_dir ()
+
+      /* - we can't fstat the file descriptor */
+      || fstat (dirfd (*dirp), &st) != 0
+
+      /* - the directory is smaller than some threshold.
+        Estimate the number of inodes with a heuristic.
+         There's no significant benefit to sorting if there are
+        too few inodes.  This  */
+      || dirent_count (&st) < INODE_SORT_DIR_ENTRIES_THRESHOLD)
+    return;
+
+  /* FIXME: maybe test file system type, too; skip if it's tmpfs: see fts.c */
+
+  struct obstack o_readdir_data;  /* readdir data: inode,name pairs  */
+  struct obstack o_p;  /* an array of pointers to each inode,name pair */
+
+  /* Arrange to longjmp upon obstack allocation failure.  */
+  jmp_buf readdir_jumpbuf;
+  if (setjmp (readdir_jumpbuf))
+    goto cleanup;
+
+  obstack_specify_allocation_with_arg (&o_readdir_data, 0, 0,
+                                      rm_malloc, rm_free, &readdir_jumpbuf);
+  obstack_specify_allocation_with_arg (&o_p, 0, 0,
+                                      rm_malloc, rm_free, &readdir_jumpbuf);
+
+  /* Read all entries, storing <d_ino, d_name> for each non-dir one.
+     Maintain a parallel list of pointers into the primary buffer.  */
+  while (1)
+    {
+      struct dirent const *dp;
+      dp = readdir_ignoring_dot_and_dotdot (*dirp);
+      /* no need to distinguish EOF from failure */
+      if (dp == NULL)
+       break;
+
+      /* Skip known-directory and type-unknown entries.  */
+      if (D_TYPE (dp) == DT_UNKNOWN || D_TYPE (dp) == DT_DIR)
+       break;
+
+      size_t name_len = strlen (dp->d_name);
+      size_t ent_len = offsetof (struct readdir_data, name) + name_len + 1;
+      struct readdir_data *v = obstack_alloc (&o_readdir_data, ent_len);
+      v->ino = D_INO (dp);
+      memcpy (v->name, dp->d_name, name_len + 1);
+
+      /* Append V to the list of pointers.  */
+      obstack_ptr_grow (&o_p, v);
+    }
+
+  /* Compute size and finalize VV.  */
+  size_t n = obstack_object_size (&o_p) / sizeof (void *);
+  struct readdir_data **vv = obstack_finish (&o_p);
+
+  /* Sort on inode number.  */
+  qsort(vv, n, sizeof *vv, compare_ino);
+
+  /* Iterate through those pointers, unlinking each name.  */
+  size_t i;
+  for (i = 0; i < n; i++)
+    {
+      /* ignore failure */
+      (void) unlinkat (dirfd (*dirp), vv[i]->name, 0);
+    }
+
+ cleanup:
+  obstack_free (&o_readdir_data, NULL);
+  obstack_free (&o_p, NULL);
+  rewinddir (*dirp);
+#endif
+}
+
 /* Remove entries in the directory open on DIRP
    Upon finding a directory that is both non-empty and that can be chdir'd
    into, return RM_OK and set *SUBDIR and fill in SUBDIR_SB, where
@@ -1231,6 +1389,10 @@ remove_cwd_entries (DIR **dirp,
   assert (VALID_STATUS (status));
   *subdir = NULL;

+  /* This is just an optimization.
+     It's not a fatal problem if it fails.  */
+  preprocess_dir (dirp, x);
+
   while (1)
     {
       struct dirent const *dp;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index bc656c4..e0377cc 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -72,6 +72,7 @@ EXTRA_DIST += $(TESTS)
 TESTS =                                                \
   misc/help-version                            \
   misc/invalid-opt                             \
+  rm/ext3-perf                                 \
   rm/cycle                                     \
   chmod/no-x                                   \
   chgrp/basic                                  \
diff --git a/tests/rm/ext3-perf b/tests/rm/ext3-perf
new file mode 100755
index 0000000..e0439df
--- /dev/null
+++ b/tests/rm/ext3-perf
@@ -0,0 +1,76 @@
+#!/bin/sh
+# ensure that "rm -rf DIR-with-many-entries" is not O(N^2)
+
+# Copyright (C) 2008 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+if test "$VERBOSE" = yes; then
+  set -x
+  rm --version
+fi
+
+. $srcdir/test-lib.sh
+
+expensive_
+
+# Using rm -rf to remove a 400k-entry directory takes:
+# - 9 seconds with the patch, on a 2-yr-old system
+# - 350 seconds without the patch, on a high-end system (disk 20-30% faster)
+threshold_seconds=60
+
+# The number of entries in our test directory.
+n=400000
+
+# Choose a value that is large enough to ensure an accidentally
+# regressed rm would require much longer than $threshold_seconds to remove
+# the directory.  With n=400k, pre-patch GNU rm would require about 350
+# seconds even on a fast disk.  On a relatively modern system, the
+# patched version of rm requires about 10 seconds, so even if you
+# choose to enable very expensive tests with a disk that is much slower,
+# the test should still succeed.
+
+# Skip unless "." is on an ext[34] file system.
+# FIXME-maybe: try to find a suitable file system or allow
+# the user to specify it via an envvar.
+df -t ext3 -t ext4dev -t ext4 . \
+  || skip_test_ 'this test runs only on an ext3 or ext4 file system'
+
+# Skip if there are too few inodes free.  Require some slack.
+free_inodes=$(stat -f --format=%d .) || framework_failure
+min_free_inodes=$(expr 12 \* $n / 10)
+test $min_free_inodes -lt $free_inodes \
+  || skip_test_ "too few free inodes on '.': $free_inodes;" \
+      "this test requires at least $min_free_inodes"
+
+ok=0
+mkdir d &&
+  cd d &&
+    seq $n | xargs touch &&
+    test -f 1 &&
+    test -f $n &&
+  cd .. &&
+  ok=1
+test $ok = 1 || framework_failure
+
+fail=0
+start=$(date +%s)
+rm -rf d || fail=1
+duration=$(expr $(date +%s) - $start)
+test $duration -lt $threshold_seconds ||
+  { fail=1; echo rm took longer than $threshold_seconds seconds; }
+
+echo removing a $n-entry directory took $duration seconds
+
+Exit $fail
--
1.6.0.2.307.gc427


>From 5aada0b15c1d5aec51b6f78772fa7f3fcdcbcde7 Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Wed, 24 Sep 2008 10:27:35 +0200
Subject: [PATCH] remove.c: don't use xmalloc; don't let obstack call exit on 
failure

(obstack_chunk_alloc, obstack_chunk_free): Don't define.
(top_dir): Param is no longer "const".
Use malloc, not xmalloc, and call longjmp upon failed malloc.
(ds_init): Don't use xmalloc.  Instead, use caller-supplied buffer.
Use obstack_specify_allocation_with_arg, not obstack_init, so
that we control what happens upon allocation failure.
Arrange for ds_free not to free uninitialized if/when
any obstack_specify_allocation_with_arg allocation fails.
(ds_free): Don't free DS, now that it's no longer malloc'd.
(rm): Allocate DS on the stack.
Arrange to handle ds_init allocation failure.
---
 src/remove.c |   50 ++++++++++++++++++++++++++++++++++----------------
 1 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/src/remove.c b/src/remove.c
index d1fb11f..a6f0b4e 100644
--- a/src/remove.c
+++ b/src/remove.c
@@ -45,9 +45,6 @@
 #define dir_name rm_dir_name
 #define dir_len rm_dir_len

-#define obstack_chunk_alloc malloc
-#define obstack_chunk_free free
-
 /* This is the maximum number of consecutive readdir/unlink calls that
    can be made (with no intervening rewinddir or closedir/opendir) before
    triggering a bug that makes readdir return NULL even though some
@@ -271,13 +268,15 @@ push_dir (Dirstack_state *ds, const char *dir_name)
 /* Return the entry name of the directory on the top of the stack
    in malloc'd storage.  */
 static inline char *
-top_dir (Dirstack_state const *ds)
+top_dir (Dirstack_state *ds)
 {
   size_t n_lengths = obstack_object_size (&ds->len_stack) / sizeof (size_t);
   size_t *length = obstack_base (&ds->len_stack);
   size_t top_len = length[n_lengths - 1];
   char const *p = obstack_next_free (&ds->dir_stack) - top_len;
-  char *q = xmalloc (top_len);
+  char *q = malloc (top_len);
+  if (q == NULL)
+    longjmp (ds->current_arg_jumpbuf, 1);
   memcpy (q, p, top_len - 1);
   q[top_len - 1] = 0;
   return q;
@@ -466,14 +465,24 @@ AD_stack_clear (Dirstack_state *ds)
     }
 }

-static Dirstack_state *
-ds_init (void)
+static void
+ds_init (Dirstack_state *ds)
 {
-  Dirstack_state *ds = xmalloc (sizeof *ds);
-  obstack_init (&ds->dir_stack);
-  obstack_init (&ds->len_stack);
-  obstack_init (&ds->Active_dir);
-  return ds;
+  struct obstack *o[3];
+  o[0] = &ds->dir_stack;
+  o[1] = &ds->len_stack;
+  o[2] = &ds->Active_dir;
+  unsigned int i;
+
+  /* Ensure each of these is NULL, in case init/allocation
+     fails and we end up calling ds_free on all three while only
+     one or two has been initialized.  */
+  for (i = 0; i < 3; i++)
+    o[i]->chunk = NULL;
+
+  for (i = 0; i < 3; i++)
+    obstack_specify_allocation_with_arg
+      (o[i], 0, 0, rm_malloc, rm_free, &ds->current_arg_jumpbuf);
 }

 static void
@@ -492,7 +501,6 @@ ds_free (Dirstack_state *ds)
   obstack_free (&ds->dir_stack, NULL);
   obstack_free (&ds->len_stack, NULL);
   obstack_free (&ds->Active_dir, NULL);
-  free (ds);
 }

 /* Pop the active directory (AD) stack and prepare to move `up' one level,
@@ -1756,10 +1764,19 @@ extern enum RM_status
 rm (size_t n_files, char const *const *file, struct rm_options const *x)
 {
   enum RM_status status = RM_OK;
-  Dirstack_state *ds = ds_init ();
+  Dirstack_state ds;
   int cwd_errno = 0;
   size_t i;

+  /* Arrange for obstack allocation failure to longjmp.  */
+  if (setjmp (ds.current_arg_jumpbuf))
+    {
+      status = RM_ERROR;
+      goto cleanup;
+    }
+
+  ds_init (&ds);
+
   for (i = 0; i < n_files; i++)
     {
       if (cwd_errno && IS_RELATIVE_FILE_NAME (file[i]))
@@ -1769,7 +1786,7 @@ rm (size_t n_files, char const *const *file, struct 
rm_options const *x)
        }
       else
        {
-         enum RM_status s = rm_1 (ds, file[i], x, &cwd_errno);
+         enum RM_status s = rm_1 (&ds, file[i], x, &cwd_errno);
          assert (VALID_STATUS (s));
          UPDATE_STATUS (status, s);
        }
@@ -1782,7 +1799,8 @@ rm (size_t n_files, char const *const *file, struct 
rm_options const *x)
       status = RM_ERROR;
     }

-  ds_free (ds);
+ cleanup:;
+  ds_free (&ds);

   return status;
 }
--
1.6.0.2.307.gc427




reply via email to

[Prev in Thread] Current Thread [Next in Thread]