guix-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

09/17: deduplication: new module.


From: Caleb Ristvedt
Subject: 09/17: deduplication: new module.
Date: Tue, 29 Aug 2017 02:07:48 -0400 (EDT)

reepca pushed a commit to branch guile-daemon
in repository guix.

commit 9162ed54dc9d277d77a5ee7499ae2776bfac1fd9
Author: Caleb Ristvedt <address@hidden>
Date:   Sun Jun 11 17:43:34 2017 -0500

    deduplication: new module.
    
    * guix/store/deduplication.scm: new file
      (counting-wrapper-port, nar-sha256, tempname-in, get-temp-link,
      replace-with-link, deduplicate, reset-timestamps): new procedures.
      (deduplicate): Handle most common errors.
      (ignore-system-errors): new macro.
    * guix/store.scm (counting-wrapper-port, nar-sha256, get-temp-link,
      replace-with-link, deduplicate): removed.
      (register-path): new optimize parameter.
    
    * gnu/build/install.scm: Use reset-timestamps from (guix store 
deduplication).
      (reset-timestamps): removed.
    * gnu/build/vm.scm: Same.
    
    * guix/store/database.scm: added missing copyright header line.
---
 gnu/build/install.scm        |  16 +---
 gnu/build/vm.scm             |   1 +
 guix/store.scm               |  88 +++------------------
 guix/store/database.scm      |   1 +
 guix/store/deduplication.scm | 179 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 192 insertions(+), 93 deletions(-)

diff --git a/gnu/build/install.scm b/gnu/build/install.scm
index 9e30c0d..77123fa 100644
--- a/gnu/build/install.scm
+++ b/gnu/build/install.scm
@@ -20,12 +20,12 @@
 (define-module (gnu build install)
   #:use-module (guix build utils)
   #:use-module (guix build store-copy)
+  #:use-module (guix store deduplication)
   #:use-module (srfi srfi-26)
   #:use-module (ice-9 match)
   #:export (install-boot-config
             evaluate-populate-directive
             populate-root-file-system
-            reset-timestamps
             register-closure
             populate-single-profile-directory))
 
@@ -144,20 +144,6 @@ includes /etc, /var, /run, /bin/sh, etc., and all the 
symlinks to SYSTEM."
                 (try))
               (apply throw args)))))))
 
-(define (reset-timestamps directory)
-  "Reset the timestamps of all the files under DIRECTORY, so that they appear
-as created and modified at the Epoch."
-  (display "clearing file timestamps...\n")
-  (for-each (lambda (file)
-              (let ((s (lstat file)))
-                ;; XXX: Guile uses libc's 'utime' function (not 'futime'), so
-                ;; the timestamp of symlinks cannot be changed, and there are
-                ;; symlinks here pointing to /gnu/store, which is the host,
-                ;; read-only store.
-                (unless (eq? (stat:type s) 'symlink)
-                  (utime file 0 0 0 0))))
-            (find-files directory #:directories? #t)))
-
 (define* (register-closure store closure
                            #:key (deduplicate? #t))
   "Register CLOSURE in STORE, where STORE is the directory name of the target
diff --git a/gnu/build/vm.scm b/gnu/build/vm.scm
index 727494a..9aa34d3 100644
--- a/gnu/build/vm.scm
+++ b/gnu/build/vm.scm
@@ -35,6 +35,7 @@
   #:use-module (srfi srfi-1)
   #:use-module (srfi srfi-9)
   #:use-module (srfi srfi-26)
+  #:use-module (guix store deduplication)
   #:export (qemu-command
             load-in-linux-vm
             format-partition
diff --git a/guix/store.scm b/guix/store.scm
index 62b503b..fcdf192 100644
--- a/guix/store.scm
+++ b/guix/store.scm
@@ -44,7 +44,7 @@
   #:use-module (ice-9 popen)
   #:use-module (web uri)
   #:use-module (guix store database)
-  #:use-module (gnu build install)
+  #:use-module (guix store deduplication)
   #:export (%daemon-socket-uri
             %gc-roots-directory
             %default-substitute-urls
@@ -1305,92 +1305,23 @@ This makes sense only when the daemon was started with 
'--cache-failures'."
   boolean)
 
 
-;; Would it be better to just make WRITE-FILE give size as well? I question
-;; the general utility of this approach.
-(define (counting-wrapper-port output-port)
-  "Some custom ports don't implement GET-POSITION at all. But if we want to
-figure out how many bytes are being written, we will want to use that. So this
-makes a wrapper around a port which implements GET-POSITION."
-  (let ((byte-count 0))
-    (make-custom-binary-output-port "counting-wrapper"
-                                    (lambda (bytes offset count)
-                                      (set! byte-count
-                                        (+ byte-count count))
-                                      (put-bytevector output-port bytes
-                                                      offset count)
-                                      count)
-                                    (lambda ()
-                                      byte-count)
-                                    #f
-                                    (lambda ()
-                                      (close-port output-port)))))
-
-
-(define (nar-sha256 file)
-  "Gives the sha256 hash of a file and the size of the file in nar form."
-  (let-values (((port get-hash) (open-sha256-port)))
-    (let ((wrapper (counting-wrapper-port port)))
-      (write-file file wrapper)
-      (force-output wrapper)
-      (force-output port)
-      (let ((hash (get-hash))
-            (size (port-position wrapper)))
-        (close-port wrapper)
-        (values hash
-                size)))))
-
-(define (get-temp-link target)
-  "Like mkstemp!, but instead of creating a new file and giving you the name,
-it creates a new hardlink to TARGET and gives you the name."
-  (let try-again ((tempname (tmpnam)))
-    (catch
-      #t
-      (lambda ()
-        (link target tempname)
-        tempname)
-      (lambda ()
-        (try-again (tmpnam))))))
-
-(define (replace-with-link target to-replace)
-  "Replaces the file TO-REPLACE with a hardlink to TARGET"
-  ;; According to the C++ code, this is how you replace it with a link
-  ;; "atomically".
-  (let ((temp-link (get-temp-link target)))
-    (delete-file to-replace)
-    (rename-file temp-link to-replace)))
-
-;; TODO: handling in case the .links directory doesn't exist? For now I'll
-;; just assume it's the responsibility of whoever makes the store to create
-;; it.
-(define (deduplicate path store hash)
-  "Checks if a store item with hash HASH already exists. If so, replaces PATH
-with a hardlink to the already-existing one. If not, it registers PATH so that
-future duplicates can hardlink to it."
-  (let ((links-path (string-append store
-                                   "/.links/"
-                                   (bytevector->base16-string hash))))
-    (if (file-exists? links-path)
-        (replace-with-link links-path path)
-        (link path links-path))))
+
+
 
 ;; TODO: Handle databases not existing yet (what should the default behavior
 ;; be? The C++ version checks for a number in the file "schema" in the
 ;; database directory and compares it to a constant, and uses that to decide
 ;; whether to "upgrade" or initialize the database).
 
-(define* (register-path path
-                        #:key (references '()) deriver prefix state-directory)
-  ;; Priority for options: first what is given, then environment variables,
-  ;; then defaults. %state-directory, %store-directory, and
-  ;; %store-database-directory already handle the "environment variables /
-  ;; defaults" question, so we only need to choose between what is given and
-  ;; those.
+(define* (register-path path #:key (references '()) deriver
+                        prefix state-directory (optimize #t))
   "Register PATH as a valid store file, with REFERENCES as its list of
 references, and DERIVER as its deriver (.drv that led to it.)  If PREFIX is
 given, it must be the name of the directory containing the new store to
 initialize; if STATE-DIRECTORY is given, it must be a string containing the
-absolute file name to the state directory of the store being initialized.
-Return #t on success.
+absolute file name to the state directory of the store being
+initialized. Deduplication (saves space, sometimes complicates things) can be
+controlled with OPTIMIZE.  Return #t on success.
 
 Use with care as it directly modifies the store!  This is primarily meant to
 be used internally by the daemon's build hook."
@@ -1437,7 +1368,8 @@ be used internally by the daemon's build hook."
            (%make-void-port "w")
          (lambda ()
            (reset-timestamps real-path)))
-       (deduplicate real-path store-dir hash)
+       (when optimize
+         (deduplicate real-path hash store-dir))
        ;; If we've made it this far without an exception, I guess we've
        ;; probably succeeded?
        #t))))
diff --git a/guix/store/database.scm b/guix/store/database.scm
index ecf7ba4..8e04d5b 100644
--- a/guix/store/database.scm
+++ b/guix/store/database.scm
@@ -1,3 +1,4 @@
+;;; GNU Guix --- Functional package management for GNU
 ;;; Copyright © 2017 Caleb Ristvedt <address@hidden>
 ;;;
 ;;; This file is part of GNU Guix.
diff --git a/guix/store/deduplication.scm b/guix/store/deduplication.scm
new file mode 100644
index 0000000..c4a38cc
--- /dev/null
+++ b/guix/store/deduplication.scm
@@ -0,0 +1,179 @@
+;;; GNU Guix --- Functional package management for GNU
+;;; Copyright © 2017 Caleb Ristvedt <address@hidden>
+;;;
+;;; This file is part of GNU Guix.
+;;;
+;;; GNU Guix is free software; you can redistribute it and/or modify it
+;;; under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation; either version 3 of the License, or (at
+;;; your option) any later version.
+;;;
+;;; GNU Guix is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with GNU Guix.  If not, see <http://www.gnu.org/licenses/>.
+
+;;; This houses stuff we do to files when they arrive at the store - resetting
+;;; timestamps, deduplicating, etc.
+
+(define-module (guix store deduplication)
+  #:use-module (guix hash)
+  #:use-module (guix build utils)
+  #:use-module (guix base16)
+  #:use-module (srfi srfi-11)
+  #:use-module (rnrs io ports)
+  #:use-module (ice-9 ftw)
+  #:use-module (guix serialization)
+  #:export (nar-sha256
+            deduplicate
+            reset-timestamps))
+
+;; Would it be better to just make WRITE-FILE give size as well? I question
+;; the general utility of this approach.
+(define (counting-wrapper-port output-port)
+  "Some custom ports don't implement GET-POSITION at all. But if we want to
+figure out how many bytes are being written, we will want to use that. So this
+makes a wrapper around a port which implements GET-POSITION."
+  (let ((byte-count 0))
+    (make-custom-binary-output-port "counting-wrapper"
+                                    (lambda (bytes offset count)
+                                      (set! byte-count
+                                        (+ byte-count count))
+                                      (put-bytevector output-port bytes
+                                                      offset count)
+                                      count)
+                                    (lambda ()
+                                      byte-count)
+                                    #f
+                                    (lambda ()
+                                      (close-port output-port)))))
+
+
+(define (nar-sha256 file)
+  "Gives the sha256 hash of a file and the size of the file in nar form."
+  (let-values (((port get-hash) (open-sha256-port)))
+    (let ((wrapper (counting-wrapper-port port)))
+      (write-file file wrapper)
+      (force-output wrapper)
+      (force-output port)
+      (let ((hash (get-hash))
+            (size (port-position wrapper)))
+        (close-port wrapper)
+        (values hash
+                size)))))
+
+
+; Taken from stdlib.h - this should be the same range as the C++ deduplication
+; stuff this way.
+(define rand-max 2147483647)
+
+(define (tempname-in directory)
+  "Gives an unused temporary name under DIRECTORY. Not guaranteed to still be
+unused by the time you create anything with that name, but a good shot."
+  (let ((const-part (string-append directory
+                                   "/.tmp-link-"
+                                   (number->string (getpid)))))
+    (let try-again ((guess-part (number->string (random rand-max))))
+      (if (file-exists? (string-append const-part "-" guess-part))
+          (try-again (number->string (random rand-max)))
+          (string-append const-part "-" guess-part)))))
+
+(define* (get-temp-link target #:optional (link-prefix (dirname target)))
+  "Like mkstemp!, but instead of creating a new file and giving you the name,
+it creates a new hardlink to TARGET and gives you the name. Since
+cross-filesystem hardlinks don't work, the temp link must be created on the
+same filesystem - where in that filesystem it is can be controlled by
+LINK-PREFIX."
+  (let try-again ((tempname (tempname-in link-prefix)))
+    (catch
+      'system-error
+      (lambda ()
+        (link target tempname)
+        tempname)
+      (lambda (args)
+        (if (= (system-error-errno args) EEXIST)
+            (try-again (tempname-in link-prefix))
+            (throw 'system-error args))))))
+
+;; There are 3 main kinds of errors we can get from hardlinking: "Too many
+;; things link to this" (EMLINK), "this link already exists" (EEXIST), and
+;; "can't fit more stuff in this directory" (ENOSPC). 
+
+(define (replace-with-link target to-replace)
+  "Replace the file TO-REPLACE with a link to TARGET. Note: assumes that
+TARGET and TO-REPLACE are on the same filesystem. If they aren't, bad things
+will happen!"
+  (let ((temp-link (get-temp-link target (dirname to-replace))))
+    (rename-file temp-link to-replace)))
+
+(define-syntax ignore-system-errors
+  (syntax-rules ()
+    "Given a list of system error codes to ignore, evaluates EXPS and returns
+#f if any of the system error codes in the given list are thrown."
+    ((ignore-system-errors (errors ...) exps ...)
+     (catch 'system-error
+       (lambda ()
+         exps ...)
+       (lambda (args)
+         (case (system-error-errno args)
+           ((errors ...) #f)
+           (else (throw 'system-error args))))))))
+
+;; Under what conditions would PATH be on a separate filesystem from the
+;; .links directory? Any instance of that as far as I can tell would be a
+;; misuse of DEDUPLICATE, such as specifying a PATH that isn't in STORE. 
+(define* (deduplicate path hash #:optional (store %store-directory))
+  "Checks if a store item with hash HASH already exists. If so, replaces PATH
+with a hardlink to the already-existing one. If not, it registers PATH so that
+future duplicates can hardlink to it. If PATH isn't under the default
+%store-directory, the directory it is under must be given as STORE. "
+  (let* ((links-directory (string-append store
+                                         "/.links"))
+         (link-file (string-append links-directory "/"
+                                   (bytevector->base16-string hash))))
+    (mkdir-p links-directory)
+    (if (file-is-directory? path)
+        ;; Can't hardlink directories, gotta hardlink their atoms.
+        (for-each (lambda (file)
+                    (deduplicate file (nar-sha256 file) store))
+                  (scandir path))
+        (if (file-exists? link-file)
+            (ignore-system-errors (EMLINK)
+                                  (replace-with-link path link-file))
+            (catch 'system-error
+              (lambda ()
+                (link path link-file))
+              (lambda (args)
+                (case (system-error-errno args)
+                  ((EEXIST)
+                   ;; Someone else put an entry for PATH in links-directory
+                   ;; before we could! Let's use it!
+                   (ignore-system-errors (EMLINK)
+                                         (replace-with-link path link-file)))
+                  ;; It's fine if there's not enough room in the directory
+                  ;; index or whatever for more entries in .links, we just
+                  ;; need to stop in that case.
+                  ((ENOSPC) #f)
+                  ;; EMLINK is an error - we got here because initially there
+                  ;; wasn't an entry in .links, and suddenly there are so many
+                  ;; things linked to the original file that we can't make 
another
+                  ;; one? Sounds like an error! Anything we haven't 
anticipated,
+                  ;; too.
+                  (else (throw 'system-error args)))))))))
+
+(define (reset-timestamps directory)
+  "Reset the timestamps of all the files under DIRECTORY, so that they appear
+as created and modified at the Epoch."
+  (display "clearing file timestamps...\n")
+  (for-each (lambda (file)
+              (let ((s (lstat file)))
+                ;; XXX: Guile uses libc's 'utime' function (not 'futime'), so
+                ;; the timestamp of symlinks cannot be changed, and there are
+                ;; symlinks here pointing to /gnu/store, which is the host,
+                ;; read-only store.
+                (unless (eq? (stat:type s) 'symlink)
+                  (utime file 0 0 0 0))))
+            (find-files directory #:directories? #t)))



reply via email to

[Prev in Thread] Current Thread [Next in Thread]