02/03: UNTESTED: faster graft code cleanup.

guix-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
02/03: UNTESTED: faster graft code cleanup.

From:	Mark H. Weaver
Subject:	02/03: UNTESTED: faster graft code cleanup.
Date:	Mon, 28 Mar 2016 03:43:55 +0000
mhw pushed a commit to branch wip-graft-improvements
in repository guix.

commit e02dd7ef96a818bae63feb8d292103965f407a1b
Author: Mark H Weaver <address@hidden>
Date:   Mon Mar 14 16:18:06 2016 -0400

    UNTESTED: faster graft code cleanup.
---
 guix/build/graft.scm |  132 +++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 98 insertions(+), 34 deletions(-)

diff --git a/guix/build/graft.scm b/guix/build/graft.scm
index ec6f838..a5e88d6 100644
--- a/guix/build/graft.scm
+++ b/guix/build/graft.scm
@@ -21,6 +21,7 @@
   #:use-module (guix build utils)
   #:use-module (rnrs bytevectors)
   #:use-module (rnrs io ports)
+  #:use-module (ice-9 vlist)
   #:use-module (ice-9 match)
   #:use-module (ice-9 threads)
   #:use-module (ice-9 binary-ports)
@@ -42,7 +43,10 @@
 ;;;
 ;;; Code:
 
-(define hash-length 32)
+(define-syntax-rule (define-inline name val)
+  (define-syntax name (identifier-syntax val)))
+
+(define-inline hash-length 32)
 
 (define nix-base32-char?
   (cute char-set-contains?
@@ -50,12 +54,19 @@
         (string->char-set "0123456789abcdfghijklmnpqrsvwxyz")
         <>))
 
-(define* (replace-store-references input output lookup-replacement
+(define* (replace-store-references input output replacement-table
                                    #:optional (store (%store-directory)))
   "Read data from INPUT, replacing store references according to
-LOOKUP-REPLACEMENT, and writing the result to OUTPUT."
+REPLACEMENT-TABLE, and writing the result to OUTPUT.  REPLACEMENT-TABLE is a
+vhash that maps strings (original hashes) to bytevectors (replacement hashes).
+Note: We use string keys to work around the fact that guile-2.0 hashes all
+bytevectors to the same value."
 
-  (define request-size (expt 2 20))  ; 1 MB
+  (define (lookup-replacement s)
+    (match (vhash-assoc s replacement-table)
+      ((origin . replacement)
+       replacement)
+      (#f #f)))
 
   (define (optimize-u8-predicate pred)
     (cute vector-ref
@@ -69,17 +80,44 @@ LOOKUP-REPLACEMENT, and writing the result to OUTPUT."
 
   (define (dash? byte) (= byte 45))
 
+  (define request-size (expt 2 20))  ; 1 MiB
+
+  ;; We scan the file for the following 33-byte pattern: 32 bytes of
+  ;; nix-base32 characters followed by a dash.  To accommodate large files,
+  ;; we do not read the entire file, but instead work on buffers of up to
+  ;; 'request-size' bytes.  To ensure that every 33-byte sequence appears
+  ;; entirely within exactly one buffer, adjacent buffers must overlap,
+  ;; i.e. they must share 32 byte positions.  We accomplish this by
+  ;; "ungetting" the last 32 bytes of each buffer before reading the next
+  ;; buffer, unless we know that we've reached the end-of-file.
   (let ((buffer (make-bytevector request-size)))
     (let loop ()
-      ;; Note: work around <http://bugs.gnu.org/17466>.
+      ;; Note: We avoid 'get-bytevector-n' to work around
+      ;; <http://bugs.gnu.org/17466>.
       (match (get-bytevector-n! input buffer 0 request-size)
         ((? eof-object?) 'done)
         (end
-         ;; Scan the buffer for dashes preceded by a valid nix hash.
+         ;; We scan the buffer for dashes that might be preceded by a
+         ;; nix-base32 hash.  The key optimization here is that whenever we
+         ;; find a NON-nix-base32 character at position 'i', we know that it
+         ;; cannot be part of a hash, so the earliest position where the next
+         ;; hash could start is i+1 with the following dash at position i+33.
+         ;;
+         ;; Since nix-base32 characters comprise only 1/8 of the 256 possible
+         ;; byte values, and exclude some of the most common letters in
+         ;; English text (e t o u), in practice we can advance by 33 positions
+         ;; most of the time.
          (let scan-from ((i hash-length) (written 0))
+           ;; 'i' is the first position where we look for a dash.  'written'
+           ;; is the number of bytes in the buffer that have already been
+           ;; written.
            (if (< i end)
                (let ((byte (bytevector-u8-ref buffer i)))
                  (cond ((and (dash? byte)
+                             ;; We've found a dash.  Note that we do not know
+                             ;; whether the preceeding 32 bytes are nix-base32
+                             ;; characters, but we do not need to know.  If
+                             ;; they are not, the following lookup will fail.
                              (lookup-replacement
                               (string-tabulate (lambda (j)
                                                  (integer->char
@@ -87,14 +125,43 @@ LOOKUP-REPLACEMENT, and writing the result to OUTPUT."
                                                    (+ j (- i hash-length)))))
                                                hash-length)))
                         => (lambda (replacement)
+                             ;; We've found a hash that needs to be replaced.
+                             ;; First, write out all bytes preceding the hash
+                             ;; that have not yet been written.
                              (put-bytevector output buffer written
                                              (- i hash-length written))
+                             ;; Now write the replacement hash.
                              (put-bytevector output replacement)
+                             ;; Since the byte at position 'i' is a dash,
+                             ;; which is not a nix-base32 char, the earliest
+                             ;; position where the next hash might start is
+                             ;; i+1, and the earliest position where the
+                             ;; following dash might start is (+ i 1
+                             ;; hash-length).  Also, we have now written up to
+                             ;; position 'i' in the buffer.
                              (scan-from (+ i 1 hash-length) i)))
+                       ;; If the byte at position 'i' is a nix-base32 char,
+                       ;; then the dash we're looking for might be as early as
+                       ;; the following byte, so we can only advance by 1.
                        ((nix-base32-byte? byte)
                         (scan-from (+ i 1) written))
+                       ;; If the byte at position 'i' is NOT a nix-base32
+                       ;; char, then the earliest position where the next hash
+                       ;; might start is i+1, with the following dash at
+                       ;; position (+ i 1 hash-length).
                        (else
                         (scan-from (+ i 1 hash-length) written))))
+
+               ;; We have finished scanning the buffer.  Now we determine how
+               ;; many bytes have not yet been written, and how many bytes to
+               ;; "unget".  If 'end' is less than 'request-size' then we read
+               ;; less than we asked for, which indicates that we are at EOF,
+               ;; so we needn't unget anything.  Otherwise, we unget up to
+               ;; 'hash-length' bytes (32 bytes).  However, we must be careful
+               ;; not to unget bytes that have already been written, because
+               ;; that would cause them to be written again from the next
+               ;; buffer.  In practice, this case occurs when a replacement is
+               ;; made near the end of the buffer.
                (let* ((unwritten   (- end written))
                       (unget-size  (if (= end request-size)
                                        (min hash-length unwritten)
@@ -110,36 +177,33 @@ LOOKUP-REPLACEMENT, and writing the result to OUTPUT."
   "Copy DIRECTORY to OUTPUT, replacing strings according to MAPPING, a list of
 file name pairs."
 
-  (define lookup-replacement
+  (define hash-mapping
     (let* ((prefix (string-append store "/"))
            (start  (string-length prefix))
-           (end    (+ start hash-length))
-           (table  (make-hash-table)))
-      (define (valid-prefix? p) (string=? p prefix))
-      (define (valid-suffix? s) (string-prefix? "-" s))
+           (end    (+ start hash-length)))
       (define (valid-hash? h)
-        (and (= hash-length (string-length h))
-             (every nix-base32-char?
-                    (string->list h))))
-      (define (components s)
+        (every nix-base32-char? (string->list h)))
+      (define (valid-suffix? s)
+        (string-prefix? "-" s))
+      (define (hash+suffix s)
         (and (< end (string-length s))
-             (list (substring s 0 start)
-                   (substring s start end)
-                   (substring s end))))
-      (for-each (match-lambda
-                  (((= components ((? valid-prefix?)
-                                   (? valid-hash? origin-hash)
-                                   (? valid-suffix? suffix)))
-                    .
-                    (= components ((? valid-prefix?)
-                                   (? valid-hash? replacement-hash)
-                                   (? valid-suffix? suffix))))
-                   (hash-set! table origin-hash
-                              (string->utf8 replacement-hash)))
-                  ((origin . replacement)
-                   (error "invalid replacement" origin replacement)))
-                mapping)
-      (cut hash-ref table <>)))
+             (let ((hash   (substring s start end))
+                   (suffix (substring s end)))
+               (and (string-prefix? prefix s)
+                    (valid-hash?    hash)
+                    (valid-suffix?  suffix)
+                    (list hash suffix)))))
+      (map (match-lambda
+             (((= hash+suffix (origin-hash      suffix))
+               .
+               (= hash+suffix (replacement-hash suffix)))
+              (cons origin-hash (string->utf8 replacement-hash)))
+             ((origin . replacement)
+              (error "invalid replacement" origin replacement)))
+           mapping)))
+
+  (define replacement-table
+    (alist->vhash hash-mapping))
 
   (define prefix-len
     (string-length directory))
@@ -157,7 +221,7 @@ file name pairs."
            (symlink (call-with-output-string
                       (lambda (output)
                         (replace-store-references (open-input-string target)
-                                                  output lookup-replacement
+                                                  output replacement-table
                                                   store)))
                     dest)))
         ((regular)
@@ -165,7 +229,7 @@ file name pairs."
            (lambda (input)
              (call-with-output-file dest
                (lambda (output)
-                 (replace-store-references input output lookup-replacement
+                 (replace-store-references input output replacement-table
                                            store)
                  (chmod output (stat:perms stat)))))))
         (else
[Prev in Thread]
Current Thread
[Next in Thread]
branch wip-graft-improvements created (now bb6a001), Mark H. Weaver, 2016/03/27
- 03/03: PRELIMINARY: Attempt to eliminate duplicate grafts., Mark H. Weaver, 2016/03/27
- 01/03: PRELIMINARY: Faster grafting., Mark H. Weaver, 2016/03/27
- 02/03: UNTESTED: faster graft code cleanup., Mark H. Weaver <=
Prev by Date: 01/03: PRELIMINARY: Faster grafting.
Next by Date: branch master updated (36c2c8c -> e0b90db)
Previous by thread: 01/03: PRELIMINARY: Faster grafting.
Next by thread: branch master updated (36c2c8c -> e0b90db)
Index(es):
- Date
- Thread