[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
02/03: UNTESTED: faster graft code cleanup.
From: |
Mark H. Weaver |
Subject: |
02/03: UNTESTED: faster graft code cleanup. |
Date: |
Mon, 28 Mar 2016 03:43:55 +0000 |
mhw pushed a commit to branch wip-graft-improvements
in repository guix.
commit e02dd7ef96a818bae63feb8d292103965f407a1b
Author: Mark H Weaver <address@hidden>
Date: Mon Mar 14 16:18:06 2016 -0400
UNTESTED: faster graft code cleanup.
---
guix/build/graft.scm | 132 +++++++++++++++++++++++++++++++++++++-------------
1 files changed, 98 insertions(+), 34 deletions(-)
diff --git a/guix/build/graft.scm b/guix/build/graft.scm
index ec6f838..a5e88d6 100644
--- a/guix/build/graft.scm
+++ b/guix/build/graft.scm
@@ -21,6 +21,7 @@
#:use-module (guix build utils)
#:use-module (rnrs bytevectors)
#:use-module (rnrs io ports)
+ #:use-module (ice-9 vlist)
#:use-module (ice-9 match)
#:use-module (ice-9 threads)
#:use-module (ice-9 binary-ports)
@@ -42,7 +43,10 @@
;;;
;;; Code:
-(define hash-length 32)
+(define-syntax-rule (define-inline name val)
+ (define-syntax name (identifier-syntax val)))
+
+(define-inline hash-length 32)
(define nix-base32-char?
(cute char-set-contains?
@@ -50,12 +54,19 @@
(string->char-set "0123456789abcdfghijklmnpqrsvwxyz")
<>))
-(define* (replace-store-references input output lookup-replacement
+(define* (replace-store-references input output replacement-table
#:optional (store (%store-directory)))
"Read data from INPUT, replacing store references according to
-LOOKUP-REPLACEMENT, and writing the result to OUTPUT."
+REPLACEMENT-TABLE, and writing the result to OUTPUT. REPLACEMENT-TABLE is a
+vhash that maps strings (original hashes) to bytevectors (replacement hashes).
+Note: We use string keys to work around the fact that guile-2.0 hashes all
+bytevectors to the same value."
- (define request-size (expt 2 20)) ; 1 MB
+ (define (lookup-replacement s)
+ (match (vhash-assoc s replacement-table)
+ ((origin . replacement)
+ replacement)
+ (#f #f)))
(define (optimize-u8-predicate pred)
(cute vector-ref
@@ -69,17 +80,44 @@ LOOKUP-REPLACEMENT, and writing the result to OUTPUT."
(define (dash? byte) (= byte 45))
+ (define request-size (expt 2 20)) ; 1 MiB
+
+ ;; We scan the file for the following 33-byte pattern: 32 bytes of
+ ;; nix-base32 characters followed by a dash. To accommodate large files,
+ ;; we do not read the entire file, but instead work on buffers of up to
+ ;; 'request-size' bytes. To ensure that every 33-byte sequence appears
+ ;; entirely within exactly one buffer, adjacent buffers must overlap,
+ ;; i.e. they must share 32 byte positions. We accomplish this by
+ ;; "ungetting" the last 32 bytes of each buffer before reading the next
+ ;; buffer, unless we know that we've reached the end-of-file.
(let ((buffer (make-bytevector request-size)))
(let loop ()
- ;; Note: work around <http://bugs.gnu.org/17466>.
+ ;; Note: We avoid 'get-bytevector-n' to work around
+ ;; <http://bugs.gnu.org/17466>.
(match (get-bytevector-n! input buffer 0 request-size)
((? eof-object?) 'done)
(end
- ;; Scan the buffer for dashes preceded by a valid nix hash.
+ ;; We scan the buffer for dashes that might be preceded by a
+ ;; nix-base32 hash. The key optimization here is that whenever we
+ ;; find a NON-nix-base32 character at position 'i', we know that it
+ ;; cannot be part of a hash, so the earliest position where the next
+ ;; hash could start is i+1 with the following dash at position i+33.
+ ;;
+ ;; Since nix-base32 characters comprise only 1/8 of the 256 possible
+ ;; byte values, and exclude some of the most common letters in
+ ;; English text (e t o u), in practice we can advance by 33 positions
+ ;; most of the time.
(let scan-from ((i hash-length) (written 0))
+ ;; 'i' is the first position where we look for a dash. 'written'
+ ;; is the number of bytes in the buffer that have already been
+ ;; written.
(if (< i end)
(let ((byte (bytevector-u8-ref buffer i)))
(cond ((and (dash? byte)
+ ;; We've found a dash. Note that we do not know
+ ;; whether the preceeding 32 bytes are nix-base32
+ ;; characters, but we do not need to know. If
+ ;; they are not, the following lookup will fail.
(lookup-replacement
(string-tabulate (lambda (j)
(integer->char
@@ -87,14 +125,43 @@ LOOKUP-REPLACEMENT, and writing the result to OUTPUT."
(+ j (- i hash-length)))))
hash-length)))
=> (lambda (replacement)
+ ;; We've found a hash that needs to be replaced.
+ ;; First, write out all bytes preceding the hash
+ ;; that have not yet been written.
(put-bytevector output buffer written
(- i hash-length written))
+ ;; Now write the replacement hash.
(put-bytevector output replacement)
+ ;; Since the byte at position 'i' is a dash,
+ ;; which is not a nix-base32 char, the earliest
+ ;; position where the next hash might start is
+ ;; i+1, and the earliest position where the
+ ;; following dash might start is (+ i 1
+ ;; hash-length). Also, we have now written up to
+ ;; position 'i' in the buffer.
(scan-from (+ i 1 hash-length) i)))
+ ;; If the byte at position 'i' is a nix-base32 char,
+ ;; then the dash we're looking for might be as early as
+ ;; the following byte, so we can only advance by 1.
((nix-base32-byte? byte)
(scan-from (+ i 1) written))
+ ;; If the byte at position 'i' is NOT a nix-base32
+ ;; char, then the earliest position where the next hash
+ ;; might start is i+1, with the following dash at
+ ;; position (+ i 1 hash-length).
(else
(scan-from (+ i 1 hash-length) written))))
+
+ ;; We have finished scanning the buffer. Now we determine how
+ ;; many bytes have not yet been written, and how many bytes to
+ ;; "unget". If 'end' is less than 'request-size' then we read
+ ;; less than we asked for, which indicates that we are at EOF,
+ ;; so we needn't unget anything. Otherwise, we unget up to
+ ;; 'hash-length' bytes (32 bytes). However, we must be careful
+ ;; not to unget bytes that have already been written, because
+ ;; that would cause them to be written again from the next
+ ;; buffer. In practice, this case occurs when a replacement is
+ ;; made near the end of the buffer.
(let* ((unwritten (- end written))
(unget-size (if (= end request-size)
(min hash-length unwritten)
@@ -110,36 +177,33 @@ LOOKUP-REPLACEMENT, and writing the result to OUTPUT."
"Copy DIRECTORY to OUTPUT, replacing strings according to MAPPING, a list of
file name pairs."
- (define lookup-replacement
+ (define hash-mapping
(let* ((prefix (string-append store "/"))
(start (string-length prefix))
- (end (+ start hash-length))
- (table (make-hash-table)))
- (define (valid-prefix? p) (string=? p prefix))
- (define (valid-suffix? s) (string-prefix? "-" s))
+ (end (+ start hash-length)))
(define (valid-hash? h)
- (and (= hash-length (string-length h))
- (every nix-base32-char?
- (string->list h))))
- (define (components s)
+ (every nix-base32-char? (string->list h)))
+ (define (valid-suffix? s)
+ (string-prefix? "-" s))
+ (define (hash+suffix s)
(and (< end (string-length s))
- (list (substring s 0 start)
- (substring s start end)
- (substring s end))))
- (for-each (match-lambda
- (((= components ((? valid-prefix?)
- (? valid-hash? origin-hash)
- (? valid-suffix? suffix)))
- .
- (= components ((? valid-prefix?)
- (? valid-hash? replacement-hash)
- (? valid-suffix? suffix))))
- (hash-set! table origin-hash
- (string->utf8 replacement-hash)))
- ((origin . replacement)
- (error "invalid replacement" origin replacement)))
- mapping)
- (cut hash-ref table <>)))
+ (let ((hash (substring s start end))
+ (suffix (substring s end)))
+ (and (string-prefix? prefix s)
+ (valid-hash? hash)
+ (valid-suffix? suffix)
+ (list hash suffix)))))
+ (map (match-lambda
+ (((= hash+suffix (origin-hash suffix))
+ .
+ (= hash+suffix (replacement-hash suffix)))
+ (cons origin-hash (string->utf8 replacement-hash)))
+ ((origin . replacement)
+ (error "invalid replacement" origin replacement)))
+ mapping)))
+
+ (define replacement-table
+ (alist->vhash hash-mapping))
(define prefix-len
(string-length directory))
@@ -157,7 +221,7 @@ file name pairs."
(symlink (call-with-output-string
(lambda (output)
(replace-store-references (open-input-string target)
- output lookup-replacement
+ output replacement-table
store)))
dest)))
((regular)
@@ -165,7 +229,7 @@ file name pairs."
(lambda (input)
(call-with-output-file dest
(lambda (output)
- (replace-store-references input output lookup-replacement
+ (replace-store-references input output replacement-table
store)
(chmod output (stat:perms stat)))))))
(else