qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH] block: Add -drive detect_zero=on|off option to dete


From: Richard W.M. Jones
Subject: [Qemu-devel] [PATCH] block: Add -drive detect_zero=on|off option to detect all zero writes.
Date: Thu, 26 Jul 2012 16:59:49 +0100

From: "Richard W.M. Jones" <address@hidden>

This change adds a new block device option, "detect_zero=on|off".

If "detect_zero=on" then when a guest writes sectors that contain all
zero bytes, we call the internal "bdrv_co_write_zeroes" function
instead of the standard "bdrv_co_writev".  Some drivers, notably qed
and qcow2 version 3, will handle this more space-efficiently by just
recording a zero flag instead of allocating new clusters.

The flip side is that we have to scan each block being written by the
guest to see if it contains all zero bytes.  Therefore this option
defaults to "off".

In the special case where we expect the guest to be mostly writing
zeroes (virt-sparsify) this makes a huge difference in the size of the
intermediate qcow2 file which is created.

Note that if you want to test this feature, you must use a qcow2
version 3 file.  To create that, do:

  qemu-img create -f qcow2 -o compat=1.1 <name> <size>

Ordinary qcow2 (v2) and raw do NOT know how to treat zero blocks
specially, so you won't notice any difference.

Signed-off-by: Richard W.M. Jones <address@hidden>
---
 block.c         |   33 ++++++++++++++++++++++++++++++++-
 block.h         |    1 +
 block_int.h     |    1 +
 blockdev.c      |   10 ++++++++++
 hmp-commands.hx |    3 ++-
 qemu-config.c   |    4 ++++
 qemu-options.hx |    7 ++++++-
 7 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index 7547051..49e8186 100644
--- a/block.c
+++ b/block.c
@@ -639,6 +639,8 @@ static int bdrv_open_common(BlockDriverState *bs, const 
char *filename,
         bdrv_enable_copy_on_read(bs);
     }
 
+    bs->detect_zero = !!(flags & BDRV_O_DETECT_ZERO);
+
     pstrcpy(bs->filename, sizeof(bs->filename), filename);
 
     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
@@ -654,7 +656,7 @@ static int bdrv_open_common(BlockDriverState *bs, const 
char *filename,
      * Clear flags that are internal to the block layer before opening the
      * image.
      */
-    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
+    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | 
BDRV_O_DETECT_ZERO);
 
     /*
      * Snapshots should be writable.
@@ -1940,6 +1942,33 @@ static int coroutine_fn 
bdrv_co_do_write_zeroes(BlockDriverState *bs,
 }
 
 /*
+ * Detect zeroes: This is very conservative and could be a lot more
+ * clever/complex.
+ *
+ * All it does now is detect if the whole iovec contains nothing but
+ * zero bytes, and if so call bdrv_co_do_write_zeroes, otherwise call
+ * drv->bdrv_co_writev.
+ *
+ * It would be possible to split requests up and do this
+ * iovec-element-by-element or even sector-by-sector.
+ */
+static int detect_zeroes(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+    BlockDriver *drv = bs->drv;
+    int i;
+
+    for (i = 0; i < qiov->niov; i++) {
+        if (!buffer_is_zero (qiov->iov[i].iov_base, qiov->iov[i].iov_len))
+            goto not_zero;
+    }
+    return bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
+
+ not_zero:
+    return drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+}
+
+/*
  * Handle a write request in coroutine context
  */
 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
@@ -1973,6 +2002,8 @@ static int coroutine_fn 
bdrv_co_do_writev(BlockDriverState *bs,
 
     if (flags & BDRV_REQ_ZERO_WRITE) {
         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
+    } else if (bs->detect_zero) {
+        ret = detect_zeroes (bs, sector_num, nb_sectors, qiov);
     } else {
         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
     }
diff --git a/block.h b/block.h
index 7408acc..fcc6de6 100644
--- a/block.h
+++ b/block.h
@@ -79,6 +79,7 @@ typedef struct BlockDevOps {
 #define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
 #define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
 #define BDRV_O_INCOMING    0x0800  /* consistency hint for incoming migration 
*/
+#define BDRV_O_DETECT_ZERO 0x1000 /* detect zero writes */
 
 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH)
 
diff --git a/block_int.h b/block_int.h
index 3d4abc6..1008103 100644
--- a/block_int.h
+++ b/block_int.h
@@ -271,6 +271,7 @@ struct BlockDriverState {
     int sg;        /* if true, the device is a /dev/sg* */
     int copy_on_read; /* if true, copy read backing sectors into image
                          note this is a reference count */
+    int detect_zero; /* if true, detect all zero byte writes */
 
     BlockDriver *drv; /* NULL means no media */
     void *opaque;
diff --git a/blockdev.c b/blockdev.c
index 67895b2..580c078 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -296,6 +296,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
     BlockIOLimit io_limits;
     int snapshot = 0;
     bool copy_on_read;
+    bool detect_zero;
     int ret;
 
     translation = BIOS_ATA_TRANSLATION_AUTO;
@@ -313,6 +314,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
     snapshot = qemu_opt_get_bool(opts, "snapshot", 0);
     ro = qemu_opt_get_bool(opts, "readonly", 0);
     copy_on_read = qemu_opt_get_bool(opts, "copy-on-read", false);
+    detect_zero = qemu_opt_get_bool(opts, "detect-zero", false);
 
     file = qemu_opt_get(opts, "file");
     serial = qemu_opt_get(opts, "serial");
@@ -595,6 +597,10 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
         bdrv_flags |= BDRV_O_COPY_ON_READ;
     }
 
+    if (detect_zero) {
+        bdrv_flags |= BDRV_O_DETECT_ZERO;
+    }
+
     if (runstate_check(RUN_STATE_INMIGRATE)) {
         bdrv_flags |= BDRV_O_INCOMING;
     }
@@ -612,6 +618,10 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
 
     bdrv_flags |= ro ? 0 : BDRV_O_RDWR;
 
+    if (ro && detect_zero) {
+        error_report("warning: ignoring detect-zero on readonly drive");
+    }
+
     ret = bdrv_open(dinfo->bdrv, file, bdrv_flags, drv);
     if (ret < 0) {
         error_report("could not open disk image %s: %s",
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 18cb415..7dcdcfe 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -908,7 +908,8 @@ ETEXI
                       "[,unit=m][,media=d][,index=i]\n"
                       "[,cyls=c,heads=h,secs=s[,trans=t]]\n"
                       "[,snapshot=on|off][,cache=on|off]\n"
-                      "[,readonly=on|off][,copy-on-read=on|off]",
+                      "[,readonly=on|off][,copy-on-read=on|off]\n"
+                      "[,detect-zero=on|off]\n",
         .help       = "add drive to PCI storage controller",
         .mhandler.cmd = drive_hot_add,
     },
diff --git a/qemu-config.c b/qemu-config.c
index be84a03..fb19d15 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -113,6 +113,10 @@ static QemuOptsList qemu_drive_opts = {
             .name = "copy-on-read",
             .type = QEMU_OPT_BOOL,
             .help = "copy read data from backing file into image file",
+        },{
+            .name = "detect-zero",
+            .type = QEMU_OPT_BOOL,
+            .help = "detect zero writes and handle space-efficiently",
         },
         { /* end of list */ }
     },
diff --git a/qemu-options.hx b/qemu-options.hx
index 8b66264..5d5da6e 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -141,7 +141,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
     "       [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
     "       
[,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n"
     "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
-    "       [,readonly=on|off][,copy-on-read=on|off]\n"
+    "       [,readonly=on|off][,copy-on-read=on|off][,detect-zero=on|off]\n"
     "       
[[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n"
     "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
 STEXI
@@ -196,6 +196,11 @@ Open drive @option{file} as read-only. Guest write 
attempts will fail.
 @item address@hidden
 @var{copy-on-read} is "on" or "off" and enables whether to copy read backing
 file sectors into the image file.
address@hidden address@hidden
address@hidden is "on" or "off". If "on", guest writes containing
+all zero bytes are detected and may be handled more space-efficiently by
+the underlying block device. Under some workloads this may result in
+worse performance, so it defaults to "off".
 @end table
 
 By default, writethrough caching is used for all block device.  This means that
-- 
1.7.10.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]