qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH] block: Add -drive detect_zero=on|off option to


From: Paolo Bonzini
Subject: Re: [Qemu-devel] [PATCH] block: Add -drive detect_zero=on|off option to detect all zero writes.
Date: Fri, 27 Jul 2012 21:01:38 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20120615 Thunderbird/13.0.1

Il 27/07/2012 15:58, Anthony Liguori ha scritto:
>> Note that if you want to test this feature, you must use a qcow2
>> version 3 file.  To create that, do:
>>
>>   qemu-img create -f qcow2 -o compat=1.1 <name> <size>
>>
>> Ordinary qcow2 (v2) and raw do NOT know how to treat zero blocks
>> specially, so you won't notice any difference.
> 
> It would be a lot more useful if this was implemented as a dynamic
> option.  Offline sparsification is one use-case, but online
> sparsification is another.
> 
> This can be orchestrated through the qemu-ga binary.  First, you enable
> zero detection, then you ask qemu-ga to create a file filled with zeros
> of 90% of the available free space unlinking the file as soon as you
> open it.  You then close the fd after you've written out the zeros.
> Then you disable zero detection and keep going.

Or just finish up discard support and use the existing fstrim command of
qemu-ga.  :)

Paolo

> 
> This would let virt-manager or oVirt implement a "compact disk" option
> against running VMs.
> 
> I don't think it's much of a change to your existing patch either.
> 
> Regards,
> 
> Anthony Liguori
> 
>>
>> Signed-off-by: Richard W.M. Jones <address@hidden>
>> ---
>>  block.c         |   33 ++++++++++++++++++++++++++++++++-
>>  block.h         |    1 +
>>  block_int.h     |    1 +
>>  blockdev.c      |   10 ++++++++++
>>  hmp-commands.hx |    3 ++-
>>  qemu-config.c   |    4 ++++
>>  qemu-options.hx |    7 ++++++-
>>  7 files changed, 56 insertions(+), 3 deletions(-)
>>
>> diff --git a/block.c b/block.c
>> index 7547051..49e8186 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -639,6 +639,8 @@ static int bdrv_open_common(BlockDriverState *bs, const 
>> char *filename,
>>          bdrv_enable_copy_on_read(bs);
>>      }
>>  
>> +    bs->detect_zero = !!(flags & BDRV_O_DETECT_ZERO);
>> +
>>      pstrcpy(bs->filename, sizeof(bs->filename), filename);
>>  
>>      if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
>> @@ -654,7 +656,7 @@ static int bdrv_open_common(BlockDriverState *bs, const 
>> char *filename,
>>       * Clear flags that are internal to the block layer before opening the
>>       * image.
>>       */
>> -    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
>> +    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | 
>> BDRV_O_DETECT_ZERO);
>>  
>>      /*
>>       * Snapshots should be writable.
>> @@ -1940,6 +1942,33 @@ static int coroutine_fn 
>> bdrv_co_do_write_zeroes(BlockDriverState *bs,
>>  }
>>  
>>  /*
>> + * Detect zeroes: This is very conservative and could be a lot more
>> + * clever/complex.
>> + *
>> + * All it does now is detect if the whole iovec contains nothing but
>> + * zero bytes, and if so call bdrv_co_do_write_zeroes, otherwise call
>> + * drv->bdrv_co_writev.
>> + *
>> + * It would be possible to split requests up and do this
>> + * iovec-element-by-element or even sector-by-sector.
>> + */
>> +static int detect_zeroes(BlockDriverState *bs,
>> +    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
>> +{
>> +    BlockDriver *drv = bs->drv;
>> +    int i;
>> +
>> +    for (i = 0; i < qiov->niov; i++) {
>> +        if (!buffer_is_zero (qiov->iov[i].iov_base, qiov->iov[i].iov_len))
>> +            goto not_zero;
>> +    }
>> +    return bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
>> +
>> + not_zero:
>> +    return drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>> +}
>> +
>> +/*
>>   * Handle a write request in coroutine context
>>   */
>>  static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>> @@ -1973,6 +2002,8 @@ static int coroutine_fn 
>> bdrv_co_do_writev(BlockDriverState *bs,
>>  
>>      if (flags & BDRV_REQ_ZERO_WRITE) {
>>          ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
>> +    } else if (bs->detect_zero) {
>> +        ret = detect_zeroes (bs, sector_num, nb_sectors, qiov);
>>      } else {
>>          ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>>      }
>> diff --git a/block.h b/block.h
>> index 7408acc..fcc6de6 100644
>> --- a/block.h
>> +++ b/block.h
>> @@ -79,6 +79,7 @@ typedef struct BlockDevOps {
>>  #define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
>>  #define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image 
>> */
>>  #define BDRV_O_INCOMING    0x0800  /* consistency hint for incoming 
>> migration */
>> +#define BDRV_O_DETECT_ZERO 0x1000 /* detect zero writes */
>>  
>>  #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | 
>> BDRV_O_NO_FLUSH)
>>  
>> diff --git a/block_int.h b/block_int.h
>> index 3d4abc6..1008103 100644
>> --- a/block_int.h
>> +++ b/block_int.h
>> @@ -271,6 +271,7 @@ struct BlockDriverState {
>>      int sg;        /* if true, the device is a /dev/sg* */
>>      int copy_on_read; /* if true, copy read backing sectors into image
>>                           note this is a reference count */
>> +    int detect_zero; /* if true, detect all zero byte writes */
>>  
>>      BlockDriver *drv; /* NULL means no media */
>>      void *opaque;
>> diff --git a/blockdev.c b/blockdev.c
>> index 67895b2..580c078 100644
>> --- a/blockdev.c
>> +++ b/blockdev.c
>> @@ -296,6 +296,7 @@ DriveInfo *drive_init(QemuOpts *opts, int 
>> default_to_scsi)
>>      BlockIOLimit io_limits;
>>      int snapshot = 0;
>>      bool copy_on_read;
>> +    bool detect_zero;
>>      int ret;
>>  
>>      translation = BIOS_ATA_TRANSLATION_AUTO;
>> @@ -313,6 +314,7 @@ DriveInfo *drive_init(QemuOpts *opts, int 
>> default_to_scsi)
>>      snapshot = qemu_opt_get_bool(opts, "snapshot", 0);
>>      ro = qemu_opt_get_bool(opts, "readonly", 0);
>>      copy_on_read = qemu_opt_get_bool(opts, "copy-on-read", false);
>> +    detect_zero = qemu_opt_get_bool(opts, "detect-zero", false);
>>  
>>      file = qemu_opt_get(opts, "file");
>>      serial = qemu_opt_get(opts, "serial");
>> @@ -595,6 +597,10 @@ DriveInfo *drive_init(QemuOpts *opts, int 
>> default_to_scsi)
>>          bdrv_flags |= BDRV_O_COPY_ON_READ;
>>      }
>>  
>> +    if (detect_zero) {
>> +        bdrv_flags |= BDRV_O_DETECT_ZERO;
>> +    }
>> +
>>      if (runstate_check(RUN_STATE_INMIGRATE)) {
>>          bdrv_flags |= BDRV_O_INCOMING;
>>      }
>> @@ -612,6 +618,10 @@ DriveInfo *drive_init(QemuOpts *opts, int 
>> default_to_scsi)
>>  
>>      bdrv_flags |= ro ? 0 : BDRV_O_RDWR;
>>  
>> +    if (ro && detect_zero) {
>> +        error_report("warning: ignoring detect-zero on readonly drive");
>> +    }
>> +
>>      ret = bdrv_open(dinfo->bdrv, file, bdrv_flags, drv);
>>      if (ret < 0) {
>>          error_report("could not open disk image %s: %s",
>> diff --git a/hmp-commands.hx b/hmp-commands.hx
>> index 18cb415..7dcdcfe 100644
>> --- a/hmp-commands.hx
>> +++ b/hmp-commands.hx
>> @@ -908,7 +908,8 @@ ETEXI
>>                        "[,unit=m][,media=d][,index=i]\n"
>>                        "[,cyls=c,heads=h,secs=s[,trans=t]]\n"
>>                        "[,snapshot=on|off][,cache=on|off]\n"
>> -                      "[,readonly=on|off][,copy-on-read=on|off]",
>> +                      "[,readonly=on|off][,copy-on-read=on|off]\n"
>> +                      "[,detect-zero=on|off]\n",
>>          .help       = "add drive to PCI storage controller",
>>          .mhandler.cmd = drive_hot_add,
>>      },
>> diff --git a/qemu-config.c b/qemu-config.c
>> index be84a03..fb19d15 100644
>> --- a/qemu-config.c
>> +++ b/qemu-config.c
>> @@ -113,6 +113,10 @@ static QemuOptsList qemu_drive_opts = {
>>              .name = "copy-on-read",
>>              .type = QEMU_OPT_BOOL,
>>              .help = "copy read data from backing file into image file",
>> +        },{
>> +            .name = "detect-zero",
>> +            .type = QEMU_OPT_BOOL,
>> +            .help = "detect zero writes and handle space-efficiently",
>>          },
>>          { /* end of list */ }
>>      },
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index 8b66264..5d5da6e 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -141,7 +141,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
>>      "       [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
>>      "       
>> [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n"
>>      "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
>> -    "       [,readonly=on|off][,copy-on-read=on|off]\n"
>> +    "       [,readonly=on|off][,copy-on-read=on|off][,detect-zero=on|off]\n"
>>      "       
>> [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n"
>>      "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
>>  STEXI
>> @@ -196,6 +196,11 @@ Open drive @option{file} as read-only. Guest write 
>> attempts will fail.
>>  @item address@hidden
>>  @var{copy-on-read} is "on" or "off" and enables whether to copy read backing
>>  file sectors into the image file.
>> address@hidden address@hidden
>> address@hidden is "on" or "off". If "on", guest writes containing
>> +all zero bytes are detected and may be handled more space-efficiently by
>> +the underlying block device. Under some workloads this may result in
>> +worse performance, so it defaults to "off".
>>  @end table
>>  
>>  By default, writethrough caching is used for all block device.  This means 
>> that
>> -- 
>> 1.7.10.4
> 
> 





reply via email to

[Prev in Thread] Current Thread [Next in Thread]