[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH 4/8] block: add mirror job
From: Jeff Cody
Subject: Re: [Qemu-devel] [PATCH 4/8] block: add mirror job
Date: Wed, 30 May 2012 16:09:59 -0400
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20120430 Thunderbird/12.0.1
On 04/13/2012 12:23 PM, Paolo Bonzini wrote:
> This patch adds the implementation of a new job that mirrors a disk to
> a new image while letting the guest continue using the old image.
> The target is treated as a "black box" and data is copied from the
> source to the target in the background.
>
> The mirror job is never-ending, but it is logically structured into
> two phases: 1) copy all data as fast as possible until the target
> first gets in sync with the source; 2) keep target in sync and
> ensure that reopening to the target gets a correct (full) copy
> of the source data.
>
> The second phase is indicated by the progress in "info block-jobs"
> reporting the current offset to be equal to the length of the file.
> When the job is cancelled in the second phase, QEMU will run the
> job until the source is clean and quiescent, then it will report
> successful completion of the job. (Note that it could already happen
> that management lost the race against QEMU and got a completion
> event instead of cancellation).
>
> Signed-off-by: Paolo Bonzini <address@hidden>
> ---
> Makefile.objs | 2 +-
> block/mirror.c | 236
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> block_int.h | 5 ++
> trace-events | 4 +
> 4 files changed, 246 insertions(+), 1 deletion(-)
> create mode 100644 block/mirror.c
>
> diff --git a/Makefile.objs b/Makefile.objs
> index 5c3bcda..1679461 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -53,7 +53,7 @@ block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o
> qcow2-snapshot.o qcow
> block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
> block-nested-y += qed-check.o
> block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> -block-nested-y += stream.o
> +block-nested-y += stream.o mirror.o
> block-nested-$(CONFIG_WIN32) += raw-win32.o
> block-nested-$(CONFIG_POSIX) += raw-posix.o
> block-nested-$(CONFIG_LIBISCSI) += iscsi.o
> diff --git a/block/mirror.c b/block/mirror.c
> new file mode 100644
> index 0000000..5a3395a
> --- /dev/null
> +++ b/block/mirror.c
> @@ -0,0 +1,236 @@
> +/*
> + * Image mirroring
> + *
> + * Copyright Red Hat, Inc. 2012
> + *
> + * Authors:
> + * Paolo Bonzini <address@hidden>
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +
> +#include "trace.h"
> +#include "block_int.h"
> +#include "qemu/ratelimit.h"
> +
> +enum {
> + /*
> + * Size of data buffer for populating the image file. This should be
> large
> + * enough to process multiple clusters in a single call, so that
> populating
> + * contiguous regions of the image is efficient.
> + */
> + BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */
> +};
> +
> +#define SLICE_TIME 100000000ULL /* ns */
> +
> +typedef struct MirrorBlockJob {
> + BlockJob common;
> + RateLimit limit;
> + BlockDriverState *target;
> + bool full;
> +} MirrorBlockJob;
> +
> +static int coroutine_fn mirror_populate(BlockDriverState *source,
> + BlockDriverState *target,
> + int64_t sector_num, int nb_sectors,
> + void *buf)
> +{
> + struct iovec iov = {
> + .iov_base = buf,
> + .iov_len = nb_sectors * 512,
> + };
> + QEMUIOVector qiov;
> + int ret;
> +
> + qemu_iovec_init_external(&qiov, &iov, 1);
> +
> + /* Copy-on-read the unallocated clusters */
> + ret = bdrv_co_readv(source, sector_num, nb_sectors, &qiov);
> + if (ret < 0) {
> + return ret;
> + }
> + return bdrv_co_writev(target, sector_num, nb_sectors, &qiov);
> +}
> +
> +static void coroutine_fn mirror_run(void *opaque)
> +{
> + MirrorBlockJob *s = opaque;
> + BlockDriverState *bs = s->common.bs;
> + BlockDriverState *base;
> + int64_t sector_num, end;
> + int ret = 0;
> + int n;
> + bool synced = false;
> + void *buf;
> +
> + if (block_job_is_cancelled(&s->common)) {
> + goto immediate_exit;
> + }
> +
> + s->common.len = bdrv_getlength(bs);
> + if (s->common.len < 0) {
> + block_job_complete(&s->common, s->common.len);
> + return;
> + }
> +
> + base = s->full ? NULL : bs->backing_hd;
> + end = s->common.len >> BDRV_SECTOR_BITS;
> + buf = qemu_blockalign(bs, BLOCK_SIZE);
> +
> + /* First part, loop on the sectors and initialize the dirty bitmap. */
> + for (sector_num = 0; sector_num < end; ) {
> + int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
> + ret = bdrv_co_is_allocated_above(bs, base,
> + sector_num, next - sector_num, &n);
> +
> + if (ret < 0) {
> + break;
> + } else if (ret == 1) {
> + bdrv_set_dirty(bs, sector_num, n);
> + sector_num = next;
> + } else {
> + sector_num += n;
> + }
> + }
> +
> + if (ret < 0) {
> + block_job_complete(&s->common, ret);
> + }
> +
> + sector_num = -1;
> + for (;;) {
> + int64_t cnt;
> + s->common.busy = true;
> + if (bdrv_get_dirty_count(bs) == 0) {
> + /* Switch out of the streaming phase. From now on, if the
> + * job is cancelled we will actually complete all pending
> + * I/O and report completion, so that drive-reopen can be
> + * used to pivot to the mirroring target.
> + */
> + synced = true;
> + sector_num = -1;
> + s->common.offset = end * BDRV_SECTOR_SIZE;
> + }
> +
> + if (synced && block_job_is_cancelled(&s->common)) {
> + /* The dirty bitmap is not updated while operations are pending.
> + * If we're about to exit, wait for pending operations or we may
> + * exit while the source has dirty data to copy!
> + */
> + while (bdrv_get_dirty_count(bs) == 0 &&
> + !QLIST_EMPTY(&bs->tracked_requests)) {
> + qemu_aio_wait();
> + }
> + }
> +
> + if (bdrv_get_dirty_count(bs) != 0) {
> + int nb_sectors;
> + sector_num = bdrv_get_next_dirty(bs, sector_num);
> + nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - sector_num);
> + trace_mirror_one_iteration(s, sector_num);
> + bdrv_reset_dirty(bs, sector_num, BDRV_SECTORS_PER_DIRTY_CHUNK);
> + ret = mirror_populate(bs, s->target, sector_num, nb_sectors,
> buf);
> + if (ret < 0) {
> + break;
> + }
> + }
> +
> + ret = 0;
> + cnt = bdrv_get_dirty_count(bs);
> + if (synced) {
> + if (!block_job_is_cancelled(&s->common)) {
> + s->common.busy = false;
> + co_sleep_ns(rt_clock, cnt == 0 ? SLICE_TIME : 0);
> + } else if (cnt == 0 && QLIST_EMPTY(&bs->tracked_requests)) {
> + /* The two disks are in sync. Exit and report
> + * successful completion.
> + */
> + s->common.cancelled = false;
> + break;
> + }
> +
> + /* We get here either to poll the target, or because the job
> + * was cancelled. In the latter case, we still have an
> + * opportunity to do I/O (without going to sleep) before
> + * exiting.
> + */
> + } else {
> + uint64_t delay_ns;
> +
> + /* Publish progress */
> + s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE;
> +
> + if (s->common.speed) {
> + delay_ns = ratelimit_calculate_delay(&s->limit,
> BDRV_SECTORS_PER_DIRTY_CHUNK);
> + } else {
> + delay_ns = 0;
> + }
> +
> + /* Note that even when no rate limit is applied we need to yield
> + * with no pending I/O here so that qemu_aio_flush() returns.
> + */
> + s->common.busy = false;
> + co_sleep_ns(rt_clock, delay_ns);
> + if (block_job_is_cancelled(&s->common)) {
> + break;
> + }
> + }
> + }
> +
> +immediate_exit:
> + bdrv_set_dirty_tracking(bs, false);
> + bdrv_close(s->target);
> + bdrv_delete(s->target);
> + block_job_complete(&s->common, ret);
> +}
> +
> +static int mirror_set_speed(BlockJob *job, int64_t value)
> +{
> + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
> +
> + if (value < 0) {
> + return -EINVAL;
> + }
> + ratelimit_set_speed(&s->limit, value / BDRV_SECTOR_SIZE, SLICE_TIME);
> + return 0;
> +}
> +
> +static BlockJobType mirror_job_type = {
> + .instance_size = sizeof(MirrorBlockJob),
> + .job_type = "mirror",
> + .set_speed = mirror_set_speed,
> +};
> +
> +int mirror_start(BlockDriverState *bs,
> + const char *target, BlockDriver *drv, int flags,
> + BlockDriverCompletionFunc *cb,
> + void *opaque, bool full)
> +{
> + MirrorBlockJob *s;
> + int ret;
> +
> + s = block_job_create(&mirror_job_type, bs, cb, opaque);
> + if (!s) {
> + return -EBUSY; /* bs must already be in use */
> + }
> +
> + s->target = bdrv_new("");
> + ret = bdrv_open(s->target, target,
> + flags | BDRV_O_NO_BACKING | BDRV_O_NO_FLUSH |
> BDRV_O_CACHE_WB,
> + drv);
> +
> + if (ret < 0) {
> + bdrv_delete(s->target);
> + return ret;
> + }
> +
> + s->full = full;
> + bdrv_set_dirty_tracking(bs, true);
> + s->common.co = qemu_coroutine_create(mirror_run);
> + trace_mirror_start(bs, s, s->common.co, opaque);
> + qemu_coroutine_enter(s->common.co, s);
> + return 0;
> +}
Something to note: mirror_start() will leave the BlockDriverState busy,
and the block job dangling, if bdrv_open() fails (for instance, when it
is unable to open an existing image; see
https://bugzilla.redhat.com/show_bug.cgi?id=814102).
Re-arranging mirror_start() to not create the block job if opening the
BDS fails should fix this, like so:
/*
 * Start a mirror job on @bs, copying its data to a newly-opened image
 * @target (opened with driver @drv and open flags @flags, forced to
 * writeback cache and no backing file / no flush).
 *
 * The target is opened BEFORE the block job is created so that an open
 * failure (e.g. the image does not exist) leaves @bs untouched: no
 * dangling block job and no busy BlockDriverState
 * (https://bugzilla.redhat.com/show_bug.cgi?id=814102).
 *
 * @speed:  initial rate limit for the job, in bytes/second.
 * @cb:     completion callback, invoked when the job finishes.
 * @opaque: opaque argument passed to @cb.
 * @full:   if true, copy all data; if false, copy only sectors
 *          allocated above the backing file.
 *
 * Returns 0 on success, -EBUSY if @bs already has a job, or the
 * negative errno from bdrv_open() on failure to open the target.
 */
int mirror_start(BlockDriverState *bs,
                 const char *target, BlockDriver *drv, int flags,
                 int64_t speed, BlockDriverCompletionFunc *cb,
                 void *opaque, bool full)
{
    MirrorBlockJob *s;
    BlockDriverState *target_bs;
    int ret;

    /* Open the target first: if this fails, nothing has been attached
     * to bs yet, so there is nothing to unwind on bs itself. */
    target_bs = bdrv_new("");
    ret = bdrv_open(target_bs, target,
                    flags | BDRV_O_NO_BACKING | BDRV_O_NO_FLUSH |
                    BDRV_O_CACHE_WB,
                    drv);
    if (ret < 0) {
        bdrv_delete(target_bs);
        return ret;
    }

    s = block_job_create(&mirror_job_type, bs, speed, cb, opaque);
    if (!s) {
        bdrv_delete(target_bs);   /* do not leak the freshly-opened target */
        return -EBUSY;            /* bs must already be in use */
    }

    s->target = target_bs;
    s->full = full;
    bdrv_set_dirty_tracking(bs, true);
    s->common.co = qemu_coroutine_create(mirror_run);
    trace_mirror_start(bs, s, s->common.co, opaque);
    /* Missing from the original proposal: the coroutine must actually be
     * entered once, or the job never starts running. */
    qemu_coroutine_enter(s->common.co, s);
    return 0;
}
> diff --git a/block_int.h b/block_int.h
> index eae24d2..683d59d 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -432,4 +432,9 @@ int stream_start(BlockDriverState *bs, BlockDriverState
> *base,
> const char *base_id, BlockDriverCompletionFunc *cb,
> void *opaque);
>
> +int mirror_start(BlockDriverState *bs,
> + const char *target, BlockDriver *drv, int flags,
> + BlockDriverCompletionFunc *cb,
> + void *opaque, bool full);
> +
> #endif /* BLOCK_INT_H */
> diff --git a/trace-events b/trace-events
> index a5f276d..23aad83 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -71,6 +71,10 @@ bdrv_co_write_zeroes(void *bs, int64_t sector_num, int
> nb_sector) "bs %p sector_
> bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write,
> void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"
> bdrv_co_do_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors,
> int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num
> %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
>
> +# block/mirror.c
> +mirror_one_iteration(void *s, int64_t sector_num) "s %p sector_num %"PRId64""
> +mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p
> opaque %p"
> +
> # block/stream.c
> stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int
> is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
> stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p
> base %p s %p co %p opaque %p"
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- Re: [Qemu-devel] [PATCH 4/8] block: add mirror job,
Jeff Cody <=