[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v2 1/2] block: allow live commit of active image
From: |
Stefan Hajnoczi |
Subject: |
Re: [Qemu-devel] [PATCH v2 1/2] block: allow live commit of active image |
Date: |
Tue, 30 Jul 2013 14:53:00 +0200 |
User-agent: |
Mutt/1.5.21 (2010-09-15) |
On Tue, Jul 30, 2013 at 03:17:47PM +0800, Fam Zheng wrote:
> for (sector_num = 0; sector_num < end; sector_num += n) {
> - uint64_t delay_ns = 0;
> - bool copy;
>
> -wait:
> - /* Note that even when no rate limit is applied we need to yield
> - * with no pending I/O here so that bdrv_drain_all() returns.
> - */
> - block_job_sleep_ns(&s->common, rt_clock, delay_ns);
> - if (block_job_is_cancelled(&s->common)) {
> - break;
> - }
> /* Copy if allocated above the base */
> ret = bdrv_co_is_allocated_above(top, base, sector_num,
> - COMMIT_BUFFER_SIZE /
> BDRV_SECTOR_SIZE,
> + COMMIT_BUFFER_SECTORS,
> &n);
> - copy = (ret == 1);
> - trace_commit_one_iteration(s, sector_num, n, ret);
> - if (copy) {
> - if (s->common.speed) {
> - delay_ns = ratelimit_calculate_delay(&s->limit, n);
> - if (delay_ns > 0) {
> - goto wait;
> - }
> + if (ret) {
> + bdrv_set_dirty(top, sector_num, n);
> + }
This could take a while on a big image. You need sleep/cancel here like
the other blockjob loops have.
I think error handling isn't sufficient here. If
bdrv_co_is_allocated_above() fails you need to exit (if n becomes 0 on
error, then this is an infinite loop!).
> + bdrv_dirty_iter_init(s->top, &hbi);
> + for (next_dirty = hbitmap_iter_next(&hbi);
> + next_dirty >= 0;
> + next_dirty = hbitmap_iter_next(&hbi)) {
> + sector_num = next_dirty;
> + if (block_job_is_cancelled(&s->common)) {
> + goto exit;
> + }
> + delay_ns = ratelimit_calculate_delay(&s->limit,
> + COMMIT_BUFFER_SECTORS);
> + /* Note that even when no rate limit is applied we need to yield
> + * with no pending I/O here so that bdrv_drain_all() returns.
> + */
> + block_job_sleep_ns(&s->common, rt_clock, delay_ns);
> + trace_commit_one_iteration(s, sector_num,
> + COMMIT_BUFFER_SECTORS, ret);
> + ret = commit_populate(top, base, sector_num,
> + COMMIT_BUFFER_SECTORS, buf);
Can we be sure that a guest write during commit_populate()...
> + if (ret < 0) {
> + if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
> + s->on_error == BLOCKDEV_ON_ERROR_REPORT ||
> + (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC &&
> + ret == -ENOSPC)) {
> + goto exit;
> + } else {
> + continue;
> + }
> }
> + /* Publish progress */
> + s->common.offset += COMMIT_BUFFER_BYTES;
> + bdrv_reset_dirty(top, sector_num, COMMIT_BUFFER_SECTORS);
...sets the dirty but *after* us? Otherwise there is a race condition
where guest writes fail to be copied into the base image.
I think the answer is "no" since commit_populate() performs two separate
blocking operations: bdrv_read(top) followed by bdrv_write(base). Now
imagine the guest does bdrv_aio_writev(top) after we complete
bdrv_read(top) but before we do bdrv_reset_dirty().
> }
> - /* Publish progress */
> - s->common.offset += n * BDRV_SECTOR_SIZE;
> }
> + s->common.offset = end;
>
> - ret = 0;
> -
> - if (!block_job_is_cancelled(&s->common) && sector_num == end) {
> - /* success */
> - ret = bdrv_drop_intermediate(active, top, base);
> + bdrv_flush(base);
bdrv_co_flush() is clearer since we're in a coroutine.
> + if (!block_job_is_cancelled(&s->common)) {
> + /* Drop intermediate: [top, base) */
> + ret = bdrv_drop_intermediate(s->overlay, &top, &base);
> + s->common.offset = s->common.len;
> }
>
> -exit_free_buf:
> - qemu_vfree(buf);
> + ret = 0;
> +
> +exit:
> + bdrv_set_dirty_tracking(active, 0);
>
> -exit_restore_reopen:
> /* restore base open flags here if appropriate (e.g., change the base
> back
> * to r/o). These reopens do not need to be atomic, since we won't abort
> * even on failure here */
> - if (s->base_flags != bdrv_get_flags(base)) {
> + if (s->overlay && s->base_flags != bdrv_get_flags(base)) {
Why check s->overlay, this only concerns base?
> @@ -212,23 +246,20 @@ void commit_start(BlockDriverState *bs,
> BlockDriverState *base,
>
> overlay_bs = bdrv_find_overlay(bs, top);
>
> - if (overlay_bs == NULL) {
> - error_setg(errp, "Could not find overlay image for %s:",
> top->filename);
> - return;
> - }
> -
> orig_base_flags = bdrv_get_flags(base);
> - orig_overlay_flags = bdrv_get_flags(overlay_bs);
> + if (overlay_bs) {
> + orig_overlay_flags = bdrv_get_flags(overlay_bs);
> + if (!(orig_overlay_flags & BDRV_O_RDWR)) {
> + reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
> + orig_overlay_flags | BDRV_O_RDWR);
> + }
> + }
>
> /* convert base & overlay_bs to r/w, if necessary */
> if (!(orig_base_flags & BDRV_O_RDWR)) {
> reopen_queue = bdrv_reopen_queue(reopen_queue, base,
> orig_base_flags | BDRV_O_RDWR);
> }
> - if (!(orig_overlay_flags & BDRV_O_RDWR)) {
> - reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
> - orig_overlay_flags | BDRV_O_RDWR);
> - }
IMO it is clearer to put the two orig_base_flags stanzas together rather
than interleaving them:
/* convert base & overlay_bs to r/w, if necessary */
orig_base_flags = bdrv_get_flags(base);
if (!(orig_base_flags & BDRV_O_RDWR)) {
reopen_queue = bdrv_reopen_queue(reopen_queue, base,
orig_base_flags |
BDRV_O_RDWR);
}
overlay_bs = bdrv_find_overlay(bs, top);
if (overlay_bs) {
orig_overlay_flags = bdrv_get_flags(overlay_bs);
if (!(orig_overlay_flags & BDRV_O_RDWR)) {
reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
orig_overlay_flags | BDRV_O_RDWR);
}
}