qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v2 1/2] block: allow live commit of active image


From: Stefan Hajnoczi
Subject: Re: [Qemu-devel] [PATCH v2 1/2] block: allow live commit of active image
Date: Tue, 30 Jul 2013 14:53:00 +0200
User-agent: Mutt/1.5.21 (2010-09-15)

On Tue, Jul 30, 2013 at 03:17:47PM +0800, Fam Zheng wrote:
>      for (sector_num = 0; sector_num < end; sector_num += n) {
> -        uint64_t delay_ns = 0;
> -        bool copy;
>  
> -wait:
> -        /* Note that even when no rate limit is applied we need to yield
> -         * with no pending I/O here so that bdrv_drain_all() returns.
> -         */
> -        block_job_sleep_ns(&s->common, rt_clock, delay_ns);
> -        if (block_job_is_cancelled(&s->common)) {
> -            break;
> -        }
>          /* Copy if allocated above the base */
>          ret = bdrv_co_is_allocated_above(top, base, sector_num,
> -                                         COMMIT_BUFFER_SIZE / 
> BDRV_SECTOR_SIZE,
> +                                         COMMIT_BUFFER_SECTORS,
>                                           &n);
> -        copy = (ret == 1);
> -        trace_commit_one_iteration(s, sector_num, n, ret);
> -        if (copy) {
> -            if (s->common.speed) {
> -                delay_ns = ratelimit_calculate_delay(&s->limit, n);
> -                if (delay_ns > 0) {
> -                    goto wait;
> -                }
> +        if (ret) {
> +            bdrv_set_dirty(top, sector_num, n);
> +        }

This could take a while on a big image.  You need sleep/cancel here like
the other blockjob loops have.

I think error handling isn't sufficient here.  If
bdrv_co_is_allocated_above() fails you need to exit (if n becomes 0 on
error, then this is an infinite loop!).

> +        bdrv_dirty_iter_init(s->top, &hbi);
> +        for (next_dirty = hbitmap_iter_next(&hbi);
> +                next_dirty >= 0;
> +                next_dirty = hbitmap_iter_next(&hbi)) {
> +            sector_num = next_dirty;
> +            if (block_job_is_cancelled(&s->common)) {
> +                goto exit;
> +            }
> +            delay_ns = ratelimit_calculate_delay(&s->limit,
> +                                                 COMMIT_BUFFER_SECTORS);
> +            /* Note that even when no rate limit is applied we need to yield
> +             * with no pending I/O here so that bdrv_drain_all() returns.
> +             */
> +            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
> +            trace_commit_one_iteration(s, sector_num,
> +                                       COMMIT_BUFFER_SECTORS, ret);
> +            ret = commit_populate(top, base, sector_num,
> +                                  COMMIT_BUFFER_SECTORS, buf);

Can we be sure that a guest write during commit_populate()...

> +            if (ret < 0) {
> +                if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
> +                    s->on_error == BLOCKDEV_ON_ERROR_REPORT ||
> +                    (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC &&
> +                         ret == -ENOSPC)) {
> +                    goto exit;
> +                } else {
> +                    continue;
> +                }
>              }
> +            /* Publish progress */
> +            s->common.offset += COMMIT_BUFFER_BYTES;
> +            bdrv_reset_dirty(top, sector_num, COMMIT_BUFFER_SECTORS);

...sets the dirty but *after* us?  Otherwise there is a race condition
where guest writes fail to be copied into the base image.

I think the answer is "no" since commit_populate() performs two separate
blocking operations: bdrv_read(top) followed by bdrv_write(base).  Now
imagine the guest does bdrv_aio_writev(top) after we complete
bdrv_read(top) but before we do bdrv_reset_dirty().

>          }
> -        /* Publish progress */
> -        s->common.offset += n * BDRV_SECTOR_SIZE;
>      }
> +    s->common.offset = end;
>  
> -    ret = 0;
> -
> -    if (!block_job_is_cancelled(&s->common) && sector_num == end) {
> -        /* success */
> -        ret = bdrv_drop_intermediate(active, top, base);
> +    bdrv_flush(base);

bdrv_co_flush() is clearer since we're in a coroutine.

> +    if (!block_job_is_cancelled(&s->common)) {
> +        /* Drop intermediate: [top, base) */
> +        ret = bdrv_drop_intermediate(s->overlay, &top, &base);
> +        s->common.offset = s->common.len;
>      }
>  
> -exit_free_buf:
> -    qemu_vfree(buf);
> +    ret = 0;
> +
> +exit:
> +    bdrv_set_dirty_tracking(active, 0);
>  
> -exit_restore_reopen:
>      /* restore base open flags here if appropriate (e.g., change the base 
> back
>       * to r/o). These reopens do not need to be atomic, since we won't abort
>       * even on failure here */
> -    if (s->base_flags != bdrv_get_flags(base)) {
> +    if (s->overlay && s->base_flags != bdrv_get_flags(base)) {

Why check s->overlay, this only concerns base?

> @@ -212,23 +246,20 @@ void commit_start(BlockDriverState *bs, 
> BlockDriverState *base,
>  
>      overlay_bs = bdrv_find_overlay(bs, top);
>  
> -    if (overlay_bs == NULL) {
> -        error_setg(errp, "Could not find overlay image for %s:", 
> top->filename);
> -        return;
> -    }
> -
>      orig_base_flags    = bdrv_get_flags(base);
> -    orig_overlay_flags = bdrv_get_flags(overlay_bs);
> +    if (overlay_bs) {
> +        orig_overlay_flags = bdrv_get_flags(overlay_bs);
> +        if (!(orig_overlay_flags & BDRV_O_RDWR)) {
> +            reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
> +                    orig_overlay_flags | BDRV_O_RDWR);
> +        }
> +    }
>  
>      /* convert base & overlay_bs to r/w, if necessary */
>      if (!(orig_base_flags & BDRV_O_RDWR)) {
>          reopen_queue = bdrv_reopen_queue(reopen_queue, base,
>                                           orig_base_flags | BDRV_O_RDWR);
>      }
> -    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
> -        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
> -                                         orig_overlay_flags | BDRV_O_RDWR);
> -    }

IMO it is clearer to put the two orig_base_flags stanzas together rather
than interleaving them:

/* convert base & overlay_bs to r/w, if necessary */
orig_base_flags = bdrv_get_flags(base);
if (!(orig_base_flags & BDRV_O_RDWR)) {
    reopen_queue = bdrv_reopen_queue(reopen_queue, base,
                                     orig_base_flags |
                                     BDRV_O_RDWR);
}

overlay_bs = bdrv_find_overlay(bs, top);
if (overlay_bs) {
    orig_overlay_flags = bdrv_get_flags(overlay_bs);
    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
                orig_overlay_flags | BDRV_O_RDWR); 
    }
}



reply via email to

[Prev in Thread] Current Thread [Next in Thread]