qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 1/2] block: allow live commit of active image


From: Fam Zheng
Subject: Re: [Qemu-devel] [PATCH 1/2] block: allow live commit of active image
Date: Mon, 22 Jul 2013 14:48:34 +0800
User-agent: Mutt/1.5.21 (2010-09-15)

On Mon, 07/22 08:34, Paolo Bonzini wrote:
> Il 22/07/2013 05:46, Fam Zheng ha scritto:
> > This patch eliminates the limitation on committing the active device.
> > 
> > bdrv_drop_intermediate is reimplemented to take pointers to
> > (BlockDriverState *), so it can modify the caller's local pointers to
> > preserve their semantics, while updating active BDS in-place by
> > bdrv_swap active and base: we need data in 'base' as it's the only
> > remaining after commit, but we can't delete 'active' as it's referenced
> > everywhere in the program.
> > 
> > Guest writes to active device during the commit are tracked by dirty map
> > and committed like block-mirror.
> 
> I have only skimmed the patch, but I think this is incomplete.
> Management needs to know the moment when 'active' is not valid anymore,
> thus this job needs to be completed manually with "block-job-complete".

Does management need access to the 'active' image outside of the QEMU
process? Although the original 'active' is "dropped" by
bdrv_drop_intermediate, pointers to the original 'active' are still valid
because 'base' is moved to that address (with bdrv_swap). I don't see
what the problem is here for management.

> 
> In fact, I wonder if block/commit.c could reuse most of the code from
> block/mirror.c (basically everything except that bdrv_swap should be
> replaced by bdrv_drop_intermediate).
> 

Hmm, yes, in this case, it is quite similar to mirroring 'active' to
'base' with sync mode top.

> > Signed-off-by: Fam Zheng <address@hidden>
> > ---
> >  block.c               | 102 ++++++++++----------------------
> >  block/commit.c        | 160 
> > ++++++++++++++++++++++++++------------------------
> >  include/block/block.h |   5 +-
> >  3 files changed, 115 insertions(+), 152 deletions(-)
> > 
> > diff --git a/block.c b/block.c
> > index b560241..367e064 100644
> > --- a/block.c
> > +++ b/block.c
> > @@ -2018,18 +2018,11 @@ BlockDriverState 
> > *bdrv_find_overlay(BlockDriverState *active,
> >      return overlay;
> >  }
> >  
> > -typedef struct BlkIntermediateStates {
> > -    BlockDriverState *bs;
> > -    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
> > -} BlkIntermediateStates;
> > -
> > -
> >  /*
> > - * Drops images above 'base' up to and including 'top', and sets the image
> > - * above 'top' to have base as its backing file.
> > - *
> > - * Requires that the overlay to 'top' is opened r/w, so that the backing 
> > file
> > - * information in 'bs' can be properly updated.
> > + * Drops images above '*base' up to and including '*top', and sets new 
> > '*base'
> > + * as backing_hd of top_overlay (the image originally has 'top' as backing
> > + * file). top_overlay may be NULL if '*top' is active, no such update 
> > needed.
> > + * Requires that the top_overlay to 'top' is opened r/w.
> >   *
> >   * E.g., this will convert the following chain:
> >   * bottom <- base <- intermediate <- top <- active
> > @@ -2046,82 +2039,47 @@ typedef struct BlkIntermediateStates {
> >   *
> >   * base <- active
> >   *
> > - * Error conditions:
> > - *  if active == top, that is considered an error
> > + * It also allows active==top, in which case it converts:
> > + *
> > + * base <- intermediate <- active (also top)
> > + *
> > + * to
> > + *
> > + * base == active == top, i.e. only base remains: *top == *base when 
> > return.
> >   *
> >   */
> > -int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
> > -                           BlockDriverState *base)
> > +int bdrv_drop_intermediate(BlockDriverState *top_overlay,
> > +                           BlockDriverState **top,
> > +                           BlockDriverState **base)
> >  {
> > -    BlockDriverState *intermediate;
> > +    BlockDriverState *pbs;
> >      BlockDriverState *base_bs = NULL;
> > -    BlockDriverState *new_top_bs = NULL;
> > -    BlkIntermediateStates *intermediate_state, *next;
> >      int ret = -EIO;
> >  
> > -    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) 
> > states_to_delete;
> > -    QSIMPLEQ_INIT(&states_to_delete);
> > -
> > -    if (!top->drv || !base->drv) {
> > +    if (!(*top)->drv || !(*base)->drv) {
> >          goto exit;
> >      }
> >  
> > -    new_top_bs = bdrv_find_overlay(active, top);
> > -
> > -    if (new_top_bs == NULL) {
> > -        /* we could not find the image above 'top', this is an error */
> > -        goto exit;
> > +    for (pbs = (*top)->backing_hd; pbs != *base; pbs = base_bs) {
> > +        assert(pbs);
> > +        base_bs = pbs->backing_hd;
> > +        pbs->backing_hd = NULL;
> > +        bdrv_delete(pbs);
> >      }
> >  
> > -    /* special case of new_top_bs->backing_hd already pointing to base - 
> > nothing
> > -     * to do, no intermediate images */
> > -    if (new_top_bs->backing_hd == base) {
> > -        ret = 0;
> > -        goto exit;
> > -    }
> > +    bdrv_swap(*base, *top);
> >  
> > -    intermediate = top;
> > +    (*base)->backing_hd = NULL;
> > +    bdrv_delete(*base);
> > +    *base = *top;
> >  
> > -    /* now we will go down through the list, and add each BDS we find
> > -     * into our deletion queue, until we hit the 'base'
> > -     */
> > -    while (intermediate) {
> > -        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
> > -        intermediate_state->bs = intermediate;
> > -        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
> > -
> > -        if (intermediate->backing_hd == base) {
> > -            base_bs = intermediate->backing_hd;
> > -            break;
> > -        }
> > -        intermediate = intermediate->backing_hd;
> > -    }
> > -    if (base_bs == NULL) {
> > -        /* something went wrong, we did not end at the base. safely
> > -         * unravel everything, and exit with error */
> > -        goto exit;
> > -    }
> > -
> > -    /* success - we can delete the intermediate states, and link top->base 
> > */
> > -    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
> > -                                   base_bs->drv ? 
> > base_bs->drv->format_name : "");
> > -    if (ret) {
> > -        goto exit;
> > -    }
> > -    new_top_bs->backing_hd = base_bs;
> > -
> > -
> > -    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, 
> > next) {
> > -        /* so that bdrv_close() does not recursively close the chain */
> > -        intermediate_state->bs->backing_hd = NULL;
> > -        bdrv_delete(intermediate_state->bs);
> > +    /* overlay exists when active != top, need to change backing file for 
> > it */
> > +    if (top_overlay) {
> > +        ret = bdrv_change_backing_file(top_overlay, (*base)->filename,
> > +                                       (*base)->drv ?
> > +                                            (*base)->drv->format_name : 
> > "");
> >      }
> > -    ret = 0;
> > -
> >  exit:
> > -    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, 
> > next) {
> > -        g_free(intermediate_state);
> > -    }
> >      return ret;
> >  }
> >  
> > diff --git a/block/commit.c b/block/commit.c
> > index 2227fc2..c85b188 100644
> > --- a/block/commit.c
> > +++ b/block/commit.c
> > @@ -17,14 +17,13 @@
> >  #include "block/blockjob.h"
> >  #include "qemu/ratelimit.h"
> >  
> > -enum {
> > -    /*
> > -     * Size of data buffer for populating the image file.  This should be 
> > large
> > -     * enough to process multiple clusters in a single call, so that 
> > populating
> > -     * contiguous regions of the image is efficient.
> > -     */
> > -    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
> > -};
> > +/*
> > + * Size of data buffer for populating the image file.  This should be large
> > + * enough to process multiple clusters in a single call, so that populating
> > + * contiguous regions of the image is efficient.
> > + */
> > +#define COMMIT_BUFFER_SECTORS 128
> > +#define COMMIT_BUFFER_BYTES (COMMIT_BUFFER_SECTORS * BDRV_SECTOR_SIZE)
> >  
> >  #define SLICE_TIME 100000000ULL /* ns */
> >  
> > @@ -34,6 +33,7 @@ typedef struct CommitBlockJob {
> >      BlockDriverState *active;
> >      BlockDriverState *top;
> >      BlockDriverState *base;
> > +    BlockDriverState *overlay;
> >      BlockdevOnError on_error;
> >      int base_flags;
> >      int orig_overlay_flags;
> > @@ -65,100 +65,109 @@ static void coroutine_fn commit_run(void *opaque)
> >      BlockDriverState *active = s->active;
> >      BlockDriverState *top = s->top;
> >      BlockDriverState *base = s->base;
> > -    BlockDriverState *overlay_bs;
> >      int64_t sector_num, end;
> >      int ret = 0;
> >      int n = 0;
> >      void *buf;
> > -    int bytes_written = 0;
> >      int64_t base_len;
> > +    int64_t next_dirty;
> > +    HBitmapIter hbi;
> >  
> > +    buf = qemu_blockalign(top, COMMIT_BUFFER_BYTES);
> >      ret = s->common.len = bdrv_getlength(top);
> >  
> > -
> >      if (s->common.len < 0) {
> > -        goto exit_restore_reopen;
> > +        goto exit;
> >      }
> >  
> >      ret = base_len = bdrv_getlength(base);
> >      if (base_len < 0) {
> > -        goto exit_restore_reopen;
> > +        goto exit;
> >      }
> >  
> >      if (base_len < s->common.len) {
> >          ret = bdrv_truncate(base, s->common.len);
> >          if (ret) {
> > -            goto exit_restore_reopen;
> > +            goto exit;
> >          }
> >      }
> >  
> >      end = s->common.len >> BDRV_SECTOR_BITS;
> > -    buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);
> >  
> >      for (sector_num = 0; sector_num < end; sector_num += n) {
> > -        uint64_t delay_ns = 0;
> > -        bool copy;
> >  
> > -wait:
> > -        /* Note that even when no rate limit is applied we need to yield
> > -         * with no pending I/O here so that bdrv_drain_all() returns.
> > -         */
> > -        block_job_sleep_ns(&s->common, rt_clock, delay_ns);
> > -        if (block_job_is_cancelled(&s->common)) {
> > -            break;
> > -        }
> >          /* Copy if allocated above the base */
> >          ret = bdrv_co_is_allocated_above(top, base, sector_num,
> > -                                         COMMIT_BUFFER_SIZE / 
> > BDRV_SECTOR_SIZE,
> > +                                         COMMIT_BUFFER_SECTORS,
> >                                           &n);
> > -        copy = (ret == 1);
> > -        trace_commit_one_iteration(s, sector_num, n, ret);
> > -        if (copy) {
> > -            if (s->common.speed) {
> > -                delay_ns = ratelimit_calculate_delay(&s->limit, n);
> > -                if (delay_ns > 0) {
> > -                    goto wait;
> > -                }
> > -            }
> > -            ret = commit_populate(top, base, sector_num, n, buf);
> > -            bytes_written += n * BDRV_SECTOR_SIZE;
> > +        if (ret) {
> > +            bdrv_set_dirty(top, sector_num, n);
> > +        }
> > +    }
> > +
> > +    while (bdrv_get_dirty_count(s->top)) {
> > +        uint64_t delay_ns = 0;
> > +        if (block_job_is_cancelled(&s->common)) {
> > +            goto exit;
> >          }
> > -        if (ret < 0) {
> > -            if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
> > -                s->on_error == BLOCKDEV_ON_ERROR_REPORT||
> > -                (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == 
> > -ENOSPC)) {
> > -                goto exit_free_buf;
> > -            } else {
> > -                n = 0;
> > -                continue;
> > +
> > +        bdrv_dirty_iter_init(s->top, &hbi);
> > +        for (next_dirty = hbitmap_iter_next(&hbi);
> > +                next_dirty >= 0;
> > +                next_dirty = hbitmap_iter_next(&hbi)) {
> > +            sector_num = next_dirty;
> > +            if (block_job_is_cancelled(&s->common)) {
> > +                goto exit;
> >              }
> > +            delay_ns = ratelimit_calculate_delay(&s->limit,
> > +                                                 COMMIT_BUFFER_SECTORS);
> > +            /* Note that even when no rate limit is applied we need to 
> > yield
> > +             * with no pending I/O here so that bdrv_drain_all() returns.
> > +             */
> > +            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
> > +            trace_commit_one_iteration(s, sector_num,
> > +                                       COMMIT_BUFFER_SECTORS, ret);
> > +            ret = commit_populate(top, base, sector_num,
> > +                                  COMMIT_BUFFER_SECTORS, buf);
> > +            if (ret < 0) {
> > +                if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
> > +                    s->on_error == BLOCKDEV_ON_ERROR_REPORT ||
> > +                    (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC &&
> > +                         ret == -ENOSPC)) {
> > +                    goto exit;
> > +                } else {
> > +                    continue;
> > +                }
> > +            }
> > +            /* Publish progress */
> > +            s->common.offset += COMMIT_BUFFER_BYTES;
> > +            bdrv_reset_dirty(top, sector_num, COMMIT_BUFFER_SECTORS);
> >          }
> > -        /* Publish progress */
> > -        s->common.offset += n * BDRV_SECTOR_SIZE;
> >      }
> >  
> > -    ret = 0;
> > -
> > -    if (!block_job_is_cancelled(&s->common) && sector_num == end) {
> > -        /* success */
> > -        ret = bdrv_drop_intermediate(active, top, base);
> > +    if (!block_job_is_cancelled(&s->common)) {
> > +        /* Drop intermediate: [top, base) */
> > +        ret = bdrv_drop_intermediate(s->overlay, &top, &base);
> > +        s->common.offset = s->common.len;
> >      }
> >  
> > -exit_free_buf:
> > -    qemu_vfree(buf);
> > +    ret = 0;
> > +
> > +exit:
> > +    bdrv_set_dirty_tracking(active, 0);
> >  
> > -exit_restore_reopen:
> >      /* restore base open flags here if appropriate (e.g., change the base 
> > back
> >       * to r/o). These reopens do not need to be atomic, since we won't 
> > abort
> >       * even on failure here */
> > -    if (s->base_flags != bdrv_get_flags(base)) {
> > +    if (s->overlay && s->base_flags != bdrv_get_flags(base)) {
> >          bdrv_reopen(base, s->base_flags, NULL);
> >      }
> > -    overlay_bs = bdrv_find_overlay(active, top);
> > -    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) 
> > {
> > -        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
> > +
> > +    if (s->overlay && s->orig_overlay_flags != bdrv_get_flags(s->overlay)) 
> > {
> > +        bdrv_reopen(s->overlay, s->orig_overlay_flags, NULL);
> >      }
> >  
> > +    qemu_vfree(buf);
> >      block_job_completed(&s->common, ret);
> >  }
> >  
> > @@ -198,13 +207,6 @@ void commit_start(BlockDriverState *bs, 
> > BlockDriverState *base,
> >          return;
> >      }
> >  
> > -    /* Once we support top == active layer, remove this check */
> > -    if (top == bs) {
> > -        error_setg(errp,
> > -                   "Top image as the active layer is currently 
> > unsupported");
> > -        return;
> > -    }
> > -
> >      if (top == base) {
> >          error_setg(errp, "Invalid files for merge: top and base are the 
> > same");
> >          return;
> > @@ -212,23 +214,20 @@ void commit_start(BlockDriverState *bs, 
> > BlockDriverState *base,
> >  
> >      overlay_bs = bdrv_find_overlay(bs, top);
> >  
> > -    if (overlay_bs == NULL) {
> > -        error_setg(errp, "Could not find overlay image for %s:", 
> > top->filename);
> > -        return;
> > -    }
> > -
> >      orig_base_flags    = bdrv_get_flags(base);
> > -    orig_overlay_flags = bdrv_get_flags(overlay_bs);
> > +    if (overlay_bs) {
> > +        orig_overlay_flags = bdrv_get_flags(overlay_bs);
> > +        if (!(orig_overlay_flags & BDRV_O_RDWR)) {
> > +            reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
> > +                    orig_overlay_flags | BDRV_O_RDWR);
> > +        }
> > +    }
> >  
> >      /* convert base & overlay_bs to r/w, if necessary */
> >      if (!(orig_base_flags & BDRV_O_RDWR)) {
> >          reopen_queue = bdrv_reopen_queue(reopen_queue, base,
> >                                           orig_base_flags | BDRV_O_RDWR);
> >      }
> > -    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
> > -        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
> > -                                         orig_overlay_flags | BDRV_O_RDWR);
> > -    }
> >      if (reopen_queue) {
> >          bdrv_reopen_multiple(reopen_queue, &local_err);
> >          if (local_err != NULL) {
> > @@ -237,7 +236,6 @@ void commit_start(BlockDriverState *bs, 
> > BlockDriverState *base,
> >          }
> >      }
> >  
> > -
> >      s = block_job_create(&commit_job_type, bs, speed, cb, opaque, errp);
> >      if (!s) {
> >          return;
> > @@ -246,13 +244,19 @@ void commit_start(BlockDriverState *bs, 
> > BlockDriverState *base,
> >      s->base   = base;
> >      s->top    = top;
> >      s->active = bs;
> > +    s->overlay = overlay_bs;
> >  
> >      s->base_flags          = orig_base_flags;
> > -    s->orig_overlay_flags  = orig_overlay_flags;
> > +    if (overlay_bs) {
> > +        s->orig_overlay_flags  = orig_overlay_flags;
> > +    }
> >  
> >      s->on_error = on_error;
> >      s->common.co = qemu_coroutine_create(commit_run);
> >  
> >      trace_commit_start(bs, base, top, s, s->common.co, opaque);
> > +
> > +    bdrv_set_dirty_tracking(top, COMMIT_BUFFER_BYTES);
> > +
> >      qemu_coroutine_enter(s->common.co, s);
> >  }
> > diff --git a/include/block/block.h b/include/block/block.h
> > index b6b9014..caf2c22 100644
> > --- a/include/block/block.h
> > +++ b/include/block/block.h
> > @@ -197,8 +197,9 @@ int bdrv_commit_all(void);
> >  int bdrv_change_backing_file(BlockDriverState *bs,
> >      const char *backing_file, const char *backing_fmt);
> >  void bdrv_register(BlockDriver *bdrv);
> > -int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
> > -                           BlockDriverState *base);
> > +int bdrv_drop_intermediate(BlockDriverState *top_overlay,
> > +                           BlockDriverState **top,
> > +                           BlockDriverState **base);
> >  BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
> >                                      BlockDriverState *bs);
> >  BlockDriverState *bdrv_find_base(BlockDriverState *bs);
> > 
> 

-- 
Fam



reply via email to

[Prev in Thread] Current Thread [Next in Thread]