qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 8/8] quorum: add basic device recovery logic


From: Benoît Canet
Subject: Re: [Qemu-devel] [PATCH 8/8] quorum: add basic device recovery logic
Date: Mon, 1 Sep 2014 11:37:20 +0200
User-agent: Mutt/1.5.23 (2014-03-12)

The Monday 01 Sep 2014 à 15:43:14 (+0800), Liu Yuan wrote :
> For some configurations, quorum allows VMs to continue while some child devices
> are broken. When the child devices are repaired and come back, we need to
> sync the bits dirtied during the downtime to keep the data consistent.
> 
> The recovery logic is based on the driver state bitmap and will sync the dirty
> bits with a timeslice window in a coroutine in this primitive implementation.
> 
> Simple graph about 2 children with threshold=1 and read-pattern=fifo:
> 
> + denote device sync iteration
> - IO on a single device
> = IO on two devices
> 
>                                       sync complete, release dirty bitmap
>                                          ^
>                                          |
>   ====-----------------++++----++++----++==========
>      |                 |
>      |                 v
>      |               device repaired and begin to sync
>      v
>    device broken, create a dirty bitmap
> 
>   This sync logic can take care of the nested-failure problem, where devices
>   break again while a sync is in progress. We just start a sync process after
>   the devices are repaired again and switch the devices from broken to sound
>   only when the sync completes.
> 
> The read-pattern=quorum mode benefits from the recovery logic without any problem.
> 
> Cc: Eric Blake <address@hidden>
> Cc: Benoit Canet <address@hidden>
> Cc: Kevin Wolf <address@hidden>
> Cc: Stefan Hajnoczi <address@hidden>
> Signed-off-by: Liu Yuan <address@hidden>
> ---
>  block/quorum.c | 189 
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  trace-events   |   5 ++
>  2 files changed, 191 insertions(+), 3 deletions(-)
> 
> diff --git a/block/quorum.c b/block/quorum.c
> index 7b07e35..ffd7c2d 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -23,6 +23,7 @@
>  #include "qapi/qmp/qlist.h"
>  #include "qapi/qmp/qstring.h"
>  #include "qapi-event.h"
> +#include "trace.h"
>  
>  #define HASH_LENGTH 32
>  
> @@ -31,6 +32,10 @@
>  #define QUORUM_OPT_REWRITE        "rewrite-corrupted"
>  #define QUORUM_OPT_READ_PATTERN   "read-pattern"
>  
> +#define SLICE_TIME          100000000ULL /* 100 ms */
> +#define CHUNK_SIZE          (1 << 20) /* 1M */
> +#define SECTORS_PER_CHUNK   (CHUNK_SIZE >> BDRV_SECTOR_BITS)
> +
>  /* This union holds a vote hash value */
>  typedef union QuorumVoteValue {
>      char h[HASH_LENGTH];       /* SHA-256 hash */
> @@ -64,6 +69,7 @@ typedef struct QuorumVotes {
>  
>  /* the following structure holds the state of one quorum instance */
>  typedef struct BDRVQuorumState {
> +    BlockDriverState *mybs;/* Quorum block driver base state */
>      BlockDriverState **bs; /* children BlockDriverStates */
>      int num_children;      /* children count */
>      int threshold;         /* if less than threshold children reads gave the
> @@ -82,6 +88,10 @@ typedef struct BDRVQuorumState {
>                              */
>  
>      QuorumReadPattern read_pattern;
> +    BdrvDirtyBitmap *dirty_bitmap;
> +    uint8_t *sync_buf;
> +    HBitmapIter hbi;
> +    int64_t sector_num;
>  } BDRVQuorumState;
>  
>  typedef struct QuorumAIOCB QuorumAIOCB;
> @@ -290,12 +300,11 @@ static void quorum_copy_qiov(QEMUIOVector *dest, 
> QEMUIOVector *source)
>      }
>  }
>  
> -static int next_fifo_child(QuorumAIOCB *acb)
> +static int get_good_child(BDRVQuorumState *s, int iter)
>  {
> -    BDRVQuorumState *s = acb->common.bs->opaque;
>      int i;
>  
> -    for (i = acb->child_iter; i < s->num_children; i++) {
> +    for (i = iter; i < s->num_children; i++) {
>          if (!s->bs[i]->broken) {
>              break;
>          }
> @@ -306,6 +315,13 @@ static int next_fifo_child(QuorumAIOCB *acb)
>      return i;
>  }
>  
> +static int next_fifo_child(QuorumAIOCB *acb)
> +{
> +    BDRVQuorumState *s = acb->common.bs->opaque;
> +
> +    return get_good_child(s, acb->child_iter);
> +}
> +
>  static void quorum_aio_cb(void *opaque, int ret)
>  {
>      QuorumChildRequest *sacb = opaque;
> @@ -951,6 +967,171 @@ static int parse_read_pattern(const char *opt)
>      return -EINVAL;
>  }
>  
> +static void sync_prepare(BDRVQuorumState *qs, int64_t *num)
> +{
> +    int64_t nb, total = bdrv_nb_sectors(qs->mybs);
> +
> +    qs->sector_num = hbitmap_iter_next(&qs->hbi);
> +    /* Wrap around if previous bits get dirty while syncing */
> +    if (qs->sector_num < 0) {
> +        bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi);
> +        qs->sector_num = hbitmap_iter_next(&qs->hbi);
> +        assert(qs->sector_num >= 0);
> +    }
> +
> +    for (nb = 1; nb < SECTORS_PER_CHUNK && qs->sector_num + nb < total;
> +         nb++) {
> +        if (!bdrv_get_dirty(qs->mybs, qs->dirty_bitmap, qs->sector_num + 
> nb)) {
> +            break;
> +        }
> +    }
> +    *num = nb;
> +}
> +
> +static void sync_finish(BDRVQuorumState *qs, int64_t num)
> +{
> +    int64_t i;
> +
> +    for (i = 0; i < num; i++) {
> +        /* We need to advance the iterator manually */
> +        hbitmap_iter_next(&qs->hbi);
> +    }
> +    bdrv_reset_dirty(qs->mybs, qs->sector_num, num);
> +}
> +
> +static int quorum_sync_iteration(BDRVQuorumState *qs, BlockDriverState 
> *target)
> +{
> +    BlockDriverState *source;
> +    QEMUIOVector qiov;
> +    int ret, good;
> +    int64_t nb_sectors;
> +    struct iovec iov;
> +    const char *sname, *tname = bdrv_get_filename(target);
> +
> +    good = get_good_child(qs, 0);
> +    if (good < 0) {
> +        error_report("No good device available.");
> +        return -1;
> +    }
> +    source = qs->bs[good];
> +    sname = bdrv_get_filename(source);
> +    sync_prepare(qs, &nb_sectors);
> +    iov.iov_base = qs->sync_buf;
> +    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
> +    qemu_iovec_init_external(&qiov, &iov, 1);
> +
> +    trace_quorum_sync_iteration(sname, tname, qs->sector_num, nb_sectors);
> +    ret = bdrv_co_readv(source, qs->sector_num, nb_sectors, &qiov);
> +    if (ret < 0) {
> +        error_report("Read source %s failed.", sname);

I didn't read this patch thoroughly, but in quorum, if you need to name a child BDS
you must use bs->node_name.

bs->node_name was introduced to be able to merge quorum and to uniquely identify a
given node of the BDS graph.

Best regards

Benoît

> +        return ret;
> +    }
> +    ret = bdrv_co_writev(target, qs->sector_num, nb_sectors, &qiov);
> +    if (ret < 0) {
> +        error_report("Write target %s failed.", tname);
> +        return ret;
> +    }
> +    sync_finish(qs, nb_sectors);
> +
> +    return 0;
> +}
> +
> +static int quorum_sync_device(BDRVQuorumState *qs, BlockDriverState *target)
> +{
> +    uint64_t last_pause_ns;
> +
> +    bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi);
> +    last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +    for (;;) {
> +        int64_t cnt;
> +
> +        cnt = bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap);
> +        if (cnt == 0) {
> +            break;
> +        }
> +        error_report("count %ld", cnt);
> +        if (quorum_sync_iteration(qs, target) < 0) {
> +            return -1;
> +        }
> +        cnt = bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap);
> +        if (cnt == 0) {
> +            break;
> +        }
> +
> +        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns >=
> +            SLICE_TIME) {
> +            co_aio_sleep_ns(bdrv_get_aio_context(target), 
> QEMU_CLOCK_REALTIME,
> +                            SLICE_TIME);
> +            last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static BlockDriverState *file_to_bs(BDRVQuorumState *qs, BlockDriverState 
> *file)
> +{
> +    int i;
> +
> +    for (i = 0; i < qs->num_children; i++) {
> +        BlockDriverState *f = bdrv_get_file(qs->bs[i]);
> +
> +        if (f == file) {
> +            return qs->bs[i];
> +        }
> +    }
> +
> +    error_report("Can't find driver state for %s", bdrv_get_filename(file));
> +    abort();
> +}
> +
> +static void quorum_driver_reconnect(BlockDriverState *file)
> +{
> +    BDRVQuorumState *qs = file->drv_opaque;
> +    BlockDriverState *bs = file_to_bs(qs, file);
> +    const char *name = bdrv_get_filename(bs);
> +
> +    trace_quorum_driver_reconnect(name);
> +    assert(bs->broken == true);
> +    if (quorum_sync_device(qs, bs) < 0) {
> +        error_report("Failed to sync device %s", name);
> +        return;
> +    }
> +
> +    bdrv_release_dirty_bitmap(qs->mybs, qs->dirty_bitmap);
> +    qemu_vfree(qs->sync_buf);
> +    bs->broken = false;
> +}
> +
> +static void quorum_driver_disconnect(BlockDriverState *file)
> +{
> +    BDRVQuorumState *qs = file->drv_opaque;
> +    BlockDriverState *bs = file_to_bs(qs, file);
> +    const char *name = bdrv_get_filename(bs);
> +
> +    trace_quorum_driver_disconnect(name);
> +    /*
> +     * If we are disconnected while being syncing, we expect to reconnect to 
> the
> +     * target again and resume the data sync from the last synced point.
> +     */
> +    if (bs->broken) {
> +        return;
> +    }
> +
> +    bs->broken = true;
> +    qs->dirty_bitmap = bdrv_create_dirty_bitmap(qs->mybs, BDRV_SECTOR_SIZE,
> +                                                NULL);
> +    if (!qs->dirty_bitmap) {
> +        abort();
> +    }
> +    qs->sync_buf = qemu_blockalign(bs, CHUNK_SIZE);
> +}
> +
> +static const BlockDrvOps quorum_block_drv_ops = {
> +    .driver_reconnect = quorum_driver_reconnect,
> +    .driver_disconnect = quorum_driver_disconnect,
> +};
> +
>  static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
>                         Error **errp)
>  {
> @@ -975,6 +1156,7 @@ static int quorum_open(BlockDriverState *bs, QDict 
> *options, int flags,
>          goto exit;
>      }
>  
> +    s->mybs = bs;
>      /* count how many different children are present */
>      s->num_children = qlist_size(list);
>      if (s->num_children < 2) {
> @@ -1061,6 +1243,7 @@ static int quorum_open(BlockDriverState *bs, QDict 
> *options, int flags,
>              goto close_exit;
>          }
>          opened[i] = true;
> +        bdrv_set_drv_ops(bdrv_get_file(s->bs[i]), &quorum_block_drv_ops, s);
>      }
>  
>      g_free(opened);
> diff --git a/trace-events b/trace-events
> index 81bc915..8da0a13 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -572,6 +572,11 @@ qed_aio_write_prefill(void *s, void *acb, uint64_t 
> start, size_t len, uint64_t o
>  qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, 
> uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
>  qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) 
> "s %p acb %p ret %d offset %"PRIu64" len %zu"
>  
> +# block/quorum.c
> +quorum_sync_iteration(const char *source, const char *target, int64_t 
> sector, int num) "%s -> %s, sector %"PRId64" nb_sectors %d"
> +quorum_driver_reconnect(const char *target) "%s"
> +quorum_driver_disconnect(const char *target) "%s"
> +
>  # hw/display/g364fb.c
>  g364fb_read(uint64_t addr, uint32_t val) "read addr=0x%"PRIx64": 0x%x"
>  g364fb_write(uint64_t addr, uint32_t new) "write addr=0x%"PRIx64": 0x%x"
> -- 
> 1.9.1
> 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]