[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH 8/8] quorum: add basic device recovery logic
From: |
Benoît Canet |
Subject: |
Re: [Qemu-devel] [PATCH 8/8] quorum: add basic device recovery logic |
Date: |
Mon, 1 Sep 2014 11:37:20 +0200 |
User-agent: |
Mutt/1.5.23 (2014-03-12) |
The Monday 01 Sep 2014 à 15:43:14 (+0800), Liu Yuan wrote :
> For some configurations, quorum allows VMs to continue running while some
> child devices are broken. When the child devices are repaired and come back,
> we need to sync the bits dirtied during the downtime to keep the data
> consistent.
>
> The recovery logic is based on the driver state bitmap and, in this primitive
> implementation, syncs the dirty bits in a coroutine using a timeslice window.
>
> Simple graph about 2 children with threshold=1 and read-pattern=fifo:
>
> + denote device sync iteration
> - IO on a single device
> = IO on two devices
>
>                       sync complete, release dirty bitmap
>                                        ^
>                                        |
> ====-----------------++++----++++----++==========
>     |                |
>     |                v
>     | device repaired and begin to sync
>     v
>     device broken, create a dirty bitmap
>
> This sync logic also handles the nested-failure case, where devices break
> again while a sync is in progress. We just start a new sync process after the
> devices are repaired again, and switch the devices from broken to sound only
> when the sync completes.
>
> For read-pattern=quorum mode, it enjoys the recovery logic without any
> problem.
>
> Cc: Eric Blake <address@hidden>
> Cc: Benoit Canet <address@hidden>
> Cc: Kevin Wolf <address@hidden>
> Cc: Stefan Hajnoczi <address@hidden>
> Signed-off-by: Liu Yuan <address@hidden>
> ---
> block/quorum.c | 189
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> trace-events | 5 ++
> 2 files changed, 191 insertions(+), 3 deletions(-)
>
> diff --git a/block/quorum.c b/block/quorum.c
> index 7b07e35..ffd7c2d 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -23,6 +23,7 @@
> #include "qapi/qmp/qlist.h"
> #include "qapi/qmp/qstring.h"
> #include "qapi-event.h"
> +#include "trace.h"
>
> #define HASH_LENGTH 32
>
> @@ -31,6 +32,10 @@
> #define QUORUM_OPT_REWRITE "rewrite-corrupted"
> #define QUORUM_OPT_READ_PATTERN "read-pattern"
>
> +#define SLICE_TIME 100000000ULL /* 100 ms */
> +#define CHUNK_SIZE (1 << 20) /* 1M */
> +#define SECTORS_PER_CHUNK (CHUNK_SIZE >> BDRV_SECTOR_BITS)
> +
> /* This union holds a vote hash value */
> typedef union QuorumVoteValue {
> char h[HASH_LENGTH]; /* SHA-256 hash */
> @@ -64,6 +69,7 @@ typedef struct QuorumVotes {
>
> /* the following structure holds the state of one quorum instance */
> typedef struct BDRVQuorumState {
> + BlockDriverState *mybs;/* Quorum block driver base state */
> BlockDriverState **bs; /* children BlockDriverStates */
> int num_children; /* children count */
> int threshold; /* if less than threshold children reads gave the
> @@ -82,6 +88,10 @@ typedef struct BDRVQuorumState {
> */
>
> QuorumReadPattern read_pattern;
> + BdrvDirtyBitmap *dirty_bitmap;
> + uint8_t *sync_buf;
> + HBitmapIter hbi;
> + int64_t sector_num;
> } BDRVQuorumState;
>
> typedef struct QuorumAIOCB QuorumAIOCB;
> @@ -290,12 +300,11 @@ static void quorum_copy_qiov(QEMUIOVector *dest,
> QEMUIOVector *source)
> }
> }
>
> -static int next_fifo_child(QuorumAIOCB *acb)
> +static int get_good_child(BDRVQuorumState *s, int iter)
> {
> - BDRVQuorumState *s = acb->common.bs->opaque;
> int i;
>
> - for (i = acb->child_iter; i < s->num_children; i++) {
> + for (i = iter; i < s->num_children; i++) {
> if (!s->bs[i]->broken) {
> break;
> }
> @@ -306,6 +315,13 @@ static int next_fifo_child(QuorumAIOCB *acb)
> return i;
> }
>
> +static int next_fifo_child(QuorumAIOCB *acb)
> +{
> + BDRVQuorumState *s = acb->common.bs->opaque;
> +
> + return get_good_child(s, acb->child_iter);
> +}
> +
> static void quorum_aio_cb(void *opaque, int ret)
> {
> QuorumChildRequest *sacb = opaque;
> @@ -951,6 +967,171 @@ static int parse_read_pattern(const char *opt)
> return -EINVAL;
> }
>
> +static void sync_prepare(BDRVQuorumState *qs, int64_t *num)
> +{
> + int64_t nb, total = bdrv_nb_sectors(qs->mybs);
> +
> + qs->sector_num = hbitmap_iter_next(&qs->hbi);
> + /* Wrap around if previous bits get dirty while syncing */
> + if (qs->sector_num < 0) {
> + bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi);
> + qs->sector_num = hbitmap_iter_next(&qs->hbi);
> + assert(qs->sector_num >= 0);
> + }
> +
> + for (nb = 1; nb < SECTORS_PER_CHUNK && qs->sector_num + nb < total;
> + nb++) {
> + if (!bdrv_get_dirty(qs->mybs, qs->dirty_bitmap, qs->sector_num +
> nb)) {
> + break;
> + }
> + }
> + *num = nb;
> +}
> +
> +static void sync_finish(BDRVQuorumState *qs, int64_t num)
> +{
> + int64_t i;
> +
> + for (i = 0; i < num; i++) {
> + /* We need to advance the iterator manually */
> + hbitmap_iter_next(&qs->hbi);
> + }
> + bdrv_reset_dirty(qs->mybs, qs->sector_num, num);
> +}
> +
> +static int quorum_sync_iteration(BDRVQuorumState *qs, BlockDriverState
> *target)
> +{
> + BlockDriverState *source;
> + QEMUIOVector qiov;
> + int ret, good;
> + int64_t nb_sectors;
> + struct iovec iov;
> + const char *sname, *tname = bdrv_get_filename(target);
> +
> + good = get_good_child(qs, 0);
> + if (good < 0) {
> + error_report("No good device available.");
> + return -1;
> + }
> + source = qs->bs[good];
> + sname = bdrv_get_filename(source);
> + sync_prepare(qs, &nb_sectors);
> + iov.iov_base = qs->sync_buf;
> + iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
> + qemu_iovec_init_external(&qiov, &iov, 1);
> +
> + trace_quorum_sync_iteration(sname, tname, qs->sector_num, nb_sectors);
> + ret = bdrv_co_readv(source, qs->sector_num, nb_sectors, &qiov);
> + if (ret < 0) {
> + error_report("Read source %s failed.", sname);
I didn't read this patch thoroughly, but in quorum, if you need to name a child
BDS you must use bs->node_name.
bs->node_name was introduced to be able to merge quorum and to uniquely identify
a given node of the BDS graph.
Best regards
Benoît
> + return ret;
> + }
> + ret = bdrv_co_writev(target, qs->sector_num, nb_sectors, &qiov);
> + if (ret < 0) {
> + error_report("Write target %s failed.", tname);
> + return ret;
> + }
> + sync_finish(qs, nb_sectors);
> +
> + return 0;
> +}
> +
> +static int quorum_sync_device(BDRVQuorumState *qs, BlockDriverState *target)
> +{
> + uint64_t last_pause_ns;
> +
> + bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi);
> + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> + for (;;) {
> + int64_t cnt;
> +
> + cnt = bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap);
> + if (cnt == 0) {
> + break;
> + }
> + error_report("count %ld", cnt);
> + if (quorum_sync_iteration(qs, target) < 0) {
> + return -1;
> + }
> + cnt = bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap);
> + if (cnt == 0) {
> + break;
> + }
> +
> + if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns >=
> + SLICE_TIME) {
> + co_aio_sleep_ns(bdrv_get_aio_context(target),
> QEMU_CLOCK_REALTIME,
> + SLICE_TIME);
> + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> + }
> + }
> +
> + return 0;
> +}
> +
> +static BlockDriverState *file_to_bs(BDRVQuorumState *qs, BlockDriverState
> *file)
> +{
> + int i;
> +
> + for (i = 0; i < qs->num_children; i++) {
> + BlockDriverState *f = bdrv_get_file(qs->bs[i]);
> +
> + if (f == file) {
> + return qs->bs[i];
> + }
> + }
> +
> + error_report("Can't find driver state for %s", bdrv_get_filename(file));
> + abort();
> +}
> +
> +static void quorum_driver_reconnect(BlockDriverState *file)
> +{
> + BDRVQuorumState *qs = file->drv_opaque;
> + BlockDriverState *bs = file_to_bs(qs, file);
> + const char *name = bdrv_get_filename(bs);
> +
> + trace_quorum_driver_reconnect(name);
> + assert(bs->broken == true);
> + if (quorum_sync_device(qs, bs) < 0) {
> + error_report("Failed to sync device %s", name);
> + return;
> + }
> +
> + bdrv_release_dirty_bitmap(qs->mybs, qs->dirty_bitmap);
> + qemu_vfree(qs->sync_buf);
> + bs->broken = false;
> +}
> +
> +static void quorum_driver_disconnect(BlockDriverState *file)
> +{
> + BDRVQuorumState *qs = file->drv_opaque;
> + BlockDriverState *bs = file_to_bs(qs, file);
> + const char *name = bdrv_get_filename(bs);
> +
> + trace_quorum_driver_disconnect(name);
> + /*
> + * If we are disconnected while being syncing, we expect to reconnect to
> the
> + * target again and resume the data sync from the last synced point.
> + */
> + if (bs->broken) {
> + return;
> + }
> +
> + bs->broken = true;
> + qs->dirty_bitmap = bdrv_create_dirty_bitmap(qs->mybs, BDRV_SECTOR_SIZE,
> + NULL);
> + if (!qs->dirty_bitmap) {
> + abort();
> + }
> + qs->sync_buf = qemu_blockalign(bs, CHUNK_SIZE);
> +}
> +
> +static const BlockDrvOps quorum_block_drv_ops = {
> + .driver_reconnect = quorum_driver_reconnect,
> + .driver_disconnect = quorum_driver_disconnect,
> +};
> +
> static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
> Error **errp)
> {
> @@ -975,6 +1156,7 @@ static int quorum_open(BlockDriverState *bs, QDict
> *options, int flags,
> goto exit;
> }
>
> + s->mybs = bs;
> /* count how many different children are present */
> s->num_children = qlist_size(list);
> if (s->num_children < 2) {
> @@ -1061,6 +1243,7 @@ static int quorum_open(BlockDriverState *bs, QDict
> *options, int flags,
> goto close_exit;
> }
> opened[i] = true;
> + bdrv_set_drv_ops(bdrv_get_file(s->bs[i]), &quorum_block_drv_ops, s);
> }
>
> g_free(opened);
> diff --git a/trace-events b/trace-events
> index 81bc915..8da0a13 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -572,6 +572,11 @@ qed_aio_write_prefill(void *s, void *acb, uint64_t
> start, size_t len, uint64_t o
> qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len,
> uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
> qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len)
> "s %p acb %p ret %d offset %"PRIu64" len %zu"
>
> +# block/quorum.c
> +quorum_sync_iteration(const char *source, const char *target, int64_t
> sector, int num) "%s -> %s, sector %"PRId64" nb_sectors %d"
> +quorum_driver_reconnect(const char *target) "%s"
> +quorum_driver_disconnect(const char *target) "%s"
> +
> # hw/display/g364fb.c
> g364fb_read(uint64_t addr, uint32_t val) "read addr=0x%"PRIx64": 0x%x"
> g364fb_write(uint64_t addr, uint32_t new) "write addr=0x%"PRIx64": 0x%x"
> --
> 1.9.1
>
- Re: [Qemu-devel] [PATCH 5/8] quorum: fix quorum_aio_cancel(), (continued)
[Qemu-devel] [PATCH 6/8] block/quorum: add broken state to BlockDriverState, Liu Yuan, 2014/09/01
[Qemu-devel] [PATCH 7/8] block: add two helpers, Liu Yuan, 2014/09/01
[Qemu-devel] [PATCH 8/8] quorum: add basic device recovery logic, Liu Yuan, 2014/09/01
- Re: [Qemu-devel] [PATCH 8/8] quorum: add basic device recovery logic,
Benoît Canet <=
Re: [Qemu-devel] [PATCH 0/8] add basic recovery logic to quorum driver, Benoît Canet, 2014/09/01
Re: [Qemu-devel] [PATCH 0/8] add basic recovery logic to quorum driver, Benoît Canet, 2014/09/02
Re: [Qemu-devel] [PATCH 0/8] add basic recovery logic to quorum driver, Benoît Canet, 2014/09/07