qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 6/8] block/quorum: add broken state to BlockDriv


From: Benoît Canet
Subject: Re: [Qemu-devel] [PATCH 6/8] block/quorum: add broken state to BlockDriverState
Date: Mon, 1 Sep 2014 10:57:43 +0200
User-agent: Mutt/1.5.23 (2014-03-12)

The Monday 01 Sep 2014 à 15:43:12 (+0800), Liu Yuan wrote :
> This allow VM continues to process even if some devices are broken meanwhile
> with proper configuration.
> 
> We mark the device broken when the protocol tier notify back some broken
> state(s) of the device, such as diconnection via driver operations. We could
> also reset the device as sound when the protocol tier is repaired.
> 
> Origianlly .threshold controls how we should decide the success of read/write
> and return the failure only if the success count of read/write is less than
> .threshold specified by users. But it doesn't record the states of underlying
> states and will impact performance a bit in some cases.
> 
> For example, we have 3 children and .threshold is set 2. If one of the devices
> broken, we should still return success and continue to run VM. But for every
> IO operations, we will blindly send the requests to the broken device.
> 
> To store broken state into driver state we can save requests to borken devices
> and resend the requests to the repaired ones by setting broken as false.
> 
> This is especially useful for network based protocol such as sheepdog, which
> has a auto-reconnection mechanism and will never report EIO if the connection
> is broken but just store the requests to its local queue and wait for 
> resending.
> Without broken state, quorum request will not come back until the connection 
> is
> re-established. So we have to skip the broken deivces to allow VM to continue
> running with networked backed child (glusterfs, nfs, sheepdog, etc).
> 
> With the combination of read-pattern and threshold, we can easily mimic the 
> DRVD
> behavior with following configuration:
> 
>  read-pattern=fifo,threshold=1 will two children.
> 
> Cc: Eric Blake <address@hidden>
> Cc: Benoit Canet <address@hidden>
> Cc: Kevin Wolf <address@hidden>
> Cc: Stefan Hajnoczi <address@hidden>
> Signed-off-by: Liu Yuan <address@hidden>
> ---
>  block/quorum.c            | 102 
> ++++++++++++++++++++++++++++++++++++++--------
>  include/block/block_int.h |   3 ++
>  2 files changed, 87 insertions(+), 18 deletions(-)
> 
> diff --git a/block/quorum.c b/block/quorum.c
> index b9eeda3..7b07e35 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -120,6 +120,7 @@ struct QuorumAIOCB {
>      int rewrite_count;          /* number of replica to rewrite: count down 
> to
>                                   * zero once writes are fired
>                                   */
> +    int issued_count;           /* actual read&write issued count */
>  
>      QuorumVotes votes;
>  
> @@ -170,8 +171,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
>      if (acb->is_read) {
>          /* on the quorum case acb->child_iter == s->num_children - 1 */
>          for (i = 0; i <= acb->child_iter; i++) {
> -            qemu_vfree(acb->qcrs[i].buf);
> -            qemu_iovec_destroy(&acb->qcrs[i].qiov);
> +            if (acb->qcrs[i].buf) {
> +                qemu_vfree(acb->qcrs[i].buf);
> +                qemu_iovec_destroy(&acb->qcrs[i].qiov);
> +            }
>          }
>      }
>  
> @@ -207,6 +210,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
>      acb->count = 0;
>      acb->success_count = 0;
>      acb->rewrite_count = 0;
> +    acb->issued_count = 0;
>      acb->votes.compare = quorum_sha256_compare;
>      QLIST_INIT(&acb->votes.vote_list);
>      acb->is_read = false;
> @@ -286,6 +290,22 @@ static void quorum_copy_qiov(QEMUIOVector *dest, 
> QEMUIOVector *source)
>      }
>  }
>  
> +static int next_fifo_child(QuorumAIOCB *acb)
> +{
> +    BDRVQuorumState *s = acb->common.bs->opaque;
> +    int i;
> +
> +    for (i = acb->child_iter; i < s->num_children; i++) {
> +        if (!s->bs[i]->broken) {
> +            break;
> +        }
> +    }
> +    if (i == s->num_children) {
> +        return -1;
> +    }
> +    return i;
> +}
> +
>  static void quorum_aio_cb(void *opaque, int ret)
>  {
>      QuorumChildRequest *sacb = opaque;
> @@ -293,11 +313,18 @@ static void quorum_aio_cb(void *opaque, int ret)
>      BDRVQuorumState *s = acb->common.bs->opaque;
>      bool rewrite = false;
>  
> +    if (ret < 0) {
> +        s->bs[acb->child_iter]->broken = true;
> +    }

child_iter is fifo mode stuff.
Do we need to write if (s->read_pattern == QUORUM_READ_PATTERN_FIFO && ret < 0) 
here ?


> +
>      if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
>          /* We try to read next child in FIFO order if we fail to read */
> -        if (ret < 0 && ++acb->child_iter < s->num_children) {
> -            read_fifo_child(acb);
> -            return;
> +        if (ret < 0) {
> +            acb->child_iter = next_fifo_child(acb);

You don't seem to increment child_iter anymore.

> +            if (acb->child_iter > 0) {
> +                read_fifo_child(acb);
> +                return;
> +            }
>          }
>  
>          if (ret == 0) {
> @@ -315,9 +342,9 @@ static void quorum_aio_cb(void *opaque, int ret)
>      } else {
>          quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret);
>      }
> -    assert(acb->count <= s->num_children);
> -    assert(acb->success_count <= s->num_children);
> -    if (acb->count < s->num_children) {
> +    assert(acb->count <= acb->issued_count);
> +    assert(acb->success_count <= acb->issued_count);
> +    if (acb->count < acb->issued_count) {
>          return;
>      }
>  
> @@ -647,22 +674,46 @@ free_exit:
>      return rewrite;
>  }
>  
> +static bool has_enough_children(BDRVQuorumState *s)
> +{
> +    int good = 0, i;
> +
> +    for (i = 0; i < s->num_children; i++) {
> +        if (s->bs[i]->broken) {
> +            continue;
> +        }
> +        good++;
> +    }
> +
> +    if (good >= s->threshold) {
> +        return true;
> +    } else {
> +        return false;
> +    }
> +}
> +
>  static BlockDriverAIOCB *read_quorum_children(QuorumAIOCB *acb)
>  {
>      BDRVQuorumState *s = acb->common.bs->opaque;
>      int i;
>  
> +    if (!has_enough_children(s)) {
> +        quorum_aio_release(acb);
> +        return NULL;
> +    }
> +
>      for (i = 0; i < s->num_children; i++) {
> +        if (s->bs[i]->broken) {
> +            continue;
> +        }
>          acb->qcrs[i].buf = qemu_blockalign(s->bs[i], acb->qiov->size);
>          qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov);
>          qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf);
> -    }
> -
> -    for (i = 0; i < s->num_children; i++) {
>          acb->qcrs[i].aiocb = bdrv_aio_readv(s->bs[i], acb->sector_num,
>                                              &acb->qcrs[i].qiov,
>                                              acb->nb_sectors, quorum_aio_cb,
>                                              &acb->qcrs[i]);
> +        acb->issued_count++;
>      }
>  
>      return &acb->common;
> @@ -679,7 +730,7 @@ static BlockDriverAIOCB *read_fifo_child(QuorumAIOCB *acb)
>      acb->qcrs[i].aiocb = bdrv_aio_readv(s->bs[i], acb->sector_num,
>                                          &acb->qcrs[i].qiov, acb->nb_sectors,
>                                          quorum_aio_cb, &acb->qcrs[i]);
> -
> +    acb->issued_count = 1;
>      return &acb->common;
>  }
>  
> @@ -693,6 +744,7 @@ static BlockDriverAIOCB 
> *quorum_aio_readv(BlockDriverState *bs,
>      BDRVQuorumState *s = bs->opaque;
>      QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
>                                        nb_sectors, cb, opaque);

> +
Spurious carriage return.

>      acb->is_read = true;
>  
>      if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
> @@ -701,6 +753,11 @@ static BlockDriverAIOCB 
> *quorum_aio_readv(BlockDriverState *bs,
>      }
>  
>      acb->child_iter = 0;
> +    acb->child_iter = next_fifo_child(acb);
> +    if (acb->child_iter < 0) {
> +        quorum_aio_release(acb);
> +        return NULL;
> +    }
>      return read_fifo_child(acb);
>  }
>  
> @@ -716,10 +773,19 @@ static BlockDriverAIOCB 
> *quorum_aio_writev(BlockDriverState *bs,
>                                        cb, opaque);
>      int i;
>  
> +    if (!has_enough_children(s)) {
> +        quorum_aio_release(acb);
> +        return NULL;
> +    }
> +
>      for (i = 0; i < s->num_children; i++) {
> +        if (s->bs[i]->broken) {
> +            continue;
> +        }
>          acb->qcrs[i].aiocb = bdrv_aio_writev(s->bs[i], sector_num, qiov,
>                                               nb_sectors, &quorum_aio_cb,
>                                               &acb->qcrs[i]);
> +        acb->issued_count++;
>      }
>  
>      return &acb->common;
> @@ -926,6 +992,12 @@ static int quorum_open(BlockDriverState *bs, QDict 
> *options, int flags,
>      }
>  
>      s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0);
> +    /* and validate it against s->num_children */
> +    ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +
>      ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN));
>      if (ret < 0) {
>          error_setg(&local_err, "Please set read-pattern as fifo or 
> quorum\n");
> @@ -934,12 +1006,6 @@ static int quorum_open(BlockDriverState *bs, QDict 
> *options, int flags,
>      s->read_pattern = ret;
>  
>      if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
> -        /* and validate it against s->num_children */
> -        ret = quorum_valid_threshold(s->threshold, s->num_children, 
> &local_err);
> -        if (ret < 0) {
> -            goto exit;
> -        }
> -
>          /* is the driver in blkverify mode */
>          if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
>              s->num_children == 2 && s->threshold == 2) {
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 9fdec7f..599110b 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -402,6 +402,9 @@ struct BlockDriverState {
>  
>      /* The error object in use for blocking operations on backing_hd */
>      Error *backing_blocker;
> +
> +    /* True if the associated device is broken */
> +    bool broken;
>  };
>  
>  int get_tmp_filename(char *filename, int size);
> -- 
> 1.9.1
> 
> 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]