qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH V15 06/13] quorum: Add quorum mechanism.


From: Kevin Wolf
Subject: Re: [Qemu-devel] [PATCH V15 06/13] quorum: Add quorum mechanism.
Date: Tue, 4 Feb 2014 16:40:12 +0100
User-agent: Mutt/1.5.21 (2010-09-15)

Am 03.02.2014 um 22:51 hat Benoît Canet geschrieben:
> From: Benoît Canet <address@hidden>
> 
> Use gnutls's SHA-256 to compare versions.
> 
> Signed-off-by: Benoit Canet <address@hidden>
> ---
>  block/Makefile.objs       |   2 +-
>  block/quorum.c            | 386 
> +++++++++++++++++++++++++++++++++++++++++++++-
>  configure                 |  36 +++++
>  docs/qmp/qmp-events.txt   |  33 ++++
>  include/monitor/monitor.h |   2 +
>  monitor.c                 |   2 +
>  6 files changed, 458 insertions(+), 3 deletions(-)
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index a2650b9..4ca9d43 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -3,7 +3,7 @@ block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o 
> qcow2-snapshot.o qcow2-c
>  block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-obj-y += qed-check.o
>  block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o
> -block-obj-y += quorum.o
> +block-obj-$(CONFIG_QUORUM) += quorum.o
>  block-obj-y += parallels.o blkdebug.o blkverify.o
>  block-obj-y += snapshot.o qapi.o
>  block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
> diff --git a/block/quorum.c b/block/quorum.c
> index 699b512..837d261 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -13,7 +13,43 @@
>   * See the COPYING file in the top-level directory.
>   */
>  
> +#include <gnutls/gnutls.h>
> +#include <gnutls/crypto.h>
>  #include "block/block_int.h"
> +#include "qapi/qmp/qjson.h"
> +
> +#define HASH_LENGTH 32
> +
> +/* This union holds a vote hash value */
> +typedef union QuorumVoteValue {
> +    char h[HASH_LENGTH];       /* SHA-256 hash */
> +    int64_t l;                 /* simpler 64 bits hash */
> +} QuorumVoteValue;
> +
> +/* A vote item */
> +typedef struct QuorumVoteItem {
> +    int index;
> +    QLIST_ENTRY(QuorumVoteItem) next;
> +} QuorumVoteItem;
> +
> +/* this structure is a vote version. A version is the set of votes sharing 
> the
> + * same vote value.
> + * The set of votes will be tracked with the items field and its cardinality 
> is
> + * vote_count.
> + */
> +typedef struct QuorumVoteVersion {
> +    QuorumVoteValue value;
> +    int index;
> +    int vote_count;
> +    QLIST_HEAD(, QuorumVoteItem) items;
> +    QLIST_ENTRY(QuorumVoteVersion) next;
> +} QuorumVoteVersion;
> +
> +/* this structure holds a group of vote versions together */
> +typedef struct QuorumVotes {
> +    QLIST_HEAD(, QuorumVoteVersion) vote_list;
> +    int (*compare)(QuorumVoteValue *a, QuorumVoteValue *b);
> +} QuorumVotes;
>  
>  /* the following structure holds the state of one quorum instance */
>  typedef struct {
> @@ -60,10 +96,14 @@ struct QuorumAIOCB {
>      int success_count;          /* number of successfully completed AIOCB */
>      bool *finished;             /* completion signal for cancel */
>  
> +    QuorumVotes votes;
> +
>      bool is_read;
>      int vote_ret;
>  };
>  
> +static void quorum_vote(QuorumAIOCB *acb);
> +
>  static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
>  {
>      QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
> @@ -81,10 +121,12 @@ static AIOCBInfo quorum_aiocb_info = {
>      .cancel             = quorum_aio_cancel,
>  };
>  
> +static int quorum_vote_error(QuorumAIOCB *acb);
> +

What's the reason for putting the forward declaration here? This is
neither directly before the first user nor at the top.

In fact, the next occurrence of quorum_vote_error() is the implementation
of the function, so the forward declaration is completely unnecessary.

>  static void quorum_aio_finalize(QuorumAIOCB *acb)
>  {
>      BDRVQuorumState *s = acb->bqs;
> -    int ret = 0;
> +    int i, ret = 0;
>  
>      for (i = 0; i < s->total; i++) {
>          qemu_vfree(acb->aios[i].buf);
> @@ -92,6 +134,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
>          acb->aios[i].ret = 0;
>      }
>  
> +    if (acb->vote_ret) {
> +        ret = acb->vote_ret;
> +    }
> +
>      acb->common.cb(acb->common.opaque, ret);
>      if (acb->finished) {
>          *acb->finished = true;
> @@ -103,6 +149,27 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
>      qemu_aio_release(acb);
>  }
>  
> +static int quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b)
> +{
> +    return memcmp(a->h, b->h, HASH_LENGTH);
> +}
> +
> +static int quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b)
> +{
> +    int64_t i = a->l;
> +    int64_t j = b->l;
> +
> +    if (i < j) {
> +        return -1;
> +    }
> +
> +    if (i > j) {
> +        return 1;
> +    }
> +
> +    return 0;
> +}

The usual way to implement this is 'return a->l - b->l;', because if you
expect memcmp() to return a valid value for the compare function you
can't assume that it's normalised to -1/0/1 anyway.

As you only ever use the result as a bool, you could alternatively
even declare the function as such and do 'return a->l != b->l;'.

>  static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
>                                     BlockDriverState *bs,
>                                     QEMUIOVector *qiov,
> @@ -122,6 +189,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
>      acb->count = 0;
>      acb->success_count = 0;
>      acb->finished = NULL;
> +    acb->votes.compare = quorum_sha256_compare;
>      acb->is_read = false;
>      acb->vote_ret = 0;
>  

You need to initialise votes.vote_list as well.

> @@ -151,9 +219,323 @@ static void quorum_aio_cb(void *opaque, int ret)
>          return;
>      }
>  
> +    /* Do the vote on read */
> +    if (acb->is_read) {
> +        quorum_vote(acb);
> +    }
> +
>      quorum_aio_finalize(acb);
>  }
>  
> +static void quorum_report_bad(QuorumAIOCB *acb, char *node_name)
> +{
> +    QObject *data;
> +    data = qobject_from_jsonf("{ 'node-name': \"%s\""
> +                              ", 'sector-num': %" PRId64
> +                              ", 'sectors-count': %i }",
> +                              node_name,

Can't node_name be NULL here?

> +                              acb->sector_num,
> +                              acb->nb_sectors);
> +    monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data);
> +    qobject_decref(data);
> +}
> +
> +static void quorum_report_failure(QuorumAIOCB *acb)
> +{
> +    QObject *data;
> +    data = qobject_from_jsonf("{ 'sector-num': %" PRId64
> +                              ", 'sectors-count': %i }",
> +                              acb->sector_num,
> +                              acb->nb_sectors);

If I have multiple quorum devices, this event doesn't tell me which one
it is about.

> +    monitor_protocol_event(QEVENT_QUORUM_FAILURE, data);
> +    qobject_decref(data);
> +}
> +
> +static void quorum_report_bad_versions(BDRVQuorumState *s,
> +                                       QuorumAIOCB *acb,
> +                                       QuorumVoteValue *value)
> +{
> +    QuorumVoteVersion *version;
> +    QuorumVoteItem *item;
> +
> +    QLIST_FOREACH(version, &acb->votes.vote_list, next) {
> +        if (!acb->votes.compare(&version->value, value)) {
> +            continue;
> +        }
> +        QLIST_FOREACH(item, &version->items, next) {
> +            quorum_report_bad(acb, s->bs[item->index]->node_name);
> +        }
> +    }
> +}
> +
> +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
> +{
> +    int i;
> +    assert(dest->niov == source->niov);
> +    assert(dest->size == source->size);
> +    for (i = 0; i < source->niov; i++) {
> +        assert(dest->iov[i].iov_len == source->iov[i].iov_len);
> +        memcpy(dest->iov[i].iov_base,
> +               source->iov[i].iov_base,
> +               source->iov[i].iov_len);
> +    }
> +}
> +
> +static void quorum_count_vote(QuorumVotes *votes,
> +                              QuorumVoteValue *value,
> +                              int index)
> +{
> +    QuorumVoteVersion *v = NULL, *version = NULL;
> +    QuorumVoteItem *item;
> +
> +    /* look if we have something with this hash */
> +    QLIST_FOREACH(v, &votes->vote_list, next) {
> +        if (!votes->compare(&v->value, value)) {
> +            version = v;
> +            break;
> +        }
> +    }
> +
> +    /* It's a version not yet in the list add it */
> +    if (!version) {
> +        version = g_new0(QuorumVoteVersion, 1);
> +        QLIST_INIT(&version->items);
> +        memcpy(&version->value, value, sizeof(version->value));
> +        version->index = index;
> +        version->vote_count = 0;
> +        QLIST_INSERT_HEAD(&votes->vote_list, version, next);
> +    }
> +
> +    version->vote_count++;
> +
> +    item = g_new0(QuorumVoteItem, 1);
> +    item->index = index;
> +    QLIST_INSERT_HEAD(&version->items, item, next);
> +}
> +
> +static void quorum_free_vote_list(QuorumVotes *votes)
> +{
> +    QuorumVoteVersion *version, *next_version;
> +    QuorumVoteItem *item, *next_item;
> +
> +    QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) {
> +        QLIST_REMOVE(version, next);
> +        QLIST_FOREACH_SAFE(item, &version->items, next, next_item) {
> +            QLIST_REMOVE(item, next);
> +            g_free(item);
> +        }
> +        g_free(version);
> +    }
> +}
> +
> +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue 
> *hash)
> +{
> +    int j, ret;
> +    gnutls_hash_hd_t dig;
> +    QEMUIOVector *qiov = &acb->aios[i].qiov;
> +
> +    ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256);
> +
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    for (j = 0; j < qiov->niov; j++) {
> +        ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len);
> +        if (ret < 0) {
> +            break;
> +        }
> +    }
> +
> +    gnutls_hash_deinit(dig, (void *) hash);
> +    return ret;
> +}
> +
> +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes)
> +{
> +    int i = 0;

I like obvious variable names. This must be a loop counter.

> +    QuorumVoteVersion *candidate, *winner = NULL;
> +
> +    QLIST_FOREACH(candidate, &votes->vote_list, next) {
> +        if (candidate->vote_count > i) {
> +            i = candidate->vote_count;

Wait, what? This doesn't quite look like a loop.

> +            winner = candidate;
> +        }
> +    }
> +
> +    return winner;
> +}
> +
> +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
> +{
> +    int i;
> +    int result;
> +
> +    assert(a->niov == b->niov);
> +    for (i = 0; i < a->niov; i++) {
> +        assert(a->iov[i].iov_len == b->iov[i].iov_len);
> +        result = memcmp(a->iov[i].iov_base,
> +                        b->iov[i].iov_base,
> +                        a->iov[i].iov_len);
> +        if (result) {
> +            return false;
> +        }
> +    }
> +
> +    return true;
> +}

I thought we introduced qemu_iovec_compare() earlier in this series to
do exactly this, except more generically?

I see that you call one or the other depending on whether we're running
in blkverify mode, but what is the difference in the semantics? Either
both are the same and there is no reason to have both, or one of them
must have non-obvious semantics and lacks proper documentation.

> +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb,
> +                                          const char *fmt, ...)
> +{
> +    va_list ap;
> +
> +    va_start(ap, fmt);
> +    fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ",
> +            acb->sector_num, acb->nb_sectors);
> +    vfprintf(stderr, fmt, ap);
> +    fprintf(stderr, "\n");
> +    va_end(ap);
> +    exit(1);
> +}
> +
> +static bool quorum_compare(QuorumAIOCB *acb,
> +                           QEMUIOVector *a,
> +                           QEMUIOVector *b)
> +{
> +    BDRVQuorumState *s = acb->bqs;
> +    ssize_t offset;
> +
> +    /* This driver will replace blkverify in this particular case */
> +    if (s->is_blkverify) {
> +        offset = qemu_iovec_compare(a, b);
> +        if (offset != -1) {
> +            quorum_err(acb, "contents mismatch in sector %" PRId64,
> +                       acb->sector_num +
> +                       (uint64_t)(offset / BDRV_SECTOR_SIZE));
> +        }
> +        return true;
> +    }
> +
> +    return quorum_iovec_compare(a, b);
> +}
> +
> +/* Do a vote to get the error code */
> +static int quorum_vote_error(QuorumAIOCB *acb)
> +{
> +    BDRVQuorumState *s = acb->bqs;
> +    QuorumVoteVersion *winner = NULL;
> +    QuorumVotes error_votes;
> +    QuorumVoteValue result_value;
> +    int i, ret = 0;
> +    bool error = false;
> +
> +    QLIST_INIT(&error_votes.vote_list);
> +    error_votes.compare = quorum_64bits_compare;
> +
> +    for (i = 0; i < s->total; i++) {
> +        ret = acb->aios[i].ret;
> +        if (ret) {
> +            error = true;
> +            result_value.l = ret;
> +            quorum_count_vote(&error_votes, &result_value, i);
> +        }
> +    }
> +
> +    if (error) {
> +        winner = quorum_get_vote_winner(&error_votes);
> +        ret = winner->value.l;
> +    }
> +
> +    quorum_free_vote_list(&error_votes);
> +
> +    return ret;
> +}
> +
> +static void quorum_vote(QuorumAIOCB *acb)
> +{
> +    bool quorum = false;
> +    int i, j, ret;
> +    QuorumVoteValue hash;
> +    BDRVQuorumState *s = acb->bqs;
> +    QuorumVoteVersion *winner;
> +
> +    QLIST_INIT(&acb->votes.vote_list);
> +
> +    /* if we don't get enough successful read use the first error code */
> +    if (acb->success_count < s->threshold) {
> +        acb->vote_ret = quorum_vote_error(acb);
> +        quorum_report_failure(acb);
> +        return;
> +    }
> +
> +    /* get the index of the first successful read (we are sure to find one) 
> */
> +    for (i = 0; i < s->total; i++) {
> +        if (!acb->aios[i].ret) {
> +            break;
> +        }
> +    }

"we are sure to find one" is spelt "assert(i < s->total);"

> +
> +    /* compare this read with all other successful read looking for quorum */
> +    for (j = i + 1; j < s->total; j++) {
> +        if (acb->aios[j].ret) {
> +            continue;
> +        }
> +        quorum = quorum_compare(acb, &acb->aios[i].qiov, &acb->aios[j].qiov);
> +        if (!quorum) {
> +            break;
> +       }
> +    }
> +
> +    /* Every successful read agrees and their count is higher or equal 
> threshold
> +     * -> Quorum
> +     */
> +    if (quorum && acb->success_count >= s->threshold) {
> +        quorum_copy_qiov(acb->qiov, &acb->aios[i].qiov);
> +        return;
> +    }

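For threshold == success_count == 1, the condition in the comment is
fulfilled, but the one in the code isn't: with only one successful read
the comparison loop never calls quorum_compare(), so 'quorum' keeps its
initial value of false.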

> +
> +    /* compute hashs for each successful read, also store indexes */
> +    for (i = 0; i < s->total; i++) {
> +        if (acb->aios[i].ret) {
> +            continue;
> +        }
> +        ret = quorum_compute_hash(acb, i, &hash);
> +        /* if ever the hash computation failed */
> +        if (ret < 0) {
> +            acb->vote_ret = ret;
> +            goto free_exit;
> +        }
> +        quorum_count_vote(&acb->votes, &hash, i);
> +    }
> +
> +    /* vote to select the most represented version */
> +    winner = quorum_get_vote_winner(&acb->votes);
> +    /* every vote version are differents -> error */
> +    if (!winner) {

Can this happen? This means that there was no vote at all.

> +        quorum_report_failure(acb);
> +        acb->vote_ret = -EIO;
> +        goto free_exit;
> +    }
> +
> +    /* if the winner count is smaller than threshold the read fails */
> +    if (winner->vote_count < s->threshold) {
> +        quorum_report_failure(acb);
> +        acb->vote_ret = -EIO;
> +        goto free_exit;
> +    }
> +
> +    /* we have a winner: copy it */
> +    quorum_copy_qiov(acb->qiov, &acb->aios[winner->index].qiov);
> +
> +    /* some versions are bad print them */
> +    quorum_report_bad_versions(s, acb, &winner->value);
> +
> +free_exit:
> +    /* free lists */
> +    quorum_free_vote_list(&acb->votes);
> +}
> +
>  static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
>                                           int64_t sector_num,
>                                           QEMUIOVector *qiov,
> @@ -175,7 +557,7 @@ static BlockDriverAIOCB 
> *quorum_aio_readv(BlockDriverState *bs,
>      }
>  
>      for (i = 0; i < s->total; i++) {
> -        bdrv_aio_readv(s->bs[i], sector_num, qiov, nb_sectors,
> +        bdrv_aio_readv(s->bs[i], sector_num, &acb->aios[i].qiov, nb_sectors,
>                         quorum_aio_cb, &acb->aios[i]);
>      }

Why don't you do this from the beginning, i.e. pass &acb->aios[i].qiov
to bdrv_aio_readv() in the patch that first adds this call?

Kevin



reply via email to

[Prev in Thread] Current Thread [Next in Thread]