[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH V15 06/13] quorum: Add quorum mechanism.
From: |
Benoît Canet |
Subject: |
Re: [Qemu-devel] [PATCH V15 06/13] quorum: Add quorum mechanism. |
Date: |
Wed, 5 Feb 2014 16:14:31 +0100 |
User-agent: |
Mutt/1.5.21 (2010-09-15) |
Le Tuesday 04 Feb 2014 à 16:40:12 (+0100), Kevin Wolf a écrit :
> Am 03.02.2014 um 22:51 hat Benoît Canet geschrieben:
> > From: Benoît Canet <address@hidden>
> >
> > Use gnutls's SHA-256 to compare versions.
> >
> > Signed-off-by: Benoit Canet <address@hidden>
> > ---
> > block/Makefile.objs | 2 +-
> > block/quorum.c | 386
> > +++++++++++++++++++++++++++++++++++++++++++++-
> > configure | 36 +++++
> > docs/qmp/qmp-events.txt | 33 ++++
> > include/monitor/monitor.h | 2 +
> > monitor.c | 2 +
> > 6 files changed, 458 insertions(+), 3 deletions(-)
> >
> > diff --git a/block/Makefile.objs b/block/Makefile.objs
> > index a2650b9..4ca9d43 100644
> > --- a/block/Makefile.objs
> > +++ b/block/Makefile.objs
> > @@ -3,7 +3,7 @@ block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o
> > qcow2-snapshot.o qcow2-c
> > block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
> > block-obj-y += qed-check.o
> > block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o
> > -block-obj-y += quorum.o
> > +block-obj-$(CONFIG_QUORUM) += quorum.o
> > block-obj-y += parallels.o blkdebug.o blkverify.o
> > block-obj-y += snapshot.o qapi.o
> > block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
> > diff --git a/block/quorum.c b/block/quorum.c
> > index 699b512..837d261 100644
> > --- a/block/quorum.c
> > +++ b/block/quorum.c
> > @@ -13,7 +13,43 @@
> > * See the COPYING file in the top-level directory.
> > */
> >
> > +#include <gnutls/gnutls.h>
> > +#include <gnutls/crypto.h>
> > #include "block/block_int.h"
> > +#include "qapi/qmp/qjson.h"
> > +
> > +#define HASH_LENGTH 32
> > +
> > +/* This union holds a vote hash value */
> > +typedef union QuorumVoteValue {
> > + char h[HASH_LENGTH]; /* SHA-256 hash */
> > + int64_t l; /* simpler 64 bits hash */
> > +} QuorumVoteValue;
> > +
> > +/* A vote item */
> > +typedef struct QuorumVoteItem {
> > + int index;
> > + QLIST_ENTRY(QuorumVoteItem) next;
> > +} QuorumVoteItem;
> > +
> > +/* this structure is a vote version. A version is the set of votes sharing
> > the
> > + * same vote value.
> > + * The set of votes will be tracked with the items field and its
> > cardinality is
> > + * vote_count.
> > + */
> > +typedef struct QuorumVoteVersion {
> > + QuorumVoteValue value;
> > + int index;
> > + int vote_count;
> > + QLIST_HEAD(, QuorumVoteItem) items;
> > + QLIST_ENTRY(QuorumVoteVersion) next;
> > +} QuorumVoteVersion;
> > +
> > +/* this structure holds a group of vote versions together */
> > +typedef struct QuorumVotes {
> > + QLIST_HEAD(, QuorumVoteVersion) vote_list;
> > + int (*compare)(QuorumVoteValue *a, QuorumVoteValue *b);
> > +} QuorumVotes;
> >
> > /* the following structure holds the state of one quorum instance */
> > typedef struct {
> > @@ -60,10 +96,14 @@ struct QuorumAIOCB {
> > int success_count; /* number of successfully completed AIOCB
> > */
> > bool *finished; /* completion signal for cancel */
> >
> > + QuorumVotes votes;
> > +
> > bool is_read;
> > int vote_ret;
> > };
> >
> > +static void quorum_vote(QuorumAIOCB *acb);
> > +
> > static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
> > {
> > QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
> > @@ -81,10 +121,12 @@ static AIOCBInfo quorum_aiocb_info = {
> > .cancel = quorum_aio_cancel,
> > };
> >
> > +static int quorum_vote_error(QuorumAIOCB *acb);
> > +
>
> What's the reason for putting the forward declaration here? This is
> neither directly before the first user nor at the top.
>
> In fact, the next occurrence of quorum_vote_error() is the implementation
> of the function, so the forward declaration is completely unnecessary.
>
> > static void quorum_aio_finalize(QuorumAIOCB *acb)
> > {
> > BDRVQuorumState *s = acb->bqs;
> > - int ret = 0;
> > + int i, ret = 0;
> >
> > for (i = 0; i < s->total; i++) {
> > qemu_vfree(acb->aios[i].buf);
> > @@ -92,6 +134,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
> > acb->aios[i].ret = 0;
> > }
> >
> > + if (acb->vote_ret) {
> > + ret = acb->vote_ret;
> > + }
> > +
> > acb->common.cb(acb->common.opaque, ret);
> > if (acb->finished) {
> > *acb->finished = true;
> > @@ -103,6 +149,27 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
> > qemu_aio_release(acb);
> > }
> >
> > +static int quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b)
> > +{
> > + return memcmp(a->h, b->h, HASH_LENGTH);
> > +}
> > +
> > +static int quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b)
> > +{
> > + int64_t i = a->l;
> > + int64_t j = b->l;
> > +
> > + if (i < j) {
> > + return -1;
> > + }
> > +
> > + if (i > j) {
> > + return 1;
> > + }
> > +
> > + return 0;
> > +}
>
> The usual way to implement this is 'return a->l - b->l;', because if you
> expect memcmp() to return a valid value for the compare function you
> can't assume that it's normalised to -1/0/1 anyway.
>
> As you only ever use the result as a bool, you could alternatively
> even declare the function as such and do 'return a->l != b->l;'.
>
> > static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> > BlockDriverState *bs,
> > QEMUIOVector *qiov,
> > @@ -122,6 +189,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> > acb->count = 0;
> > acb->success_count = 0;
> > acb->finished = NULL;
> > + acb->votes.compare = quorum_sha256_compare;
> > acb->is_read = false;
> > acb->vote_ret = 0;
> >
>
> You need to initialise votes.vote_list as well.
>
> > @@ -151,9 +219,323 @@ static void quorum_aio_cb(void *opaque, int ret)
> > return;
> > }
> >
> > + /* Do the vote on read */
> > + if (acb->is_read) {
> > + quorum_vote(acb);
> > + }
> > +
> > quorum_aio_finalize(acb);
> > }
> >
> > +static void quorum_report_bad(QuorumAIOCB *acb, char *node_name)
> > +{
> > + QObject *data;
> > + data = qobject_from_jsonf("{ 'node-name': \"%s\""
> > + ", 'sector-num': %" PRId64
> > + ", 'sectors-count': %i }",
> > + node_name,
>
> Can't node_name be NULL here?
No, node_name is a member of BlockDriverState, so the only thing that could happen
is bs->node_name[0] == '\0'.
I will add a safety check anyway.
>
> > + acb->sector_num,
> > + acb->nb_sectors);
> > + monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data);
> > + qobject_decref(data);
> > +}
> > +
> > +static void quorum_report_failure(QuorumAIOCB *acb)
> > +{
> > + QObject *data;
> > + data = qobject_from_jsonf("{ 'sector-num': %" PRId64
> > + ", 'sectors-count': %i }",
> > + acb->sector_num,
> > + acb->nb_sectors);
>
> If I have multiple quorum devices, this event doesn't tell me which one
> it is about.
>
> > + monitor_protocol_event(QEVENT_QUORUM_FAILURE, data);
> > + qobject_decref(data);
> > +}
> > +
> > +static void quorum_report_bad_versions(BDRVQuorumState *s,
> > + QuorumAIOCB *acb,
> > + QuorumVoteValue *value)
> > +{
> > + QuorumVoteVersion *version;
> > + QuorumVoteItem *item;
> > +
> > + QLIST_FOREACH(version, &acb->votes.vote_list, next) {
> > + if (!acb->votes.compare(&version->value, value)) {
> > + continue;
> > + }
> > + QLIST_FOREACH(item, &version->items, next) {
> > + quorum_report_bad(acb, s->bs[item->index]->node_name);
> > + }
> > + }
> > +}
> > +
> > +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
> > +{
> > + int i;
> > + assert(dest->niov == source->niov);
> > + assert(dest->size == source->size);
> > + for (i = 0; i < source->niov; i++) {
> > + assert(dest->iov[i].iov_len == source->iov[i].iov_len);
> > + memcpy(dest->iov[i].iov_base,
> > + source->iov[i].iov_base,
> > + source->iov[i].iov_len);
> > + }
> > +}
> > +
> > +static void quorum_count_vote(QuorumVotes *votes,
> > + QuorumVoteValue *value,
> > + int index)
> > +{
> > + QuorumVoteVersion *v = NULL, *version = NULL;
> > + QuorumVoteItem *item;
> > +
> > + /* look if we have something with this hash */
> > + QLIST_FOREACH(v, &votes->vote_list, next) {
> > + if (!votes->compare(&v->value, value)) {
> > + version = v;
> > + break;
> > + }
> > + }
> > +
> > + /* It's a version not yet in the list add it */
> > + if (!version) {
> > + version = g_new0(QuorumVoteVersion, 1);
> > + QLIST_INIT(&version->items);
> > + memcpy(&version->value, value, sizeof(version->value));
> > + version->index = index;
> > + version->vote_count = 0;
> > + QLIST_INSERT_HEAD(&votes->vote_list, version, next);
> > + }
> > +
> > + version->vote_count++;
> > +
> > + item = g_new0(QuorumVoteItem, 1);
> > + item->index = index;
> > + QLIST_INSERT_HEAD(&version->items, item, next);
> > +}
> > +
> > +static void quorum_free_vote_list(QuorumVotes *votes)
> > +{
> > + QuorumVoteVersion *version, *next_version;
> > + QuorumVoteItem *item, *next_item;
> > +
> > + QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) {
> > + QLIST_REMOVE(version, next);
> > + QLIST_FOREACH_SAFE(item, &version->items, next, next_item) {
> > + QLIST_REMOVE(item, next);
> > + g_free(item);
> > + }
> > + g_free(version);
> > + }
> > +}
> > +
> > +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue
> > *hash)
> > +{
> > + int j, ret;
> > + gnutls_hash_hd_t dig;
> > + QEMUIOVector *qiov = &acb->aios[i].qiov;
> > +
> > + ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256);
> > +
> > + if (ret < 0) {
> > + return ret;
> > + }
> > +
> > + for (j = 0; j < qiov->niov; j++) {
> > + ret = gnutls_hash(dig, qiov->iov[j].iov_base,
> > qiov->iov[j].iov_len);
> > + if (ret < 0) {
> > + break;
> > + }
> > + }
> > +
> > + gnutls_hash_deinit(dig, (void *) hash);
> > + return ret;
> > +}
> > +
> > +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes)
> > +{
> > + int i = 0;
>
> I like obvious variable names. This must be a loop counter.
>
> > + QuorumVoteVersion *candidate, *winner = NULL;
> > +
> > + QLIST_FOREACH(candidate, &votes->vote_list, next) {
> > + if (candidate->vote_count > i) {
> > + i = candidate->vote_count;
>
> Wait, what? This doesn't quite look like a loop.
>
> > + winner = candidate;
> > + }
> > + }
> > +
> > + return winner;
> > +}
> > +
> > +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
> > +{
> > + int i;
> > + int result;
> > +
> > + assert(a->niov == b->niov);
> > + for (i = 0; i < a->niov; i++) {
> > + assert(a->iov[i].iov_len == b->iov[i].iov_len);
> > + result = memcmp(a->iov[i].iov_base,
> > + b->iov[i].iov_base,
> > + a->iov[i].iov_len);
> > + if (result) {
> > + return false;
> > + }
> > + }
> > +
> > + return true;
> > +}
>
> I thought we introduced qemu_iovec_compare() earlier in this series to
> do exactly this, except more generically?
>
> I see that you call one or the other depending on whether we're running
> in blkverify mode, but what is the difference in the semantics? Either
> both are the same and there is no reason to have both, or one of them
> must have non-obvious semantics and lacks proper documentation.
>
> > +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb,
> > + const char *fmt, ...)
> > +{
> > + va_list ap;
> > +
> > + va_start(ap, fmt);
> > + fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ",
> > + acb->sector_num, acb->nb_sectors);
> > + vfprintf(stderr, fmt, ap);
> > + fprintf(stderr, "\n");
> > + va_end(ap);
> > + exit(1);
> > +}
> > +
> > +static bool quorum_compare(QuorumAIOCB *acb,
> > + QEMUIOVector *a,
> > + QEMUIOVector *b)
> > +{
> > + BDRVQuorumState *s = acb->bqs;
> > + ssize_t offset;
> > +
> > + /* This driver will replace blkverify in this particular case */
> > + if (s->is_blkverify) {
> > + offset = qemu_iovec_compare(a, b);
> > + if (offset != -1) {
> > + quorum_err(acb, "contents mismatch in sector %" PRId64,
> > + acb->sector_num +
> > + (uint64_t)(offset / BDRV_SECTOR_SIZE));
> > + }
> > + return true;
> > + }
> > +
> > + return quorum_iovec_compare(a, b);
> > +}
> > +
> > +/* Do a vote to get the error code */
> > +static int quorum_vote_error(QuorumAIOCB *acb)
> > +{
> > + BDRVQuorumState *s = acb->bqs;
> > + QuorumVoteVersion *winner = NULL;
> > + QuorumVotes error_votes;
> > + QuorumVoteValue result_value;
> > + int i, ret = 0;
> > + bool error = false;
> > +
> > + QLIST_INIT(&error_votes.vote_list);
> > + error_votes.compare = quorum_64bits_compare;
> > +
> > + for (i = 0; i < s->total; i++) {
> > + ret = acb->aios[i].ret;
> > + if (ret) {
> > + error = true;
> > + result_value.l = ret;
> > + quorum_count_vote(&error_votes, &result_value, i);
> > + }
> > + }
> > +
> > + if (error) {
> > + winner = quorum_get_vote_winner(&error_votes);
> > + ret = winner->value.l;
> > + }
> > +
> > + quorum_free_vote_list(&error_votes);
> > +
> > + return ret;
> > +}
> > +
> > +static void quorum_vote(QuorumAIOCB *acb)
> > +{
> > + bool quorum = false;
> > + int i, j, ret;
> > + QuorumVoteValue hash;
> > + BDRVQuorumState *s = acb->bqs;
> > + QuorumVoteVersion *winner;
> > +
> > + QLIST_INIT(&acb->votes.vote_list);
> > +
> > + /* if we don't get enough successful read use the first error code */
> > + if (acb->success_count < s->threshold) {
> > + acb->vote_ret = quorum_vote_error(acb);
> > + quorum_report_failure(acb);
> > + return;
> > + }
> > +
> > + /* get the index of the first successful read (we are sure to find
> > one) */
> > + for (i = 0; i < s->total; i++) {
> > + if (!acb->aios[i].ret) {
> > + break;
> > + }
> > + }
>
> "we are sure to find one" is spelt "assert(i < s->total);"
>
> > +
> > + /* compare this read with all other successful read looking for quorum
> > */
> > + for (j = i + 1; j < s->total; j++) {
> > + if (acb->aios[j].ret) {
> > + continue;
> > + }
> > + quorum = quorum_compare(acb, &acb->aios[i].qiov,
> > &acb->aios[j].qiov);
> > + if (!quorum) {
> > + break;
> > + }
> > + }
> > +
> > + /* Every successful read agrees and their count is higher or equal
> > threshold
> > + * -> Quorum
> > + */
> > + if (quorum && acb->success_count >= s->threshold) {
> > + quorum_copy_qiov(acb->qiov, &acb->aios[i].qiov);
> > + return;
> > + }
>
> For threshold == success_count == 1, the condition in the comment is
> fulfilled, but the one in the code isn't.
>
> > +
> > + /* compute hashs for each successful read, also store indexes */
> > + for (i = 0; i < s->total; i++) {
> > + if (acb->aios[i].ret) {
> > + continue;
> > + }
> > + ret = quorum_compute_hash(acb, i, &hash);
> > + /* if ever the hash computation failed */
> > + if (ret < 0) {
> > + acb->vote_ret = ret;
> > + goto free_exit;
> > + }
> > + quorum_count_vote(&acb->votes, &hash, i);
> > + }
> > +
> > + /* vote to select the most represented version */
> > + winner = quorum_get_vote_winner(&acb->votes);
> > + /* every vote version are differents -> error */
> > + if (!winner) {
>
> Can this happen? This means that there was no vote at all.
>
> > + quorum_report_failure(acb);
> > + acb->vote_ret = -EIO;
> > + goto free_exit;
> > + }
> > +
> > + /* if the winner count is smaller than threshold the read fails */
> > + if (winner->vote_count < s->threshold) {
> > + quorum_report_failure(acb);
> > + acb->vote_ret = -EIO;
> > + goto free_exit;
> > + }
> > +
> > + /* we have a winner: copy it */
> > + quorum_copy_qiov(acb->qiov, &acb->aios[winner->index].qiov);
> > +
> > + /* some versions are bad print them */
> > + quorum_report_bad_versions(s, acb, &winner->value);
> > +
> > +free_exit:
> > + /* free lists */
> > + quorum_free_vote_list(&acb->votes);
> > +}
> > +
> > static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
> > int64_t sector_num,
> > QEMUIOVector *qiov,
> > @@ -175,7 +557,7 @@ static BlockDriverAIOCB
> > *quorum_aio_readv(BlockDriverState *bs,
> > }
> >
> > for (i = 0; i < s->total; i++) {
> > - bdrv_aio_readv(s->bs[i], sector_num, qiov, nb_sectors,
> > + bdrv_aio_readv(s->bs[i], sector_num, &acb->aios[i].qiov,
> > nb_sectors,
> > quorum_aio_cb, &acb->aios[i]);
> > }
>
> Why don't you do this from the beginning?
>
> Kevin
- [Qemu-devel] [PATCH V15 05/13] quorum: Add quorum_aio_readv., (continued)
- [Qemu-devel] [PATCH V15 05/13] quorum: Add quorum_aio_readv., Benoît Canet, 2014/02/03
- [Qemu-devel] [PATCH V15 08/13] quorum: Add quorum_invalidate_cache()., Benoît Canet, 2014/02/03
- [Qemu-devel] [PATCH V15 10/13] quorum: Add quorum_co_flush()., Benoît Canet, 2014/02/03
- [Qemu-devel] [PATCH V15 11/13] quorum: Implement recursive .bdrv_recurse_is_first_non_filter in quorum., Benoît Canet, 2014/02/03
- [Qemu-devel] [PATCH V15 01/13] quorum: Create quorum.c, add QuorumSingleAIOCB and QuorumAIOCB., Benoît Canet, 2014/02/03
- [Qemu-devel] [PATCH V15 06/13] quorum: Add quorum mechanism., Benoît Canet, 2014/02/03
- [Qemu-devel] [PATCH V15 13/13] quorum: Add unit test., Benoît Canet, 2014/02/03
- [Qemu-devel] [PATCH V15 12/13] quorum: Add quorum_open() and quorum_close()., Benoît Canet, 2014/02/03