[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v8 05/36] raw-posix: Add image locking support
From: |
Max Reitz |
Subject: |
Re: [Qemu-devel] [PATCH v8 05/36] raw-posix: Add image locking support |
Date: |
Sat, 22 Oct 2016 01:40:26 +0200 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Thunderbird/45.4.0 |
On 30.09.2016 14:09, Fam Zheng wrote:
> virtlockd in libvirt locks the first byte, we lock byte 1 to avoid
> the intervene.
s/the intervene/a conflict/?
>
> Both file and host device protocols are covered.
>
> The complication is with reopen. We have three different locking states,
> namely "unlocked", "shared locked" and "exclusively locked".
>
> When we reopen, the new fd may need a new locking mode. Moving away to or from
> exclusive is a bit tricky because we cannot do it atomically. This patch
> solves
> it by dup() s->fd to s->lock_fd and avoid close(), so that there isn't a racy
> window where we drop the lock on one fd before acquiring the exclusive lock on
> the other.
>
> To make the logic easier to manage, and allow better reuse, the code is
> internally organized by state transition table (old_lock -> new_lock).
>
> Signed-off-by: Fam Zheng <address@hidden>
> ---
> block/raw-posix.c | 318
> +++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 317 insertions(+), 1 deletion(-)
>
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index 6ed7547..22de242 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -133,6 +133,7 @@ do { \
>
> typedef struct BDRVRawState {
> int fd;
> + int lock_fd;
I think it would be a good idea to put a comment about the semantics of
this lock_fd here.
> int type;
> int open_flags;
> size_t buf_align;
> @@ -149,6 +150,7 @@ typedef struct BDRVRawState {
>
> typedef struct BDRVRawReopenState {
> int fd;
> + int lock_fd;
> int open_flags;
> } BDRVRawReopenState;
>
> @@ -367,6 +369,43 @@ static void raw_parse_flags(int bdrv_flags, int
> *open_flags)
> }
> }
>
> +static int raw_lock_fd(int fd, ImageLockMode mode)
> +{
> + assert(fd >= 0);
> + /* Locking byte 1 avoids interfereing with virtlockd. */
> + switch (mode) {
> + case IMAGE_LOCK_MODE_EXCLUSIVE:
> + return qemu_lock_fd(fd, 1, 1, true);
> + case IMAGE_LOCK_MODE_SHARED:
> + return qemu_lock_fd(fd, 1, 1, false);
> + case IMAGE_LOCK_MODE_NOLOCK:
> + return qemu_unlock_fd(fd, 1, 1);
> + default:
> + abort();
> + }
> +}
> +
> +static int raw_lockf(BlockDriverState *bs, ImageLockMode mode)
> +{
> + BDRVRawState *s = bs->opaque;
> +
> + if (s->lock_fd < 0) {
> + if (mode == IMAGE_LOCK_MODE_NOLOCK) {
> + return 0;
> + }
> + s->lock_fd = qemu_dup(s->fd);
> + if (s->lock_fd < 0) {
> + return s->lock_fd;
You should probably return -errno instead (qemu_dup apparently doesn't
return that but just -1).
(I'm getting further and further from a high-level review, am I not?)
> + }
> + }
> + if (mode == IMAGE_LOCK_MODE_AUTO) {
> + mode = bdrv_get_flags(bs) & BDRV_O_RDWR ?
> + IMAGE_LOCK_MODE_EXCLUSIVE :
> + IMAGE_LOCK_MODE_SHARED;
> + }
> + return raw_lock_fd(s->lock_fd, mode);
Without a comment for how lock_fd is supposed to work, this (for
example) looks a bit weird. After all, the user is trying to lock the
fd, and I don't find it immediately clear that lock_fd always points to
the same file as fd, and you're using it only for locking (for reasons
the comment should explain as well).
> +}
> +
> #ifdef CONFIG_LINUX_AIO
> static bool raw_use_aio(int bdrv_flags)
> {
> @@ -433,6 +472,7 @@ static int raw_open_common(BlockDriverState *bs, QDict
> *options,
> raw_parse_flags(bdrv_flags, &s->open_flags);
>
> s->fd = -1;
> + s->lock_fd = -1;
> fd = qemu_open(filename, s->open_flags, 0644);
> if (fd < 0) {
> ret = -errno;
> @@ -529,6 +569,268 @@ static int raw_open(BlockDriverState *bs, QDict
> *options, int flags,
> return raw_open_common(bs, options, flags, 0, errp);
> }
>
> +typedef enum {
> + RAW_REOPEN_PREPARE,
> + RAW_REOPEN_COMMIT,
> + RAW_REOPEN_ABORT
> +} RawReopenOperation;
> +
> +typedef int (*RawReopenFunc)(BDRVReopenState *state,
> + RawReopenOperation op,
> + ImageLockMode old_lock,
> + ImageLockMode new_lock,
> + Error **errp);
> +
> +static int
> +raw_reopen_identical(BDRVReopenState *state,
> + RawReopenOperation op,
> + ImageLockMode old_lock,
> + ImageLockMode new_lock,
> + Error **errp)
> +{
> + assert(old_lock == new_lock);
> + return 0;
> +}
> +
> +static int
> +raw_reopen_from_unlock(BDRVReopenState *state,
> + RawReopenOperation op,
> + ImageLockMode old_lock,
> + ImageLockMode new_lock,
> + Error **errp)
> +{
> + BDRVRawReopenState *raw_s = state->opaque;
> + int ret = 0;
> +
> + assert(old_lock != new_lock);
> + assert(old_lock == IMAGE_LOCK_MODE_NOLOCK);
> + switch (op) {
> + case RAW_REOPEN_PREPARE:
> + ret = raw_lock_fd(raw_s->lock_fd, new_lock);
> + if (ret) {
> + error_setg_errno(errp, -ret, "Failed to lock new fd %d",
> raw_s->lock_fd);
> + }
> + break;
> + case RAW_REOPEN_COMMIT:
> + case RAW_REOPEN_ABORT:
> + break;
> + }
> +
> + return ret;
> +}
> +
> +static int
> +raw_reopen_to_unlock(BDRVReopenState *state,
> + RawReopenOperation op,
> + ImageLockMode old_lock,
> + ImageLockMode new_lock,
> + Error **errp)
> +{
> + BDRVRawState *s = state->bs->opaque;
> + int ret = 0;
> +
> + assert(old_lock != new_lock);
> + assert(new_lock == IMAGE_LOCK_MODE_NOLOCK);
> + switch (op) {
> + case RAW_REOPEN_PREPARE:
> + break;
> + case RAW_REOPEN_COMMIT:
> + if (s->lock_fd >= 0) {
> + qemu_close(s->lock_fd);
> + s->lock_fd = -1;
> + }
> + break;
> + case RAW_REOPEN_ABORT:
> + break;
> + }
> +
> + return ret;
> +}
> +
> +static int
> +raw_reopen_upgrade(BDRVReopenState *state,
> + RawReopenOperation op,
> + ImageLockMode old_lock,
> + ImageLockMode new_lock,
> + Error **errp)
> +{
> + BDRVRawReopenState *raw_s = state->opaque;
> + BDRVRawState *s = state->bs->opaque;
> + int ret = 0, ret2;
> +
> + assert(old_lock == IMAGE_LOCK_MODE_SHARED);
> + assert(new_lock == IMAGE_LOCK_MODE_EXCLUSIVE);
> + switch (op) {
> + case RAW_REOPEN_PREPARE:
> + ret = raw_lock_fd(raw_s->lock_fd, IMAGE_LOCK_MODE_SHARED);
> + if (ret) {
> + error_setg_errno(errp, -ret, "Failed to lock new fd (shared)");
> + break;
> + }
> + ret = raw_lock_fd(s->lock_fd, IMAGE_LOCK_MODE_NOLOCK);
> + if (ret) {
> + error_setg_errno(errp, -ret, "Failed to unlock old fd");
> + goto restore;
> + }
> + ret = raw_lock_fd(raw_s->lock_fd, IMAGE_LOCK_MODE_EXCLUSIVE);
> + if (ret) {
> + error_setg_errno(errp, -ret, "Failed to lock new fd
> (exclusive)");
> + goto restore;
> + }
> + break;
> + case RAW_REOPEN_COMMIT:
> + break;
> + case RAW_REOPEN_ABORT:
> + raw_lock_fd(raw_s->lock_fd, IMAGE_LOCK_MODE_SHARED);
> + ret = raw_lock_fd(s->lock_fd, IMAGE_LOCK_MODE_SHARED);
> + if (ret) {
> + error_report("Failed to restore lock on old fd");
If we get here, s->lock_fd is still locked exclusively. The following is
a very personal opinion, but anyway: I think it would be be better for
it to be unlocked. If it's locked too strictly, this can really break
something; but if it's just not locked (while it should be locked in
shared mode), everything's going to be fine unless the user makes a
mistake. I think the latter is less bad.
And we can always achieve unlocking the FD by closing s->lock_fd, so I
think that is what we should do here.
> + }
> + break;
> + }
> +
> + return ret;
> +restore:
> + ret2 = raw_lock_fd(s->lock_fd, IMAGE_LOCK_MODE_SHARED);
> + if (ret2) {
> + error_report("Failed to restore old lock");
> + }
> + return ret;
> +}
> +
> +static int
> +raw_reopen_downgrade(BDRVReopenState *state,
> + RawReopenOperation op,
> + ImageLockMode old_lock,
> + ImageLockMode new_lock,
> + Error **errp)
> +{
> + BDRVRawReopenState *raw_s = state->opaque;
> + BDRVRawState *s = state->bs->opaque;
> + int ret = 0;
> +
> + assert(old_lock == IMAGE_LOCK_MODE_EXCLUSIVE);
> + assert(new_lock == IMAGE_LOCK_MODE_SHARED);
> + switch (op) {
> + case RAW_REOPEN_PREPARE:
> + break;
> + case RAW_REOPEN_COMMIT:
> + ret = raw_lock_fd(s->lock_fd, IMAGE_LOCK_MODE_SHARED);
> + if (ret) {
> + error_report("Failed to downgrade old lock");
> + break;
> + }
> + ret = raw_lock_fd(raw_s->lock_fd, IMAGE_LOCK_MODE_SHARED);
> + if (ret) {
> + error_report("Failed to lock new fd (shared)");
> + break;
> + }
> + break;
It's really unfortunate that we cannot do anything in prepare and have
to hope for the best in commit. But I can't come up with anything better.
And if something does fail, the FD will be unlocked after reopening.
That's good enough for me.
> + case RAW_REOPEN_ABORT:
> + break;
> + }
> +
> + return ret;
> +}
> +
> +/**
> + * Transactionally moving between three possible locking states is tricky and
> + * must be done carefully. That is mostly because downgrading an exclusive
> lock
> + * to shared or unlocked is not guaranteed to be revertable. As a result, in
> + * such cases we have to defer the downgraing to "commit", given that no
> revert
> + * will happen after that point, and that downgrading a lock should never
> fail.
> + *
> + * On the other hand, upgrading a lock (e.g. from unlocked or shared to
> + * exclusive lock) must happen in "prepare" because it may fail.
> + *
> + * Manage the operation matrix with this state transition table to make
> + * fulfulling above conditions easier.
> + */
> +static const struct RawReopenFuncRecord {
> + ImageLockMode old_lock;
> + ImageLockMode new_lock;
> + RawReopenFunc func;
> + bool need_lock_fd;
> +} reopen_functions[] = {
> + {IMAGE_LOCK_MODE_NOLOCK, IMAGE_LOCK_MODE_NOLOCK, raw_reopen_identical,
> false},
> + {IMAGE_LOCK_MODE_NOLOCK, IMAGE_LOCK_MODE_SHARED, raw_reopen_from_unlock,
> true},
> + {IMAGE_LOCK_MODE_NOLOCK, IMAGE_LOCK_MODE_EXCLUSIVE,
> raw_reopen_from_unlock, true},
> + {IMAGE_LOCK_MODE_SHARED, IMAGE_LOCK_MODE_NOLOCK, raw_reopen_to_unlock,
> false},
> + {IMAGE_LOCK_MODE_SHARED, IMAGE_LOCK_MODE_SHARED, raw_reopen_identical,
> false},
> + {IMAGE_LOCK_MODE_SHARED, IMAGE_LOCK_MODE_EXCLUSIVE, raw_reopen_upgrade,
> true},
> + {IMAGE_LOCK_MODE_EXCLUSIVE, IMAGE_LOCK_MODE_NOLOCK,
> raw_reopen_to_unlock, false},
> + {IMAGE_LOCK_MODE_EXCLUSIVE, IMAGE_LOCK_MODE_SHARED,
> raw_reopen_downgrade, true},
> + {IMAGE_LOCK_MODE_EXCLUSIVE, IMAGE_LOCK_MODE_EXCLUSIVE,
> raw_reopen_identical, false},
> +};
> +
> +static int raw_reopen_handle_lock(BDRVReopenState *state,
> + RawReopenOperation op,
> + Error **errp)
> +{
> + BDRVRawReopenState *raw_s = state->opaque;
Please choose another name, it's hard not to confuse this with the
BDRVRawState all the time. (e.g. raw_rs or just rs would be enough.)
(Same in the places above, I got confused more than once...)
> + BDRVRawState *s = state->bs->opaque;
> + ImageLockMode old_lock, new_lock;
> + const struct RawReopenFuncRecord *rec;
> + int ret;
> +
> + old_lock = bdrv_get_lock_mode(state->bs);
> + new_lock = bdrv_lock_mode_from_flags(state->flags);
> +
> + if (old_lock == IMAGE_LOCK_MODE__MAX) {
> + /* bs was not locked, leave it unlocked. */
> + old_lock = new_lock = IMAGE_LOCK_MODE_NOLOCK;
> + }
> +
> + if (old_lock == IMAGE_LOCK_MODE_AUTO) {
> + old_lock = bdrv_get_flags(state->bs) & BDRV_O_RDWR ?
> + IMAGE_LOCK_MODE_EXCLUSIVE : IMAGE_LOCK_MODE_SHARED;
> + }
> +
> + if (new_lock == IMAGE_LOCK_MODE_AUTO) {
> + new_lock = state->flags & BDRV_O_RDWR ?
> + IMAGE_LOCK_MODE_EXCLUSIVE : IMAGE_LOCK_MODE_SHARED;
> + }
> +
> + for (rec = &reopen_functions[0];
> + rec < &reopen_functions[ARRAY_SIZE(reopen_functions)];
> + rec++) {
> + if (rec->old_lock == old_lock && rec->new_lock == new_lock) {
> + break;
> + }
> + }
> + assert(rec != &reopen_functions[ARRAY_SIZE(reopen_functions)]);
> +
> + switch (op) {
> + case RAW_REOPEN_PREPARE:
> + if (rec->need_lock_fd) {
> + ret = qemu_dup(raw_s->fd);
> + if (ret < 0) {
> + error_setg_errno(errp, -ret, "Failed to dup new fd");
> + return ret;
> + }
> + raw_s->lock_fd = ret;
> + }
> + return rec->func(state, op, old_lock, new_lock, errp);
> + case RAW_REOPEN_COMMIT:
> + rec->func(state, op, old_lock, new_lock, errp);
> + if (rec->need_lock_fd) {
> + if (s->lock_fd >= 0) {
> + qemu_close(s->lock_fd);
> + }
> + s->lock_fd = raw_s->lock_fd;
> + }
> + break;
> + case RAW_REOPEN_ABORT:
> + rec->func(state, op, old_lock, new_lock, errp);
> + if (rec->need_lock_fd && raw_s->lock_fd >= 0) {
> + qemu_close(raw_s->lock_fd);
> + raw_s->lock_fd = -1;
> + }
> + break;
> + }
> + return 0;
> +}
> +
> static int raw_reopen_prepare(BDRVReopenState *state,
> BlockReopenQueue *queue, Error **errp)
> {
> @@ -607,6 +909,10 @@ static int raw_reopen_prepare(BDRVReopenState *state,
> }
> }
>
> + if (!ret) {
> + ret = raw_reopen_handle_lock(state, RAW_REOPEN_PREPARE, errp);
> + }
> +
> return ret;
> }
>
> @@ -617,6 +923,8 @@ static void raw_reopen_commit(BDRVReopenState *state)
>
> s->open_flags = raw_s->open_flags;
>
> + raw_reopen_handle_lock(state, RAW_REOPEN_COMMIT, NULL);
I'd prefer &error_abort instead of NULL.
> +
> qemu_close(s->fd);
> s->fd = raw_s->fd;
>
> @@ -634,6 +942,8 @@ static void raw_reopen_abort(BDRVReopenState *state)
> return;
> }
>
> + raw_reopen_handle_lock(state, RAW_REOPEN_ABORT, NULL);
Same here.
> +
> if (raw_s->fd >= 0) {
> qemu_close(raw_s->fd);
> raw_s->fd = -1;
> @@ -1321,6 +1631,10 @@ static void raw_close(BlockDriverState *bs)
> qemu_close(s->fd);
> s->fd = -1;
> }
> + if (s->lock_fd >= 0) {
> + qemu_close(s->lock_fd);
> + s->lock_fd = -1;
> + }
> }
>
> static int raw_truncate(BlockDriverState *bs, int64_t offset)
> @@ -1874,7 +2188,7 @@ BlockDriver bdrv_file = {
> .bdrv_get_info = raw_get_info,
> .bdrv_get_allocated_file_size
> = raw_get_allocated_file_size,
> -
> + .bdrv_lockf = raw_lockf,
> .create_opts = &raw_create_opts,
> };
>
> @@ -2324,6 +2638,8 @@ static BlockDriver bdrv_host_device = {
> #ifdef __linux__
> .bdrv_aio_ioctl = hdev_aio_ioctl,
> #endif
> +
> + .bdrv_lockf = raw_lockf,
> };
>
> #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
>
Overall design looks good to me. Unfortunately we can't strictly follow
the transaction principle for downgrades and upgrades, but I guess it'll
work most of the time (fingers crossed).
Max
signature.asc
Description: OpenPGP digital signature
- Re: [Qemu-devel] [PATCH v8 05/36] raw-posix: Add image locking support,
Max Reitz <=