qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v2] block/blklogwrites: Protect mutable driver state with a m


From: Kevin Wolf
Subject: Re: [PATCH v2] block/blklogwrites: Protect mutable driver state with a mutex.
Date: Thu, 18 Jan 2024 20:18:40 +0100

Am 11.01.2024 um 17:32 hat Ari Sundholm geschrieben:
> During the review of a fix for a concurrency issue in blklogwrites,
> it was found that the driver needs an additional fix when enabling
> multiqueue, which is a new feature introduced in QEMU 9.0, as the
> driver state may be read and written by multiple threads at the same
> time, which was not the case when the driver was originally written.
> 
> Fix the multi-threaded scenario by introducing a mutex to protect the
> mutable fields in the driver state, and always having the mutex locked
> by the current thread when accessing them. Also use the mutex and a
> condition variable to ensure that the super block is not being written
> to by multiple threads concurrently.
> 
> Additionally, add the const qualifier to a few BDRVBlkLogWritesState
> pointer targets in contexts where the driver state is not written to.
> 
> Signed-off-by: Ari Sundholm <ari@tuxera.com>
> 
> v1->v2: Ensure that the super block is not written to concurrently.
> ---
>  block/blklogwrites.c | 77 +++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 69 insertions(+), 8 deletions(-)
> 
> diff --git a/block/blklogwrites.c b/block/blklogwrites.c
> index ba717dab4d..f8bec7c863 100644
> --- a/block/blklogwrites.c
> +++ b/block/blklogwrites.c
> @@ -3,7 +3,7 @@
>   *
>   * Copyright (c) 2017 Tuomas Tynkkynen <tuomas@tuxera.com>
>   * Copyright (c) 2018 Aapo Vienamo <aapo@tuxera.com>
> - * Copyright (c) 2018 Ari Sundholm <ari@tuxera.com>
> + * Copyright (c) 2018-2024 Ari Sundholm <ari@tuxera.com>
>   *
>   * This work is licensed under the terms of the GNU GPL, version 2 or later.
>   * See the COPYING file in the top-level directory.
> @@ -55,9 +55,34 @@ typedef struct {
>      BdrvChild *log_file;
>      uint32_t sectorsize;
>      uint32_t sectorbits;
> +    uint64_t update_interval;
> +
> +    /*
> +     * The mutable state of the driver, consisting of the current log sector
> +     * and the number of log entries.
> +     *
> +     * May be read and/or written from multiple threads, and the mutex must 
> be
> +     * held when accessing these fields.
> +     */
>      uint64_t cur_log_sector;
>      uint64_t nr_entries;
> -    uint64_t update_interval;
> +    QemuMutex mutex;
> +
> +    /*
> +     * The super block sequence number. Non-zero if a super block update is 
> in
> +     * progress.
> +     *
> +     * The mutex must be held when accessing this field.
> +     */
> +    uint64_t super_update_seq;
> +
> +    /*
> +     * A condition variable to wait for and signal finished superblock 
> updates.
> +     *
> +     * Used with the mutex to ensure that only one thread be updating the 
> super
> +     * block at a time.
> +     */
> +    QemuCond super_updated;
>  } BDRVBlkLogWritesState;
>  
>  static QemuOptsList runtime_opts = {
> @@ -169,6 +194,9 @@ static int blk_log_writes_open(BlockDriverState *bs, 
> QDict *options, int flags,
>          goto fail;
>      }
>  
> +    qemu_mutex_init(&s->mutex);
> +    qemu_cond_init(&s->super_updated);
> +
>      log_append = qemu_opt_get_bool(opts, "log-append", false);
>  
>      if (log_append) {
> @@ -231,6 +259,8 @@ static int blk_log_writes_open(BlockDriverState *bs, 
> QDict *options, int flags,
>          s->nr_entries = 0;
>      }
>  
> +    s->super_update_seq = 0;
> +
>      if (!blk_log_writes_sector_size_valid(log_sector_size)) {
>          ret = -EINVAL;
>          error_setg(errp, "Invalid log sector size %"PRIu64, log_sector_size);
> @@ -255,6 +285,8 @@ fail_log:
>          bdrv_unref_child(bs, s->log_file);
>          bdrv_graph_wrunlock();
>          s->log_file = NULL;
> +        qemu_cond_destroy(&s->super_updated);
> +        qemu_mutex_destroy(&s->mutex);
>      }
>  fail:
>      qemu_opts_del(opts);
> @@ -269,6 +301,8 @@ static void blk_log_writes_close(BlockDriverState *bs)
>      bdrv_unref_child(bs, s->log_file);
>      s->log_file = NULL;
>      bdrv_graph_wrunlock();
> +    qemu_cond_destroy(&s->super_updated);
> +    qemu_mutex_destroy(&s->mutex);
>  }
>  
>  static int64_t coroutine_fn GRAPH_RDLOCK
> @@ -295,7 +329,7 @@ static void blk_log_writes_child_perm(BlockDriverState 
> *bs, BdrvChild *c,
>  
>  static void blk_log_writes_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
> -    BDRVBlkLogWritesState *s = bs->opaque;
> +    const BDRVBlkLogWritesState *s = bs->opaque;
>      bs->bl.request_alignment = s->sectorsize;
>  }
>  
> @@ -338,15 +372,18 @@ blk_log_writes_co_do_log(BlkLogWritesLogReq *lr)
>       * driver may be modified by other driver operations while waiting for 
> the
>       * I/O to complete.
>       */
> +    qemu_mutex_lock(&s->mutex);
>      const uint64_t entry_start_sector = s->cur_log_sector;
>      const uint64_t entry_offset = entry_start_sector << s->sectorbits;
>      const uint64_t qiov_aligned_size = ROUND_UP(lr->qiov->size, 
> s->sectorsize);
>      const uint64_t entry_aligned_size = qiov_aligned_size +
>          ROUND_UP(lr->zero_size, s->sectorsize);
>      const uint64_t entry_nr_sectors = entry_aligned_size >> s->sectorbits;
> +    const uint64_t entry_seq = s->nr_entries + 1;
>  
> -    s->nr_entries++;
> +    s->nr_entries = entry_seq;
>      s->cur_log_sector += entry_nr_sectors;
> +    qemu_mutex_unlock(&s->mutex);
>  
>      /*
>       * Write the log entry. Note that if this is a "write zeroes" operation,
> @@ -366,17 +403,34 @@ blk_log_writes_co_do_log(BlkLogWritesLogReq *lr)
>  
>      /* Update super block on flush or every update interval */
>      if (lr->log_ret == 0 && ((lr->entry.flags & LOG_FLUSH_FLAG)
> -        || (s->nr_entries % s->update_interval == 0)))
> +        || (entry_seq % s->update_interval == 0)))
>      {
>          struct log_write_super super = {
>              .magic      = cpu_to_le64(WRITE_LOG_MAGIC),
>              .version    = cpu_to_le64(WRITE_LOG_VERSION),
> -            .nr_entries = cpu_to_le64(s->nr_entries),
> +            .nr_entries = const_le64(0),
>              .sectorsize = cpu_to_le32(s->sectorsize),
>          };
> -        void *zeroes = g_malloc0(s->sectorsize - sizeof(super));
> +        void *zeroes;
>          QEMUIOVector qiov;
>  
> +        /*
> +         * Wait if a super block update is already in progress.
> +         * Bail out if a newer update got its turn before us.
> +         */
> +        WITH_QEMU_LOCK_GUARD(&s->mutex) {
> +            while (s->super_update_seq) {
> +                if (entry_seq < s->super_update_seq) {
> +                    return;
> +                }
> +                qemu_cond_wait(&s->super_updated, &s->mutex);

This will block, which is exactly what you want if another thread is
writing the super block.

However, in a single-threaded case where it's just the previous request
coroutine that is still writing its super block (i.e. bdrv_co_pwritev()
below has yielded), this will deadlock because we'll never switch back
and actually complete the previous super block write.

So unless I'm missing a reason why this won't happen, I think you need a
coroutine aware mechanism here. The obvious options would be using a
CoMutex in the first place and holding it across the I/O operation or
keeping the cheaper QemuMutex and replacing the condition variable with
a CoQueue.

> +            }
> +            s->super_update_seq = entry_seq;
> +            super.nr_entries = cpu_to_le64(s->nr_entries);
> +        }
> +
> +        zeroes = g_malloc0(s->sectorsize - sizeof(super));
> +
>          qemu_iovec_init(&qiov, 2);
>          qemu_iovec_add(&qiov, &super, sizeof(super));
>          qemu_iovec_add(&qiov, zeroes, s->sectorsize - sizeof(super));

Kevin




reply via email to

[Prev in Thread] Current Thread [Next in Thread]