Re: [Qemu-devel] [RFC PATCH] Convert ram_list to RCU DQ V2


From: Paolo Bonzini
Subject: Re: [Qemu-devel] [RFC PATCH] Convert ram_list to RCU DQ V2
Date: Fri, 30 Aug 2013 18:38:50 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20130805 Thunderbird/17.0.8

On 30/08/2013 18:06, Mike Day wrote:
> Changes from V1:
> 
> * Omitted locks or rcu critical sections within some functions that
>   read or write the ram_list but are called in a protected context
>   (the caller holds the iothread lock, the ram_list mutex, or an rcu
>   critical section).
> 
> Allow "unlocked" reads of the ram_list by using an RCU-enabled
> DQ. Most readers of the list no longer require holding the list mutex.
> 
> The ram_list now uses a QLIST instead of a QTAILQ. The difference is
> minimal.
> 
> This patch has been built and make-checked for the x86_64, ppc64,
> s390x, and arm targets. It has not been tested further than that at
> this point.
> 
> To apply this patch, you must base upon Paolo Bonzini's rcu tree and
> also apply the RCU DQ patch (below).
> 
> https://github.com/bonzini/qemu/tree/rcu
> http://article.gmane.org/gmane.comp.emulators.qemu/230159/
> 
> Signed-off-by: Mike Day <address@hidden>

Thanks.  I moved the include/qemu/rcu_queue.h part to the previous patch
and applied it.

ram_save_complete is called with the iothread lock held, so you can get
rid of the ramlist mutex completely.

> -    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +     /* This assumes the iothread lock or the ram_list mutex is taken.
> +     * if that changes, accesses to ram_list need to be protected
> +     * by a mutex (writes) or an rcu read lock (reads)
> +     */
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          for (addr = 0; addr < block->length; addr += TARGET_PAGE_SIZE) {
>              if (memory_region_test_and_clear_dirty(block->mr,
>                                                     addr, TARGET_PAGE_SIZE,

Perhaps taking rcu_read_lock/unlock unconditionally is simpler here, since
it's cheap?
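
Something like this, reusing the loop from the hunk above (the dirty-memory
client argument is from memory, and the bookkeeping inside the if is elided):

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        for (addr = 0; addr < block->length; addr += TARGET_PAGE_SIZE) {
            if (memory_region_test_and_clear_dirty(block->mr,
                                                   addr, TARGET_PAGE_SIZE,
                                                   DIRTY_MEMORY_MIGRATION)) {
                /* dirty-page bookkeeping, unchanged from your patch */
            }
        }
    }
    rcu_read_unlock();

Then the comment about which lock the caller holds can go away entirely.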

> @@ -815,6 +822,10 @@ static inline void *host_from_stream_offset(QEMUFile *f,
>      char id[256];
>      uint8_t len;
>  
> +    /* Must be called from within a rcu critical section.
> +     * Returns a pointer from within the RCU-protected ram_list.
> +     */
> +
>      if (flags & RAM_SAVE_FLAG_CONTINUE) {
>          if (!block) {
>              fprintf(stderr, "Ack, bad migration stream!\n");

Please put the comment before the "static inline void
*host_from_stream_offset" line.
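
That is, something like this (the parameter list after QEMUFile *f is
written from memory, so please double-check it against the tree):

    /* Must be called from within a rcu critical section.
     * Returns a pointer from within the RCU-protected ram_list.
     */
    static inline void *host_from_stream_offset(QEMUFile *f,
                                                ram_addr_t offset,
                                                int flags)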

> @@ -828,9 +839,10 @@ static inline void *host_from_stream_offset(QEMUFile *f,
>      qemu_get_buffer(f, (uint8_t *)id, len);
>      id[len] = 0;
>  
> -    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> -        if (!strncmp(id, block->idstr, sizeof(id)))
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
> +        if (!strncmp(id, block->idstr, sizeof(id))) {
>              return memory_region_get_ram_ptr(block->mr) + offset;
> +        }
>      }
>  
>      fprintf(stderr, "Can't find block %s!\n", id);
> @@ -867,7 +879,12 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
>      if (version_id < 4 || version_id > 4) {
>          return -EINVAL;
>      }
> -
> +    /* this implements a long-running RCU critical section.
> +     * When rcu reclaims in the code start to become numerous
> +     * it will be necessary to reduce the granularity of this critical
> +     * section.
> +     */

Please add the same comment (and a rcu_read_lock/unlock pair replacing
the ramlist mutex) in ram_save_iterate, too.
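
Roughly like this; only the locking changes, the body of ram_save_iterate
stays as it is:

    static int ram_save_iterate(QEMUFile *f, void *opaque)
    {
        /* ... locals and setup unchanged ... */

        /* This implements a long-running RCU critical section.  When rcu
         * reclaims in the code start to become numerous it will be
         * necessary to reduce the granularity of this critical section.
         */
        rcu_read_lock();                /* was: qemu_mutex_lock_ramlist() */

        /* ... page-sending loop over ram_list, unchanged ... */

        rcu_read_unlock();              /* was: qemu_mutex_unlock_ramlist() */

        /* ... flush and return, unchanged ... */
    }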

> diff --git a/exec.c b/exec.c
> index 5eebcc1..d1132da 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -46,7 +46,7 @@
>  #endif
>  #include "exec/cpu-all.h"
>  #include "qemu/tls.h"
> -
> +#include "qemu/rcu_queue.h"
>  #include "exec/cputlb.h"
>  #include "translate-all.h"
>  
> @@ -57,7 +57,7 @@
>  #if !defined(CONFIG_USER_ONLY)
>  static int in_migration;
>  
> -RAMList ram_list = { .blocks = QTAILQ_HEAD_INITIALIZER(ram_list.blocks) };
> +RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
>  
>  static MemoryRegion *system_memory;
>  static MemoryRegion *system_io;
> @@ -1021,17 +1021,24 @@ static ram_addr_t find_ram_offset(ram_addr_t size)
>      RAMBlock *block, *next_block;
>      ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
>  
> +    /* ram_list must be protected by a mutex (for writes), or

s/a mutex/the iothread lock/

> +     * an rcu critical section (for reads). Currently this code
> +     * is called with the iothread lock held. If that changes,
> +     * make sure to protect ram_list with an rcu critical section.
> +    */

Right---also because find_ram_offset returns a value that is within the
RCU-protected list.
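
So a short comment above find_ram_offset would be enough, something along
these lines (just a wording suggestion):

    /* Called within an RCU critical section, or with the iothread lock
     * held.  Returns an offset into the RCU-protected ram_list, so the
     * caller must keep the lock held while it uses the result.
     */
    static ram_addr_t find_ram_offset(ram_addr_t size)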

> @@ -1126,13 +1138,18 @@ static int memory_try_enable_merging(void *addr, size_t len)
>  ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
>                                     MemoryRegion *mr)
>  {
> -    RAMBlock *block, *new_block;
> +    RAMBlock *block, *new_block, *last_block = 0;
>  
>      size = TARGET_PAGE_ALIGN(size);
>      new_block = g_malloc0(sizeof(*new_block));
>  
> -    /* This assumes the iothread lock is taken here too.  */
> -    qemu_mutex_lock_ramlist();
> +    /* ram_list needs to be protected by the ram_list mutex
> +     * for writes, and by an rcu critical section for reads.
> +     * Currently this function is called with the iothread lock
> +     * being held, so we can forego protecting the ram_list.
> +     * when that changes, acquire the ram_list mutex before
> +     * sorting and writing the list below.
> +     */

Please remove other references to the ram_list mutex---in practice, the
ram_list is protected by the BQL.  The practice of using the BQL for the
write side is not going to go away anytime soon, so we might as well
make the code simpler and codify it.

>      new_block->mr = mr;
>      new_block->offset = find_ram_offset(size);
>      if (host) {
> @@ -1164,21 +1181,28 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
>      }
>      new_block->length = size;
>  
> +    /* when this code is called without the iothread lock, protect
> +     * the ram_list here by acquiring its mutex.
> +     */
>      /* Keep the list sorted from biggest to smallest block.  */
> -    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
> +        last_block = block;
>          if (block->length < new_block->length) {
>              break;
>          }
>      }
>      if (block) {
> -        QTAILQ_INSERT_BEFORE(block, new_block, next);
> +        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
>      } else {
> -        QTAILQ_INSERT_TAIL(&ram_list.blocks, new_block, next);
> +        if (last_block) {
> +            QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
> +        } else { /* list is empty */
> +            QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
> +        }
>      }
>      ram_list.mru_block = NULL;
>  
>      ram_list.version++;
> -    qemu_mutex_unlock_ramlist();
>  
>      ram_list.phys_dirty = g_realloc(ram_list.phys_dirty,
>                                         last_ram_offset() >> TARGET_PAGE_BITS);
> @@ -1204,29 +1228,32 @@ void qemu_ram_free_from_ptr(ram_addr_t addr)
>  {
>      RAMBlock *block;
>  
> -    /* This assumes the iothread lock is taken here too.  */
> -    qemu_mutex_lock_ramlist();
> -    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +    /* This assumes the iothread lock is taken here too.
> +     * when this code is called without the iothread lock, protect
> +     * the ram_list here by acquiring its mutex.
> +     */
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          if (addr == block->offset) {
> -            QTAILQ_REMOVE(&ram_list.blocks, block, next);
> +            QLIST_REMOVE_RCU(block, next);
>              ram_list.mru_block = NULL;
>              ram_list.version++;
>              g_free(block);
>              break;
>          }
>      }
> -    qemu_mutex_unlock_ramlist();
>  }
>  
>  void qemu_ram_free(ram_addr_t addr)
>  {
>      RAMBlock *block;
>  
> -    /* This assumes the iothread lock is taken here too.  */
> -    qemu_mutex_lock_ramlist();
> -    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +    /* This assumes the iothread lock is taken here too.
> +     * if that changes, accesses to ram_list need to be protected
> +     * by a mutex (writes) or an rcu read lock (reads)
> +     */

Same here.  Also, wherever a loop writes to the ram_list, there's no
need to mention RCU.  It's already evident from the code's usage of
QLIST_FOREACH_RCU.  So:

- read-side with rcu_read_lock -> no comment

- read-side without rcu_read_lock, returns RCU-protected value (which
includes ram_addr_t) -> comment that caller must hold RCU lock or
iothread mutex

- read-side without rcu_read_lock, does not return RCU-protected value
-> don't do it, just use rcu_read_lock :)

- write-side -> no comment (but please make sure the policy is
documented where ram_list is declared)
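
For the last point, something like this near the declaration in
include/exec/cpu-all.h might do (just a sketch of the wording):

    typedef struct RAMList {
        uint8_t *phys_dirty;
        RAMBlock *mru_block;
        /* Writers are serialized by the iothread lock (BQL) and must use
         * the RCU list macros; no per-site comment is needed on the write
         * side.  Readers either run under rcu_read_lock, or rely on the
         * caller holding the iothread lock when they return an
         * RCU-protected value (a RAMBlock pointer or a ram_addr_t).
         */
        QLIST_HEAD(, RAMBlock) blocks;
        uint32_t version;
    } RAMList;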


> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          if (addr == block->offset) {
> -            QTAILQ_REMOVE(&ram_list.blocks, block, next);
> +            QLIST_REMOVE_RCU(block, next);
>              ram_list.mru_block = NULL;
>              ram_list.version++;
>              if (block->flags & RAM_PREALLOC_MASK) {

...

            g_free(block);

This should be changed to call_rcu.
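
Roughly like this, assuming RAMBlock grows a struct rcu_head member (called
"rcu" below; if I remember the call_rcu() macro in the rcu tree right, it
has to be the struct's first field), with the per-block cleanup
(RAM_PREALLOC_MASK, xen, munmap and friends) moved into the callback:

    static void reclaim_ramblock(RAMBlock *block)
    {
        /* cleanup that used to run inline before g_free goes here */
        g_free(block);
    }

and at the call site, instead of freeing inline:

            QLIST_REMOVE_RCU(block, next);
            ram_list.mru_block = NULL;
            ram_list.version++;
            call_rcu(block, reclaim_ramblock, rcu);

so the block is only reclaimed after a grace period, when no reader can
still hold a pointer to it.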

> @@ -1253,8 +1280,6 @@ void qemu_ram_free(ram_addr_t addr)
>              break;
>          }
>      }
> -    qemu_mutex_unlock_ramlist();
> -
>  }
>  
>  #ifndef _WIN32
> @@ -1265,7 +1290,8 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
>      int flags;
>      void *area, *vaddr;
>  
> -    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +    rcu_read_lock();

This is a write side, so no need to use rcu_read_lock.

(Note that I'm not commenting on all the occurrences).

> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          offset = addr - block->offset;
>          if (offset < block->length) {
>              vaddr = block->host + offset;
> @@ -1313,9 +1339,11 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
>                  memory_try_enable_merging(vaddr, length);
>                  qemu_ram_setup_dump(vaddr, length);
>              }
> -            return;
> +            goto unlock_out;
>          }
>      }
> +unlock_out:
> +    rcu_read_unlock();
>  }
>  #endif /* !_WIN32 */
>  
> @@ -1323,12 +1351,15 @@ static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
>  {
>      RAMBlock *block;
>  
> -    /* The list is protected by the iothread lock here.  */
> +   /* This assumes the iothread lock is taken here too.
> +     * if that changes, accesses to ram_list need to be protected
> +     * by a mutex (writes) or an rcu read lock (reads)
> +     */
>      block = ram_list.mru_block;
>      if (block && addr - block->offset < block->length) {
>          goto found;
>      }
> -    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          if (addr - block->offset < block->length) {
>              goto found;
>          }
> @@ -1378,8 +1409,11 @@ static void *qemu_safe_ram_ptr(ram_addr_t addr)
>  {
>      RAMBlock *block;
>  
> -    /* The list is protected by the iothread lock here.  */
> -    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +    /* This assumes the iothread lock is taken here too.
> +     * if that changes, accesses to ram_list need to be protected
> +     * by a mutex (writes) or an rcu read lock (reads)
> +     */
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          if (addr - block->offset < block->length) {
>              if (xen_enabled()) {
>                  /* We need to check if the requested address is in the RAM
> @@ -1399,7 +1433,6 @@ static void *qemu_safe_ram_ptr(ram_addr_t addr)
>  
>      fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
>      abort();
> -
>      return NULL;
>  }
>  
> @@ -1407,6 +1440,7 @@ static void *qemu_safe_ram_ptr(ram_addr_t addr)
>   * but takes a size argument */
>  static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
>  {
> +    void *ptr = NULL;
>      if (*size == 0) {
>          return NULL;
>      }
> @@ -1414,18 +1448,22 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
>          return xen_map_cache(addr, *size, 1);
>      } else {
>          RAMBlock *block;
> -
> -        QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +        rcu_read_lock();
> +        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>              if (addr - block->offset < block->length) {
>                  if (addr - block->offset + *size > block->length)
>                      *size = block->length - addr + block->offset;
> -                return block->host + (addr - block->offset);
> +                ptr = block->host + (addr - block->offset);
> +                goto unlock_out;
>              }
>          }
>  
>          fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
>          abort();
>      }
> +unlock_out:
> +    rcu_read_unlock();
> +    return ptr;
>  }

This returns a RCU-protected value.  However, it's okay to include the
function in a "small" critical section instead of involving the caller,
because the caller is ensuring that the returned block doesn't disappear
(through memory_region_ref/unref).  Perhaps you can add a comment.
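
For example, a short comment above the function (again only a suggestion
for the wording):

    /* Return a host pointer to ram allocated with qemu_ram_alloc, but
     * take a *size argument.  Runs its own (short) RCU critical section;
     * the caller must keep the block alive across the call, e.g. via
     * memory_region_ref/unref, for the returned pointer to stay valid.
     */
    static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)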

>  /* Some of the softmmu routines need to translate from a host pointer
> @@ -1434,32 +1472,35 @@ MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
>  {
>      RAMBlock *block;
>      uint8_t *host = ptr;
> +    MemoryRegion *mr = NULL;
>  
>      if (xen_enabled()) {
>          *ram_addr = xen_ram_addr_from_mapcache(ptr);
>          return qemu_get_ram_block(*ram_addr)->mr;
>      }
> -
> +    rcu_read_lock();
>      block = ram_list.mru_block;
>      if (block && block->host && host - block->host < block->length) {
> -        goto found;
> +        *ram_addr = block->offset + (host - block->host);
> +        mr = block->mr;
> +        goto unlock_out;
>      }
>  
> -    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          /* This case append when the block is not mapped. */
>          if (block->host == NULL) {
>              continue;
>          }
>          if (host - block->host < block->length) {
> -            goto found;
> +            *ram_addr = block->offset + (host - block->host);
> +            mr = block->mr;
> +            goto unlock_out;
>          }
>      }
>  
> -    return NULL;
> -
> -found:
> -    *ram_addr = block->offset + (host - block->host);
> -    return block->mr;
> +unlock_out:
> +    rcu_read_unlock();
> +    return mr;

Same here.  Perhaps add a comment that the caller must either already have
a reference to mr (as is the case for address_space_unmap) or be holding
the iothread mutex.
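
E.g. right above qemu_ram_addr_from_host:

    /* The returned MemoryRegion comes from the RCU-protected ram_list:
     * the caller must either already hold a reference to it (as
     * address_space_unmap does) or be holding the iothread mutex.
     */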

In general, a good comment is IMO one that occurs once or twice.  A bad
comment is one that is cut-and-pasted all over the place.

Paolo

>  }
>  
>  static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
> @@ -2709,9 +2750,10 @@ bool cpu_physical_memory_is_io(hwaddr phys_addr)
>  void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
>  {
>      RAMBlock *block;
> -
> -    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +    rcu_read_lock();
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          func(block->host, block->offset, block->length, opaque);
>      }
> +    rcu_read_unlock();
>  }
>  #endif
> diff --git a/hw/9pfs/virtio-9p-synth.c b/hw/9pfs/virtio-9p-synth.c
> index fdfea21..c2efaca 100644
> --- a/hw/9pfs/virtio-9p-synth.c
> +++ b/hw/9pfs/virtio-9p-synth.c
> @@ -18,7 +18,7 @@
>  #include "fsdev/qemu-fsdev.h"
>  #include "virtio-9p-synth.h"
>  #include "qemu/rcu.h"
> -
> +#include "qemu/rcu_queue.h"
>  #include <sys/stat.h>
>  
>  /* Root node for synth file system */
> diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
> index e088089..9cd8a30 100644
> --- a/include/exec/cpu-all.h
> +++ b/include/exec/cpu-all.h
> @@ -457,7 +457,7 @@ typedef struct RAMBlock {
>      /* Reads can take either the iothread or the ramlist lock.
>       * Writes must take both locks.
>       */
> -    QTAILQ_ENTRY(RAMBlock) next;
> +    QLIST_ENTRY(RAMBlock) next;
>  #if defined(__linux__) && !defined(TARGET_S390X)
>      int fd;
>  #endif
> @@ -469,7 +469,7 @@ typedef struct RAMList {
>      uint8_t *phys_dirty;
>      RAMBlock *mru_block;
>      /* Protected by the ramlist lock.  */
> -    QTAILQ_HEAD(, RAMBlock) blocks;
> +    QLIST_HEAD(, RAMBlock) blocks;
>      uint32_t version;
>  } RAMList;
>  extern RAMList ram_list;
> diff --git a/include/qemu/rcu_queue.h b/include/qemu/rcu_queue.h
> index e2b8ba5..d159850 100644
> --- a/include/qemu/rcu_queue.h
> +++ b/include/qemu/rcu_queue.h
> @@ -37,6 +37,14 @@
>  extern "C" {
>  #endif
>  
> +
> +/*
> + * List access methods.
> + */
> +#define QLIST_EMPTY_RCU(head) (atomic_rcu_read(&(head)->lh_first) == NULL)
> +#define QLIST_FIRST_RCU(head) (atomic_rcu_read(&(head)->lh_first))
> +#define QLIST_NEXT_RCU(elm, field) (atomic_rcu_read(&(elm)->field.le_next))
> +
>  /*
>   * List functions.
>   */
> 



