From: Leon Alrae
Subject: Re: [Qemu-devel] [PULL 02/32] memory: RCU ram_list.dirty_memory[] for safe RAM hotplug
Date: Wed, 10 Feb 2016 16:56:28 +0000
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.4.0
Hi,

I just noticed a significant performance hit with this change. Booting a
small system (I tried qemu-system-mips only) usually takes around 20
seconds; with this change it now takes around 3 minutes.
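
Skimming the converted helpers, one possible culprit, assuming the
bitops.h signature find_next_bit(addr, size, offset): the calls in
cpu_physical_memory_get_dirty() and cpu_physical_memory_all_dirty()
below pass (offset, num) where (size, offset) is expected, so they
search the wrong bit range. A minimal, untested sketch of what was
presumably intended:

    /* Untested sketch: search bits [offset, offset + num) for a set
     * bit.  find_next_bit() takes (addr, size, offset), where "size"
     * is one past the last bit index to examine. */
    unsigned long found = find_next_bit(blocks->blocks[idx],
                                        offset + num,  /* size */
                                        offset);       /* start */
    if (found < offset + num) {
        dirty = true;
        break;
    }

If those dirty checks can return wrong answers, TCG's code-dirty
tracking takes the hit on every translated block, which could explain
a slowdown of this size.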
Leon
On 09/02/16 12:13, Paolo Bonzini wrote:
> From: Stefan Hajnoczi <address@hidden>
>
> Although accesses to ram_list.dirty_memory[] use atomics so multiple
> threads can safely dirty the bitmap, the data structure is not fully
> thread-safe yet.
>
> This patch handles the RAM hotplug case where ram_list.dirty_memory[] is
> grown. ram_list.dirty_memory[] is changed from a regular bitmap to an
> RCU array of pointers to fixed-size bitmap blocks. Threads can continue
> accessing bitmap blocks while the array is being extended. See the
> comments in the code for an in-depth explanation of struct
> DirtyMemoryBlocks.
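
For readers following the diff: every converted helper walks the split
bitmap with the same idiom. A minimal sketch, reusing the names from
the patch:

    while (page < end) {
        unsigned long idx    = page / DIRTY_MEMORY_BLOCK_SIZE; /* which block */
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE; /* bit within it */
        /* Clamp the chunk so it never crosses a block boundary. */
        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);

        /* ...operate on bits [offset, offset + num) of blocks->blocks[idx]... */

        page += num;
    }
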
>
> I have tested that live migration with virtio-blk dataplane works.
>
> Signed-off-by: Stefan Hajnoczi <address@hidden>
> Message-Id: <address@hidden>
> Signed-off-by: Paolo Bonzini <address@hidden>
> ---
> exec.c | 75 +++++++++++++++----
> include/exec/ram_addr.h | 189 ++++++++++++++++++++++++++++++++++++++++++------
> migration/ram.c | 4 -
> 3 files changed, 225 insertions(+), 43 deletions(-)
>
> diff --git a/exec.c b/exec.c
> index ab37360..7d67c11 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -980,8 +980,9 @@ bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
> ram_addr_t length,
> unsigned client)
> {
> + DirtyMemoryBlocks *blocks;
> unsigned long end, page;
> - bool dirty;
> + bool dirty = false;
>
> if (length == 0) {
> return false;
> @@ -989,8 +990,22 @@ bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
>
> end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
> page = start >> TARGET_PAGE_BITS;
> - dirty = bitmap_test_and_clear_atomic(ram_list.dirty_memory[client],
> - page, end - page);
> +
> + rcu_read_lock();
> +
> + blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
> +
> + while (page < end) {
> + unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
> + unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
> + unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
> +
> + dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
> + offset, num);
> + page += num;
> + }
> +
> + rcu_read_unlock();
>
> if (dirty && tcg_enabled()) {
> tlb_reset_dirty_range_all(start, length);
> @@ -1504,6 +1519,47 @@ int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)
> return 0;
> }
>
> +/* Called with ram_list.mutex held */
> +static void dirty_memory_extend(ram_addr_t old_ram_size,
> + ram_addr_t new_ram_size)
> +{
> + ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
> + DIRTY_MEMORY_BLOCK_SIZE);
> + ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
> + DIRTY_MEMORY_BLOCK_SIZE);
> + int i;
> +
> + /* Only need to extend if block count increased */
> + if (new_num_blocks <= old_num_blocks) {
> + return;
> + }
> +
> + for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
> + DirtyMemoryBlocks *old_blocks;
> + DirtyMemoryBlocks *new_blocks;
> + int j;
> +
> + old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
> + new_blocks = g_malloc(sizeof(*new_blocks) +
> + sizeof(new_blocks->blocks[0]) * new_num_blocks);
> +
> + if (old_num_blocks) {
> + memcpy(new_blocks->blocks, old_blocks->blocks,
> + old_num_blocks * sizeof(old_blocks->blocks[0]));
> + }
> +
> + for (j = old_num_blocks; j < new_num_blocks; j++) {
> + new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
> + }
> +
> + atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
> +
> + if (old_blocks) {
> + g_free_rcu(old_blocks, rcu);
> + }
> + }
> +}
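
Side note for readers: dirty_memory_extend() deliberately frees only
the old pointer array, never the per-block bitmaps, because the old
and new arrays share them. That is what keeps readers safe; a minimal
sketch of the matching reader pattern, assuming idx and offset were
computed as above:

    rcu_read_lock();

    /* This snapshot stays valid even if dirty_memory_extend() replaces
     * the array concurrently: the old array is freed only after all
     * RCU readers have left, and the bitmap blocks it points to are
     * never freed on grow. */
    DirtyMemoryBlocks *blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);

    set_bit_atomic(offset, blocks->blocks[idx]);

    rcu_read_unlock();
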
> +
> static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
> {
> RAMBlock *block;
> @@ -1543,6 +1599,7 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
> (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
> if (new_ram_size > old_ram_size) {
> migration_bitmap_extend(old_ram_size, new_ram_size);
> + dirty_memory_extend(old_ram_size, new_ram_size);
> }
> /* Keep the list sorted from biggest to smallest block. Unlike QTAILQ,
> * QLIST (which has an RCU-friendly variant) does not have insertion at
> @@ -1568,18 +1625,6 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
> ram_list.version++;
> qemu_mutex_unlock_ramlist();
>
> - new_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
> -
> - if (new_ram_size > old_ram_size) {
> - int i;
> -
> - /* ram_list.dirty_memory[] is protected by the iothread lock. */
> - for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
> - ram_list.dirty_memory[i] =
> - bitmap_zero_extend(ram_list.dirty_memory[i],
> - old_ram_size, new_ram_size);
> - }
> - }
> cpu_physical_memory_set_dirty_range(new_block->offset,
> new_block->used_length,
> DIRTY_CLIENTS_ALL);
> diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
> index f2e872d..b1413a1 100644
> --- a/include/exec/ram_addr.h
> +++ b/include/exec/ram_addr.h
> @@ -49,13 +49,43 @@ static inline void *ramblock_ptr(RAMBlock *block, ram_addr_t offset)
> return (char *)block->host + offset;
> }
>
> +/* The dirty memory bitmap is split into fixed-size blocks to allow growth
> + * under RCU. The bitmap for a block can be accessed as follows:
> + *
> + * rcu_read_lock();
> + *
> + * DirtyMemoryBlocks *blocks =
> + * atomic_rcu_read(&ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION]);
> + *
> + * ram_addr_t idx = (addr >> TARGET_PAGE_BITS) / DIRTY_MEMORY_BLOCK_SIZE;
> + * unsigned long *block = blocks->blocks[idx];
> + * ...access block bitmap...
> + *
> + * rcu_read_unlock();
> + *
> + * Remember to check for the end of the block when accessing a range of
> + * addresses. Move on to the next block if you reach the end.
> + *
> + * Organization into blocks allows dirty memory to grow (but not shrink) under
> + * RCU. When adding new RAMBlocks requires the dirty memory to grow, a new
> + * DirtyMemoryBlocks array is allocated with pointers to existing blocks kept
> + * the same. Other threads can safely access existing blocks while dirty
> + * memory is being grown. When no threads are using the old DirtyMemoryBlocks
> + * anymore it is freed by RCU (but the underlying blocks stay because they are
> + * pointed to from the new DirtyMemoryBlocks).
> + */
> +#define DIRTY_MEMORY_BLOCK_SIZE ((ram_addr_t)256 * 1024 * 8)
> +typedef struct {
> + struct rcu_head rcu;
> + unsigned long *blocks[];
> +} DirtyMemoryBlocks;
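
For scale, assuming 4 KiB target pages (TARGET_PAGE_BITS == 12):
DIRTY_MEMORY_BLOCK_SIZE is 256 * 1024 * 8 = 2^21 bits, one bit per
target page, so a single block covers 2^21 * 4 KiB = 8 GiB of guest
RAM while its bitmap occupies 2^21 / 8 = 256 KiB of host memory per
dirty-memory client.
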
> +
> typedef struct RAMList {
> QemuMutex mutex;
> - /* Protected by the iothread lock. */
> - unsigned long *dirty_memory[DIRTY_MEMORY_NUM];
> RAMBlock *mru_block;
> /* RCU-enabled, writes protected by the ramlist lock. */
> QLIST_HEAD(, RAMBlock) blocks;
> + DirtyMemoryBlocks *dirty_memory[DIRTY_MEMORY_NUM];
> uint32_t version;
> } RAMList;
> extern RAMList ram_list;
> @@ -89,30 +119,70 @@ static inline bool cpu_physical_memory_get_dirty(ram_addr_t start,
> ram_addr_t length,
> unsigned client)
> {
> - unsigned long end, page, next;
> + DirtyMemoryBlocks *blocks;
> + unsigned long end, page;
> + bool dirty = false;
>
> assert(client < DIRTY_MEMORY_NUM);
>
> end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
> page = start >> TARGET_PAGE_BITS;
> - next = find_next_bit(ram_list.dirty_memory[client], end, page);
>
> - return next < end;
> + rcu_read_lock();
> +
> + blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
> +
> + while (page < end) {
> + unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
> + unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
> + unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
> +
> + if (find_next_bit(blocks->blocks[idx], offset, num) < num) {
> + dirty = true;
> + break;
> + }
> +
> + page += num;
> + }
> +
> + rcu_read_unlock();
> +
> + return dirty;
> }
>
> static inline bool cpu_physical_memory_all_dirty(ram_addr_t start,
> ram_addr_t length,
> unsigned client)
> {
> - unsigned long end, page, next;
> + DirtyMemoryBlocks *blocks;
> + unsigned long end, page;
> + bool dirty = true;
>
> assert(client < DIRTY_MEMORY_NUM);
>
> end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
> page = start >> TARGET_PAGE_BITS;
> - next = find_next_zero_bit(ram_list.dirty_memory[client], end, page);
>
> - return next >= end;
> + rcu_read_lock();
> +
> + blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
> +
> + while (page < end) {
> + unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
> + unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
> + unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
> +
> + if (find_next_zero_bit(blocks->blocks[idx], offset, num) < num) {
> + dirty = false;
> + break;
> + }
> +
> + page += num;
> + }
> +
> + rcu_read_unlock();
> +
> + return dirty;
> }
>
> static inline bool cpu_physical_memory_get_dirty_flag(ram_addr_t addr,
> @@ -154,16 +224,31 @@ static inline uint8_t cpu_physical_memory_range_includes_clean(ram_addr_t start,
> static inline void cpu_physical_memory_set_dirty_flag(ram_addr_t addr,
> unsigned client)
> {
> + unsigned long page, idx, offset;
> + DirtyMemoryBlocks *blocks;
> +
> assert(client < DIRTY_MEMORY_NUM);
> - set_bit_atomic(addr >> TARGET_PAGE_BITS, ram_list.dirty_memory[client]);
> +
> + page = addr >> TARGET_PAGE_BITS;
> + idx = page / DIRTY_MEMORY_BLOCK_SIZE;
> + offset = page % DIRTY_MEMORY_BLOCK_SIZE;
> +
> + rcu_read_lock();
> +
> + blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
> +
> + set_bit_atomic(offset, blocks->blocks[idx]);
> +
> + rcu_read_unlock();
> }
>
> static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start,
> ram_addr_t length,
> uint8_t mask)
> {
> + DirtyMemoryBlocks *blocks[DIRTY_MEMORY_NUM];
> unsigned long end, page;
> - unsigned long **d = ram_list.dirty_memory;
> + int i;
>
> if (!mask && !xen_enabled()) {
> return;
> @@ -171,15 +256,36 @@ static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start,
>
> end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
> page = start >> TARGET_PAGE_BITS;
> - if (likely(mask & (1 << DIRTY_MEMORY_MIGRATION))) {
> - bitmap_set_atomic(d[DIRTY_MEMORY_MIGRATION], page, end - page);
> - }
> - if (unlikely(mask & (1 << DIRTY_MEMORY_VGA))) {
> - bitmap_set_atomic(d[DIRTY_MEMORY_VGA], page, end - page);
> +
> + rcu_read_lock();
> +
> + for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
> + blocks[i] = atomic_rcu_read(&ram_list.dirty_memory[i]);
> }
> - if (unlikely(mask & (1 << DIRTY_MEMORY_CODE))) {
> - bitmap_set_atomic(d[DIRTY_MEMORY_CODE], page, end - page);
> +
> + while (page < end) {
> + unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
> + unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
> + unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
> +
> + if (likely(mask & (1 << DIRTY_MEMORY_MIGRATION))) {
> + bitmap_set_atomic(blocks[DIRTY_MEMORY_MIGRATION]->blocks[idx],
> + offset, num);
> + }
> + if (unlikely(mask & (1 << DIRTY_MEMORY_VGA))) {
> + bitmap_set_atomic(blocks[DIRTY_MEMORY_VGA]->blocks[idx],
> + offset, num);
> + }
> + if (unlikely(mask & (1 << DIRTY_MEMORY_CODE))) {
> + bitmap_set_atomic(blocks[DIRTY_MEMORY_CODE]->blocks[idx],
> + offset, num);
> + }
> +
> + page += num;
> }
> +
> + rcu_read_unlock();
> +
> xen_modified_memory(start, length);
> }
>
> @@ -199,21 +305,41 @@ static inline void cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
> /* start address is aligned at the start of a word? */
> if ((((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) &&
> (hpratio == 1)) {
> + unsigned long **blocks[DIRTY_MEMORY_NUM];
> + unsigned long idx;
> + unsigned long offset;
> long k;
> long nr = BITS_TO_LONGS(pages);
>
> + idx = (start >> TARGET_PAGE_BITS) / DIRTY_MEMORY_BLOCK_SIZE;
> + offset = BIT_WORD((start >> TARGET_PAGE_BITS) %
> + DIRTY_MEMORY_BLOCK_SIZE);
> +
> + rcu_read_lock();
> +
> + for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
> + blocks[i] = atomic_rcu_read(&ram_list.dirty_memory[i])->blocks;
> + }
> +
> for (k = 0; k < nr; k++) {
> if (bitmap[k]) {
> unsigned long temp = leul_to_cpu(bitmap[k]);
> - unsigned long **d = ram_list.dirty_memory;
>
> - atomic_or(&d[DIRTY_MEMORY_MIGRATION][page + k], temp);
> - atomic_or(&d[DIRTY_MEMORY_VGA][page + k], temp);
> + atomic_or(&blocks[DIRTY_MEMORY_MIGRATION][idx][offset], temp);
> + atomic_or(&blocks[DIRTY_MEMORY_VGA][idx][offset], temp);
> if (tcg_enabled()) {
> - atomic_or(&d[DIRTY_MEMORY_CODE][page + k], temp);
> + atomic_or(&blocks[DIRTY_MEMORY_CODE][idx][offset], temp);
> }
> }
> +
> + if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
> + offset = 0;
> + idx++;
> + }
> }
> +
> + rcu_read_unlock();
> +
> xen_modified_memory(start, pages << TARGET_PAGE_BITS);
> } else {
> uint8_t clients = tcg_enabled() ? DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE;
> @@ -265,18 +391,33 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(unsigned long *dest,
> if (((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) {
> int k;
> int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS);
> - unsigned long *src = ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION];
> + unsigned long * const *src;
> + unsigned long idx = (page * BITS_PER_LONG) / DIRTY_MEMORY_BLOCK_SIZE;
> + unsigned long offset = BIT_WORD((page * BITS_PER_LONG) %
> + DIRTY_MEMORY_BLOCK_SIZE);
> +
> + rcu_read_lock();
> +
> + src = atomic_rcu_read(
> + &ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION])->blocks;
>
> for (k = page; k < page + nr; k++) {
> - if (src[k]) {
> - unsigned long bits = atomic_xchg(&src[k], 0);
> + if (src[idx][offset]) {
> + unsigned long bits = atomic_xchg(&src[idx][offset], 0);
> unsigned long new_dirty;
> new_dirty = ~dest[k];
> dest[k] |= bits;
> new_dirty &= bits;
> num_dirty += ctpopl(new_dirty);
> }
> +
> + if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
> + offset = 0;
> + idx++;
> + }
> }
> +
> + rcu_read_unlock();
> } else {
> for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) {
> if (cpu_physical_memory_test_and_clear_dirty(
> diff --git a/migration/ram.c b/migration/ram.c
> index 3cdfea4..96c749f 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -609,7 +609,6 @@ static void migration_bitmap_sync_init(void)
> iterations_prev = 0;
> }
>
> -/* Called with iothread lock held, to protect ram_list.dirty_memory[] */
> static void migration_bitmap_sync(void)
> {
> RAMBlock *block;
> @@ -1921,8 +1920,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
> acct_clear();
> }
>
> - /* iothread lock needed for ram_list.dirty_memory[] */
> - qemu_mutex_lock_iothread();
> qemu_mutex_lock_ramlist();
> rcu_read_lock();
> bytes_transferred = 0;
> @@ -1947,7 +1944,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
> memory_global_dirty_log_start();
> migration_bitmap_sync();
> qemu_mutex_unlock_ramlist();
> - qemu_mutex_unlock_iothread();
>
> qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
>
>