[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [RFC PATCH RDMA support v1: 5/5] send memory over RDMA
From: |
Orit Wasserman |
Subject: |
Re: [Qemu-devel] [RFC PATCH RDMA support v1: 5/5] send memory over RDMA as blocks are iterated |
Date: |
Thu, 31 Jan 2013 13:04:52 +0200 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20130110 Thunderbird/17.0.2 |
Hi Michael,
I may be missing something here, but why do you need a RAM_SAVE_FLAG_RDMA flag?
You don't do any decoding in the destination.
I would suggest creating a QEMUFileRDMA and moving the write/read code into it.
You can either add a new rdma_buffer QEMUFileOps or add the address to
put_buffer.
You also have some whitespace damage in the beginning of savevm.c.
Regards,
Orit
On 01/29/2013 12:01 AM, address@hidden wrote:
> From: "Michael R. Hines" <address@hidden>
>
>
> Signed-off-by: Michael R. Hines <address@hidden>
> ---
> arch_init.c | 116
> +++++++++++++++++++++++++++++++++++++++--
> include/migration/qemu-file.h | 1 +
> savevm.c | 90 +++++++++++++++++++++++++++-----
> 3 files changed, 189 insertions(+), 18 deletions(-)
>
> diff --git a/arch_init.c b/arch_init.c
> index dada6de..7633fa6 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -42,6 +42,7 @@
> #include "migration/migration.h"
> #include "exec/gdbstub.h"
> #include "hw/smbios.h"
> +#include "qemu/rdma.h"
> #include "exec/address-spaces.h"
> #include "hw/pcspk.h"
> #include "migration/page_cache.h"
> @@ -113,6 +114,7 @@ const uint32_t arch_type = QEMU_ARCH;
> #define RAM_SAVE_FLAG_EOS 0x10
> #define RAM_SAVE_FLAG_CONTINUE 0x20
> #define RAM_SAVE_FLAG_XBZRLE 0x40
> +#define RAM_SAVE_FLAG_RDMA 0x80
>
> #ifdef __ALTIVEC__
> #include <altivec.h>
> @@ -434,6 +436,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
> int bytes_sent = 0;
> MemoryRegion *mr;
> ram_addr_t current_addr;
> + static int not_sent = 1;
>
> if (!block)
> block = QTAILQ_FIRST(&ram_list.blocks);
> @@ -457,23 +460,75 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
> int cont = (block == last_sent_block) ?
> RAM_SAVE_FLAG_CONTINUE : 0;
>
> + current_addr = block->offset + offset;
> p = memory_region_get_ram_ptr(mr) + offset;
>
> /* In doubt sent page as normal */
> bytes_sent = -1;
> - if (is_dup_page(p)) {
> +
> + /*
> + * RFC RDMA: The empirical cost of searching for zero pages here
> + * plus the cost of communicating with the other side
> + * seems to take significantly more time than simply
> + * dumping the page into remote memory.
> + */
> + if (!qemu_rdma_migration_enabled() && is_dup_page(p)) {
> acct_info.dup_pages++;
> bytes_sent = save_block_hdr(f, block, offset, cont,
> RAM_SAVE_FLAG_COMPRESS);
> qemu_put_byte(f, *p);
> bytes_sent += 1;
> + /*
> + * RFC RDMA: Same comment as above. time(run-length encoding)
> + * + time(communication) is too big. RDMA throughput
> tanks
> + * when this feature is enabled. But there's no need
> + * to change the code since the feature is optional.
> + */
> } else if (migrate_use_xbzrle()) {
> - current_addr = block->offset + offset;
> bytes_sent = save_xbzrle_page(f, p, current_addr, block,
> offset, cont, last_stage);
> if (!last_stage) {
> p = get_cached_data(XBZRLE.cache, current_addr);
> }
> + } else if (qemu_rdma_migration_enabled()) {
> + int ret;
> +
> + /*
> + * RFC RDMA: This bad hack was to cause the loop on the
> + * receiving side to break. Comments are welcome
> + * on how to get rid of it.
> + */
> + if (not_sent == 1) {
> + not_sent = 0;
> + bytes_sent = save_block_hdr(f, block, offset,
> + cont, RAM_SAVE_FLAG_RDMA);
> + }
> + acct_info.norm_pages++;
> + /*
> + * use RDMA to send page
> + */
> + if (qemu_rdma_migration_write(&rdma_mdata, current_addr,
> + TARGET_PAGE_SIZE)) {
> + fprintf(stderr, "rdma migration: write error!\n");
> + qemu_file_set_error(f, -EIO);
> + return 0;
> + }
> +
> + /*
> + * do some polling
> + */
> + while (1) {
> + ret = qemu_rdma_migration_poll(&rdma_mdata);
> + if (ret == QEMU_RDMA_MIGRATION_WRID_NONE) {
> + break;
> + }
> + if (ret < 0) {
> + fprintf(stderr, "rdma migration: polling error!\n");
> + qemu_file_set_error(f, -EIO);
> + return 0;
> + }
> + }
> + bytes_sent += TARGET_PAGE_SIZE;
> }
>
> /* XBZRLE overflow or normal page */
> @@ -601,12 +656,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
> return 0;
> }
>
> +
> +int tprate = 1000;
> +
> static int ram_save_iterate(QEMUFile *f, void *opaque)
> {
> int ret;
> int i;
> - int64_t t0;
> - int total_sent = 0;
> + int64_t t0, tp0;
> + int total_sent = 0, last_total_sent = 0;
>
> qemu_mutex_lock_ramlist();
>
> @@ -625,23 +683,55 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
> break;
> }
> total_sent += bytes_sent;
> + last_total_sent += bytes_sent;
> acct_info.iterations++;
> /* we want to check in the 1st loop, just in case it was the 1st time
> and we had to sync the dirty bitmap.
> qemu_get_clock_ns() is a bit expensive, so we only check each some
> iterations
> */
> +
> + /*
> + * RFC RDMA: Can we have something like this to periodically print
> + * out throughput.
> + * This is just a rough-sketch that partially worked for me.
> + * I assume there a better way that everyone would prefer.
> + * Perhaps we could set a QMP command that toggled a "periodic
> printing"
> + * option that allowed more details to be printed on stdout.....?
> + */
> if ((i & 63) == 0) {
> - uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
> + uint64_t curr = qemu_get_clock_ns(rt_clock);
> + uint64_t t1 = (curr - t0) / 1000000;
> + double tp;
> if (t1 > MAX_WAIT) {
> DPRINTF("big wait: %" PRIu64 " milliseconds, %d
> iterations\n",
> t1, i);
> break;
> }
> +
> + if ((i % tprate) == 0) {
> + uint64_t tp1 = (curr - tp0) / 1000000;
> + tp = ((double) last_total_sent * 8.0 /
> + ((double) tp1 / 1000.0)) / 1000.0 / 1000.0;
> + printf("throughput: %f mbps\n", tp);
> + last_total_sent = 0;
> + tp0 = curr;
> + }
> }
> i++;
> }
>
> + /* flush buffer write */
> + if (qemu_rdma_migration_enabled()) {
> + int resp;
> + resp = qemu_rdma_migration_write_flush(&rdma_mdata);
> + if (resp < 0) {
> + fprintf(stderr, "rdma migration: write flush error!\n");
> + qemu_file_set_error(f, -EIO);
> + return 0;
> + }
> + }
> +
> qemu_mutex_unlock_ramlist();
>
> if (ret < 0) {
> @@ -863,6 +953,22 @@ static int ram_load(QEMUFile *f, void *opaque, int
> version_id)
> ret = -EINVAL;
> goto done;
> }
> + } else if (flags & RAM_SAVE_FLAG_RDMA) {
> + /*
> + * RFC RDMA: This bad hack was to cause the loop break.
> + * Comments are welcome on how to get rid of it.
> + * Communicating here is unnecessary because the
> + * RDMA page has already arrived.
> + * Comments are welcome on how to get rif of this.
> + */
> + if (!qemu_rdma_migration_enabled()) {
> + return -EINVAL;
> + }
> + void *host = host_from_stream_offset(f, addr, flags);
> + if (!host) {
> + return -EINVAL;
> + }
> + /* rdma page is already here, nothing to do */
> }
> error = qemu_file_get_error(f);
> if (error) {
> diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
> index 68deefb..7c9968e 100644
> --- a/include/migration/qemu-file.h
> +++ b/include/migration/qemu-file.h
> @@ -112,6 +112,7 @@ int qemu_file_rate_limit(QEMUFile *f);
> int64_t qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate);
> int64_t qemu_file_get_rate_limit(QEMUFile *f);
> int qemu_file_get_error(QEMUFile *f);
> +void qemu_file_set_error(QEMUFile *f, int ret);
>
> static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
> {
> diff --git a/savevm.c b/savevm.c
> index 304d1ef..071196e 100644
> --- a/savevm.c
> +++ b/savevm.c
> @@ -24,6 +24,7 @@
>
> #include "config-host.h"
> #include "qemu-common.h"
> +#include "qemu/rdma.h"
> #include "hw/hw.h"
> #include "hw/qdev.h"
> #include "net/net.h"
> @@ -50,7 +51,7 @@
> #define ARP_OP_REQUEST_REV 0x3
>
> static int announce_self_create(uint8_t *buf,
> - uint8_t *mac_addr)
> + uint8_t *mac_addr)
> {
> /* Ethernet header. */
> memset(buf, 0xff, 6); /* destination MAC addr */
> @@ -97,16 +98,16 @@ static void qemu_announce_self_once(void *opaque)
> qemu_mod_timer(timer, qemu_get_clock_ms(rt_clock) +
> 50 + (SELF_ANNOUNCE_ROUNDS - count - 1) * 100);
> } else {
> - qemu_del_timer(timer);
> - qemu_free_timer(timer);
> + qemu_del_timer(timer);
> + qemu_free_timer(timer);
> }
> }
>
> void qemu_announce_self(void)
> {
> - static QEMUTimer *timer;
> - timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer);
> - qemu_announce_self_once(&timer);
> + static QEMUTimer *timer;
> + timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer);
> + qemu_announce_self_once(&timer);
> }
>
> /***********************************************************/
> @@ -299,8 +300,8 @@ QEMUFile *qemu_fdopen(int fd, const char *mode)
> QEMUFileStdio *s;
>
> if (mode == NULL ||
> - (mode[0] != 'r' && mode[0] != 'w') ||
> - mode[1] != 'b' || mode[2] != 0) {
> + (mode[0] != 'r' && mode[0] != 'w') ||
> + mode[1] != 'b' || mode[2] != 0) {
> fprintf(stderr, "qemu_fdopen: Argument validity check failed\n");
> return NULL;
> }
> @@ -342,8 +343,8 @@ QEMUFile *qemu_fopen(const char *filename, const char
> *mode)
> QEMUFileStdio *s;
>
> if (mode == NULL ||
> - (mode[0] != 'r' && mode[0] != 'w') ||
> - mode[1] != 'b' || mode[2] != 0) {
> + (mode[0] != 'r' && mode[0] != 'w') ||
> + mode[1] != 'b' || mode[2] != 0) {
> fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
> return NULL;
> }
> @@ -417,7 +418,7 @@ int qemu_file_get_error(QEMUFile *f)
> return f->last_error;
> }
>
> -static void qemu_file_set_error(QEMUFile *f, int ret)
> +void qemu_file_set_error(QEMUFile *f, int ret)
> {
> if (f->last_error == 0) {
> f->last_error = ret;
> @@ -1613,6 +1614,7 @@ int qemu_savevm_state_iterate(QEMUFile *f)
> {
> SaveStateEntry *se;
> int ret = 1;
> + static int first_time = 1;
>
> QTAILQ_FOREACH(se, &savevm_handlers, entry) {
> if (!se->ops || !se->ops->save_live_iterate) {
> @@ -1643,8 +1645,36 @@ int qemu_savevm_state_iterate(QEMUFile *f)
> }
> }
> if (ret != 0) {
> +#ifdef QEMU_RDMA_MIGRATION_EXTRA_SYNC
> + /*
> + * We use two "sync" infiniband messages happen during migration.
> + * One at the beginning and one at the end, just to be thorough.
> + * This is the first one.
> + */
> + if (first_time && qemu_rdma_migration_enabled()) {
> + int r;
> + first_time = 0;
> + if (qemu_rdma_migration_post_send_sync(&rdma_mdata,
> + QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC)) {
> + fprintf(stderr,
> + "rdma migration: error posting extra send sync!\n");
> + return -EIO;
> + }
> +
> + r = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
> + QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC);
> + if (r < 0) {
> + fprintf(stderr,
> + "rdma migration: qemu_savevm_state_iterate"
> + " sync polling error!\n");
> + return -EIO;
> + }
> + }
> +#endif
> +
> return ret;
> }
> +
> ret = qemu_file_get_error(f);
> if (ret != 0) {
> qemu_savevm_state_cancel();
> @@ -1684,7 +1714,7 @@ int qemu_savevm_state_complete(QEMUFile *f)
> int len;
>
> if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
> - continue;
> + continue;
> }
> trace_savevm_section_start();
> /* Section type */
> @@ -1703,8 +1733,32 @@ int qemu_savevm_state_complete(QEMUFile *f)
> trace_savevm_section_end(se->section_id);
> }
>
> + /*
> + * We use two "sync" infiniband messages happen during migration.
> + * One at the beginning and one at the end, just to be thorough.
> + * This is the second one.
> + */
> + if (qemu_rdma_migration_enabled()) {
> + if (qemu_rdma_migration_post_send_sync(&rdma_mdata,
> + QEMU_RDMA_MIGRATION_WRID_SEND_SYNC)) {
> + fprintf(stderr, "rdma migration: error posting send sync!\n");
> + return -EIO;
> + }
> + }
> +
> qemu_put_byte(f, QEMU_VM_EOF);
>
> + /* wait for RDMA sync message to complete */
> + if (qemu_rdma_migration_enabled()) {
> + int ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
> + QEMU_RDMA_MIGRATION_WRID_SEND_SYNC);
> + if (ret < 0) {
> + fprintf(stderr, "rdma migration: qemu_savevm_state_full"
> + " sync polling error!\n");
> + return -EIO;
> + }
> + }
> +
> return qemu_file_get_error(f);
> }
>
> @@ -2014,8 +2068,18 @@ int qemu_loadvm_state(QEMUFile *f)
>
> cpu_synchronize_all_post_init();
>
> - ret = 0;
> + /* wait for RDMA sync message */
> + if (qemu_rdma_migration_enabled()) {
> + ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
> + QEMU_RDMA_MIGRATION_WRID_RECV_SYNC);
> + if (ret < 0) {
> + fprintf(stderr, "rdma migration: qemu_loadvm_state_no_header"
> + " sync polling error!\n");
> + goto out;
> + }
> + }
>
> + ret = 0;
> out:
> QLIST_FOREACH_SAFE(le, &loadvm_handlers, entry, new_le) {
> QLIST_REMOVE(le, entry);
>
Re: [Qemu-devel] [RFC PATCH RDMA support v1: 1/5] add openfabrics RDMA libraries and base RDMA code to build, Andreas Färber, 2013/01/29
Re: [Qemu-devel] [RFC PATCH RDMA support v1: 1/5] add openfabrics RDMA libraries and base RDMA code to build, Stefan Hajnoczi, 2013/01/30