Re: [Qemu-devel] [RFC PATCH RDMA support v1: 5/5] send memory over RDMA as blocks are iterated


From: Michael R. Hines
Subject: Re: [Qemu-devel] [RFC PATCH RDMA support v1: 5/5] send memory over RDMA as blocks are iterated
Date: Thu, 31 Jan 2013 10:08:34 -0500
User-agent: Mozilla/5.0 (X11; Linux i686; rv:17.0) Gecko/20130106 Thunderbird/17.0.2

Yes, I was hoping for a comment about this in particular (before I send out another patchset with the proper cover letter and so forth).

So here's the problem I was experiencing inside savevm.c, qemu_loadvm_state():

I was having a little trouble serializing the client/server protocol in my brain.

When the server side begins loading pages into RAM, qemu_loadvm_state() goes into a tight loop waiting for each memory page to be transmitted, one by one.

Now, for RDMA, this loop is not necessary, but it was stuck waiting for TCP messages that did not need to be sent. So the extra flag you saw was a hack to break out of the loop.

.... but according to you, I should bypass this loop entirely?
Should I write a brand-new function in savevm.c that skips it?
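
For concreteness, the bypass I'd picture is basically hoisting the sync wait that this patch already does at the end of qemu_loadvm_state() up to where the section loop would otherwise spin. An untested sketch, using the same helpers as in the patch:

    /* Sketch only: with RDMA the pages land directly in guest memory,
     * so rather than looping over per-page messages we would just
     * wait for the final sync from the source and carry on. */
    if (qemu_rdma_migration_enabled()) {
        ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
                    QEMU_RDMA_MIGRATION_WRID_RECV_SYNC);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: sync polling error!\n");
            goto out;
        }
    }

Is that the direction you had in mind, or something else entirely?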

Also, I have a question about the QEMUFileRDMA: since RDMA does not require a file-like abstraction of any kind (meaning there is no explicit handshaking during an RDMA transfer), should I really create one? Unlike the sockets and snapshot files that a typical migration needs, an RDMA-based migration doesn't operate through a byte stream at all.
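
If the answer is yes, here is roughly what I would picture -- just an untested sketch of the QEMUFileOps hookup, where QEMUFileRDMA and RDMAData are placeholder names for whatever the real types end up being, and the address rides in put_buffer's 'pos' argument as you suggested:

    typedef struct QEMUFileRDMA {
        RDMAData *rdma;     /* connection state from qemu/rdma.h */
    } QEMUFileRDMA;

    /* 'buf' goes unused: the data is not streamed, the (pos, size)
     * pair is handed to the RDMA layer for a zero-copy write. */
    static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
                                    int64_t pos, int size)
    {
        QEMUFileRDMA *r = opaque;

        if (qemu_rdma_migration_write(r->rdma, pos, size) < 0) {
            return -EIO;
        }
        return size;
    }

    static const QEMUFileOps rdma_write_ops = {
        .put_buffer = qemu_rdma_put_buffer,
    };

The migration code would then open it with qemu_fopen_ops(r, &rdma_write_ops) and never touch a socket for the RAM pages. Does that match what you meant?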

Thanks for all the comments ....... keep them coming =)

- Michael


On 01/31/2013 06:04 AM, Orit Wasserman wrote:
Hi Michael,
I may be missing something here, but why do you need a RAM_SAVE_FLAG_RDMA flag?
You don't do any decoding on the destination.

I would suggest creating a QEMUFileRDMA and moving the write/read code
into it. You can either add a new rdma_buffer QEMUFileOps callback or
add the address to put_buffer.

You also have some whitespace damage at the beginning of savevm.c.

Regards,
Orit

On 01/29/2013 12:01 AM, address@hidden wrote:
From: "Michael R. Hines" <address@hidden>


Signed-off-by: Michael R. Hines <address@hidden>
---
  arch_init.c                   |  116 +++++++++++++++++++++++++++++++++++++++--
  include/migration/qemu-file.h |    1 +
  savevm.c                      |   90 +++++++++++++++++++++++++++-----
  3 files changed, 189 insertions(+), 18 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index dada6de..7633fa6 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -42,6 +42,7 @@
  #include "migration/migration.h"
  #include "exec/gdbstub.h"
  #include "hw/smbios.h"
+#include "qemu/rdma.h"
  #include "exec/address-spaces.h"
  #include "hw/pcspk.h"
  #include "migration/page_cache.h"
@@ -113,6 +114,7 @@ const uint32_t arch_type = QEMU_ARCH;
  #define RAM_SAVE_FLAG_EOS      0x10
  #define RAM_SAVE_FLAG_CONTINUE 0x20
  #define RAM_SAVE_FLAG_XBZRLE   0x40
+#define RAM_SAVE_FLAG_RDMA     0x80

#ifdef __ALTIVEC__
  #include <altivec.h>
@@ -434,6 +436,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
      int bytes_sent = 0;
      MemoryRegion *mr;
      ram_addr_t current_addr;
+    static int not_sent = 1;

    if (!block)
        block = QTAILQ_FIRST(&ram_list.blocks);
@@ -457,23 +460,75 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
              int cont = (block == last_sent_block) ?
                  RAM_SAVE_FLAG_CONTINUE : 0;

+            current_addr = block->offset + offset;
              p = memory_region_get_ram_ptr(mr) + offset;

            /* In doubt sent page as normal */
              bytes_sent = -1;
-            if (is_dup_page(p)) {
+
+            /*
+             * RFC RDMA: The empirical cost of searching for zero pages here
+             *           plus the cost of communicating with the other side
+             *           seems to take significantly more time than simply
+             *           dumping the page into remote memory.
+             */
+            if (!qemu_rdma_migration_enabled() && is_dup_page(p)) {
                  acct_info.dup_pages++;
                  bytes_sent = save_block_hdr(f, block, offset, cont,
                                              RAM_SAVE_FLAG_COMPRESS);
                  qemu_put_byte(f, *p);
                  bytes_sent += 1;
+            /*
+             * RFC RDMA: Same comment as above. time(run-length encoding)
+             *           + time(communication) is too big. RDMA throughput tanks
+             *           when this feature is enabled. But there's no need
+             *           to change the code since the feature is optional.
+             */
              } else if (migrate_use_xbzrle()) {
-                current_addr = block->offset + offset;
                  bytes_sent = save_xbzrle_page(f, p, current_addr, block,
                                                offset, cont, last_stage);
                  if (!last_stage) {
                      p = get_cached_data(XBZRLE.cache, current_addr);
                  }
+            } else if (qemu_rdma_migration_enabled()) {
+                int ret;
+
+                /*
+                 * RFC RDMA: This bad hack was to cause the loop on the
+                 *           receiving side to break. Comments are welcome
+                 *           on how to get rid of it.
+                 */
+                if (not_sent == 1) {
+                        not_sent = 0;
+                        bytes_sent = save_block_hdr(f, block, offset,
+                                                cont, RAM_SAVE_FLAG_RDMA);
+                }
+                acct_info.norm_pages++;
+                /*
+                 * use RDMA to send page
+                 */
+                if (qemu_rdma_migration_write(&rdma_mdata, current_addr,
+                        TARGET_PAGE_SIZE)) {
+                    fprintf(stderr, "rdma migration: write error!\n");
+                    qemu_file_set_error(f, -EIO);
+                    return 0;
+                }
+
+                /*
+                 * do some polling
+                 */
+                while (1) {
+                    ret = qemu_rdma_migration_poll(&rdma_mdata);
+                    if (ret == QEMU_RDMA_MIGRATION_WRID_NONE) {
+                        break;
+                    }
+                    if (ret < 0) {
+                        fprintf(stderr, "rdma migration: polling error!\n");
+                        qemu_file_set_error(f, -EIO);
+                        return 0;
+                    }
+                }
+                bytes_sent += TARGET_PAGE_SIZE;
              }

            /* XBZRLE overflow or normal page */
@@ -601,12 +656,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
      return 0;
  }
+
+int tprate = 1000;
+
  static int ram_save_iterate(QEMUFile *f, void *opaque)
  {
      int ret;
      int i;
-    int64_t t0;
-    int total_sent = 0;
+    int64_t t0, tp0;
+    int total_sent = 0, last_total_sent = 0;
    qemu_mutex_lock_ramlist();

@@ -625,23 +683,55 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
              break;
          }
          total_sent += bytes_sent;
+        last_total_sent += bytes_sent;
          acct_info.iterations++;
          /* we want to check in the 1st loop, just in case it was the 1st time
             and we had to sync the dirty bitmap.
             qemu_get_clock_ns() is a bit expensive, so we only check each some
             iterations
          */
+
+        /*
+         * RFC RDMA: Can we have something like this to periodically print
+         * out throughput?
+         * This is just a rough sketch that partially worked for me.
+         * I assume there is a better way that everyone would prefer.
+         * Perhaps we could add a QMP command that toggles a "periodic
+         * printing" option, allowing more details to be printed on stdout?
+         */
          if ((i & 63) == 0) {
-            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
+            uint64_t curr = qemu_get_clock_ns(rt_clock);
+            uint64_t t1 = (curr - t0) / 1000000;
+            double tp;
              if (t1 > MAX_WAIT) {
                  DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                          t1, i);
                  break;
              }
+
+            if ((i % tprate) == 0) {
+                uint64_t tp1 = (curr - tp0) / 1000000;
+                tp = ((double) last_total_sent * 8.0 /
+                               ((double) tp1 / 1000.0)) / 1000.0 / 1000.0;
+                printf("throughput: %f mbps\n", tp);
+                last_total_sent = 0;
+                tp0 = curr;
+            }
          }
          i++;
      }

+    /* flush buffer write */
+    if (qemu_rdma_migration_enabled()) {
+        int resp;
+        resp = qemu_rdma_migration_write_flush(&rdma_mdata);
+        if (resp < 0) {
+            fprintf(stderr, "rdma migration: write flush error!\n");
+            qemu_file_set_error(f, -EIO);
+            return 0;
+        }
+    }
+
      qemu_mutex_unlock_ramlist();

    if (ret < 0) {
@@ -863,6 +953,22 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
                  ret = -EINVAL;
                  goto done;
              }
+        } else if (flags & RAM_SAVE_FLAG_RDMA) {
+            /*
+             * RFC RDMA: This bad hack was to cause the receiving loop to
+             *           break. Communicating here is unnecessary because
+             *           the RDMA page has already arrived.
+             *           Comments are welcome on how to get rid of it.
+             */
+            if (!qemu_rdma_migration_enabled()) {
+                return -EINVAL;
+            }
+            void *host = host_from_stream_offset(f, addr, flags);
+            if (!host) {
+                return -EINVAL;
+            }
+            /* rdma page is already here, nothing to do */
          }
          error = qemu_file_get_error(f);
          if (error) {
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 68deefb..7c9968e 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -112,6 +112,7 @@ int qemu_file_rate_limit(QEMUFile *f);
  int64_t qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate);
  int64_t qemu_file_get_rate_limit(QEMUFile *f);
  int qemu_file_get_error(QEMUFile *f);
+void qemu_file_set_error(QEMUFile *f, int ret);

static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
  {
diff --git a/savevm.c b/savevm.c
index 304d1ef..071196e 100644
--- a/savevm.c
+++ b/savevm.c
@@ -24,6 +24,7 @@
#include "config-host.h"
  #include "qemu-common.h"
+#include "qemu/rdma.h"
  #include "hw/hw.h"
  #include "hw/qdev.h"
  #include "net/net.h"
@@ -50,7 +51,7 @@
  #define ARP_OP_REQUEST_REV 0x3

static int announce_self_create(uint8_t *buf,
-                               uint8_t *mac_addr)
+                                uint8_t *mac_addr)
  {
      /* Ethernet header. */
      memset(buf, 0xff, 6);         /* destination MAC addr */
@@ -97,16 +98,16 @@ static void qemu_announce_self_once(void *opaque)
          qemu_mod_timer(timer, qemu_get_clock_ms(rt_clock) +
                         50 + (SELF_ANNOUNCE_ROUNDS - count - 1) * 100);
      } else {
-           qemu_del_timer(timer);
-           qemu_free_timer(timer);
+            qemu_del_timer(timer);
+            qemu_free_timer(timer);
      }
  }

void qemu_announce_self(void)
  {
-       static QEMUTimer *timer;
-       timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer);
-       qemu_announce_self_once(&timer);
+        static QEMUTimer *timer;
+        timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer);
+        qemu_announce_self_once(&timer);
  }

/***********************************************************/
@@ -299,8 +300,8 @@ QEMUFile *qemu_fdopen(int fd, const char *mode)
      QEMUFileStdio *s;

    if (mode == NULL ||
-       (mode[0] != 'r' && mode[0] != 'w') ||
-       mode[1] != 'b' || mode[2] != 0) {
+        (mode[0] != 'r' && mode[0] != 'w') ||
+        mode[1] != 'b' || mode[2] != 0) {
          fprintf(stderr, "qemu_fdopen: Argument validity check failed\n");
          return NULL;
      }
@@ -342,8 +343,8 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode)
      QEMUFileStdio *s;

    if (mode == NULL ||
-       (mode[0] != 'r' && mode[0] != 'w') ||
-       mode[1] != 'b' || mode[2] != 0) {
+        (mode[0] != 'r' && mode[0] != 'w') ||
+        mode[1] != 'b' || mode[2] != 0) {
          fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
          return NULL;
      }
@@ -417,7 +418,7 @@ int qemu_file_get_error(QEMUFile *f)
      return f->last_error;
  }

-static void qemu_file_set_error(QEMUFile *f, int ret)
+void qemu_file_set_error(QEMUFile *f, int ret)
  {
      if (f->last_error == 0) {
          f->last_error = ret;
@@ -1613,6 +1614,7 @@ int qemu_savevm_state_iterate(QEMUFile *f)
  {
      SaveStateEntry *se;
      int ret = 1;
+    static int first_time = 1;

    QTAILQ_FOREACH(se, &savevm_handlers, entry) {
          if (!se->ops || !se->ops->save_live_iterate) {
@@ -1643,8 +1645,36 @@ int qemu_savevm_state_iterate(QEMUFile *f)
          }
      }
      if (ret != 0) {
+#ifdef QEMU_RDMA_MIGRATION_EXTRA_SYNC
+        /*
+         * We send two "sync" InfiniBand messages during migration.
+         * One at the beginning and one at the end, just to be thorough.
+         * This is the first one.
+         */
+        if (first_time && qemu_rdma_migration_enabled()) {
+            int r;
+            first_time = 0;
+            if (qemu_rdma_migration_post_send_sync(&rdma_mdata,
+                        QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC)) {
+                fprintf(stderr,
+                        "rdma migration: error posting extra send sync!\n");
+                return -EIO;
+            }
+
+            r = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
+                        QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC);
+            if (r < 0) {
+                fprintf(stderr,
+                        "rdma migration: qemu_savevm_state_iterate"
+                        " sync polling error!\n");
+                return -EIO;
+            }
+        }
+#endif
+
          return ret;
      }
+
      ret = qemu_file_get_error(f);
      if (ret != 0) {
          qemu_savevm_state_cancel();
@@ -1684,7 +1714,7 @@ int qemu_savevm_state_complete(QEMUFile *f)
          int len;

        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
-           continue;
+            continue;
          }
          trace_savevm_section_start();
          /* Section type */
@@ -1703,8 +1733,32 @@ int qemu_savevm_state_complete(QEMUFile *f)
          trace_savevm_section_end(se->section_id);
      }

+    /*
+     * We send two "sync" InfiniBand messages during migration.
+     * One at the beginning and one at the end, just to be thorough.
+     * This is the second one.
+     */
+    if (qemu_rdma_migration_enabled()) {
+        if (qemu_rdma_migration_post_send_sync(&rdma_mdata,
+                    QEMU_RDMA_MIGRATION_WRID_SEND_SYNC)) {
+            fprintf(stderr, "rdma migration: error posting send sync!\n");
+            return -EIO;
+        }
+    }
+
      qemu_put_byte(f, QEMU_VM_EOF);

+    /* wait for RDMA sync message to complete */
+    if (qemu_rdma_migration_enabled()) {
+        int ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
+                QEMU_RDMA_MIGRATION_WRID_SEND_SYNC);
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: qemu_savevm_state_full"
+                            " sync polling error!\n");
+            return -EIO;
+        }
+    }
+
      return qemu_file_get_error(f);
  }
@@ -2014,8 +2068,18 @@ int qemu_loadvm_state(QEMUFile *f)

    cpu_synchronize_all_post_init();

-    ret = 0;
+    /* wait for RDMA sync message */
+    if (qemu_rdma_migration_enabled()) {
+        ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
+                QEMU_RDMA_MIGRATION_WRID_RECV_SYNC);
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: qemu_loadvm_state_no_header"
+                            " sync polling error!\n");
+            goto out;
+        }
+    }

+    ret = 0;
  out:
      QLIST_FOREACH_SAFE(le, &loadvm_handlers, entry, new_le) {
          QLIST_REMOVE(le, entry);





