qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [RFC PATCH RDMA support v2: 6/6] send memory over RDMA


From: Michael R. Hines
Subject: Re: [Qemu-devel] [RFC PATCH RDMA support v2: 6/6] send memory over RDMA as blocks are iterated
Date: Wed, 13 Feb 2013 08:49:38 -0500
User-agent: Mozilla/5.0 (X11; Linux i686; rv:17.0) Gecko/20130106 Thunderbird/17.0.2

Very helpful, thank you. I'll implement these and all the other ones soon.

- Michael

On 02/13/2013 04:50 AM, Orit Wasserman wrote:
On 02/12/2013 12:49 AM, Michael R. Hines wrote:
From: "Michael R. Hines" <address@hidden>


Signed-off-by: Michael R. Hines <address@hidden>
---
  arch_init.c |   84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
  savevm.c    |   59 ++++++++++++++++++++++++++++++++++++++++-
  2 files changed, 139 insertions(+), 4 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index dada6de..76092cc 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -42,6 +42,7 @@
  #include "migration/migration.h"
  #include "exec/gdbstub.h"
  #include "hw/smbios.h"
+#include "qemu/rdma.h"
  #include "exec/address-spaces.h"
  #include "hw/pcspk.h"
  #include "migration/page_cache.h"
@@ -170,6 +171,15 @@ static int is_dup_page(uint8_t *page)
      VECTYPE val = SPLAT(page);
      int i;
+ /*
+     * RFC RDMA: The empirical cost of searching for zero pages here
+     *           plus the cost of communicating with the other side
+     *           seems to take significantly more time than simply
+     *           dumping the page into remote memory.
+     */
+    if (migrate_rdma_enabled())
+        return 0;
+
I would change ram_save_block, not this function.
That is, I would move the if to before the call to is_dup_page.

      for (i = 0; i < TARGET_PAGE_SIZE / sizeof(VECTYPE); i++) {
          if (!ALL_EQ(val, p[i])) {
              return 0;
@@ -282,6 +292,44 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, 
ram_addr_t offset,
      return size;
  }
+static size_t save_rdma_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
+                             int cont)
+{
+    size_t bytes_sent = 0;
+    ram_addr_t current_addr;
+
+    acct_info.norm_pages++;
+
+    /*
+     * use RDMA to send page
+     */
+    current_addr = block->offset + offset;
+    if (rdma_write(&rdma_mdata, current_addr,
+            TARGET_PAGE_SIZE)) {
+        fprintf(stderr, "rdma migration: write error!\n");
+        qemu_file_set_error(f, -EIO);
We lose the error here;
why not let rdma_write set the error itself, by passing it the QEMUFile as a parameter?
+        return 0;
+    }
+
+    /*
+     * do some polling
+     */
+    while (1) {
+        int ret = rdma_poll(&rdma_mdata);
+        if (ret == RDMA_WRID_NONE) {
+            break;
+        }
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: polling error!\n");
+            qemu_file_set_error(f, -EIO);
see above
+            return 0;
+        }
+    }
+
+    bytes_sent += TARGET_PAGE_SIZE;
+    return bytes_sent;
+}
+
  #define ENCODING_FLAG_XBZRLE 0x1
static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
@@ -474,6 +522,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
                  if (!last_stage) {
                      p = get_cached_data(XBZRLE.cache, current_addr);
                  }
+            } else if (migrate_rdma_enabled()) {
+                bytes_sent = save_rdma_page(f, block, offset, cont);
              }
You can move it to the top, and then you won't need to change is_dup_page.
/* XBZRLE overflow or normal page */
@@ -601,12 +651,14 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
      return 0;
  }
+static int tprate = 1000;
+
  static int ram_save_iterate(QEMUFile *f, void *opaque)
  {
      int ret;
      int i;
-    int64_t t0;
-    int total_sent = 0;
+    int64_t t0, tp0;
+    int total_sent = 0, last_total_sent = 0;
qemu_mutex_lock_ramlist(); @@ -625,23 +677,49 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
              break;
          }
          total_sent += bytes_sent;
+        last_total_sent += bytes_sent;
          acct_info.iterations++;
          /* we want to check in the 1st loop, just in case it was the 1st time
             and we had to sync the dirty bitmap.
             qemu_get_clock_ns() is a bit expensive, so we only check each some
             iterations
          */
+
+        /*
+         * RFC RDMA: Can we have something like this to periodically print
+         *           out throughput? This is just a rough-sketch that
+         *           partially worked for me. I assume there is a better way
+         *           that everyone would prefer. Perhaps we could set a QMP
+         *           command that toggled a "periodic printing" option that
+         *           allowed more details to be printed on stdout.....?
+         */
          if ((i & 63) == 0) {
-            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
+            uint64_t curr = qemu_get_clock_ns(rt_clock);
+            uint64_t t1 = (curr - t0) / 1000000;
+            double tp;
              if (t1 > MAX_WAIT) {
                  DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                          t1, i);
                  break;
              }
+
+            if ((i % tprate) == 0) {
+                uint64_t tp1 = (curr - tp0) / 1000000;
+                tp = ((double) last_total_sent * 8.0 /
+                               ((double) tp1 / 1000.0)) / 1000.0 / 1000.0;
+                printf("throughput: %f mbps\n", tp);
DPRINTF please ...
+                last_total_sent = 0;
+                tp0 = curr;
+            }
          }
          i++;
      }
+ if (migrate_rdma_enabled() && rdma_write_flush(&rdma_mdata) < 0) {
+        qemu_file_set_error(f, -EIO);
see above ...
+        return 0;
+    }
+
      qemu_mutex_unlock_ramlist();
if (ret < 0) {
diff --git a/savevm.c b/savevm.c
index 304d1ef..4d0bef3 100644
--- a/savevm.c
+++ b/savevm.c
@@ -24,6 +24,7 @@
#include "config-host.h"
  #include "qemu-common.h"
+#include "qemu/rdma.h"
  #include "hw/hw.h"
  #include "hw/qdev.h"
  #include "net/net.h"
@@ -417,7 +418,7 @@ int qemu_file_get_error(QEMUFile *f)
      return f->last_error;
  }
-static void qemu_file_set_error(QEMUFile *f, int ret)
+void qemu_file_set_error(QEMUFile *f, int ret)
  {
      if (f->last_error == 0) {
          f->last_error = ret;
@@ -1613,6 +1614,7 @@ int qemu_savevm_state_iterate(QEMUFile *f)
  {
      SaveStateEntry *se;
      int ret = 1;
+    static int first_time = 1;
QTAILQ_FOREACH(se, &savevm_handlers, entry) {
          if (!se->ops || !se->ops->save_live_iterate) {
@@ -1643,6 +1645,30 @@ int qemu_savevm_state_iterate(QEMUFile *f)
          }
      }
      if (ret != 0) {
+#ifdef RDMA_EXTRA_SYNC
+        /*
+         * We use two "sync" InfiniBand messages during migration.
+         * One at the beginning and one at the end, just to be thorough.
+         * This is the first one.
+         */
+        if (first_time && migrate_rdma_enabled()) {
Can't this code be moved to qemu_savevm_state_begin, or even better to
ram_save_setup?
+            int r;
+            first_time = 0;
+            if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC)) {
+                fprintf(stderr,
+                        "rdma migration: error posting extra send sync!\n");
+                return -EIO;
Why not return the real error? Please try to give more details on
the actual error, so the user can understand why migration failed.
Also, please update the QEMUFile error too.
+            }
+
+            r = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC);
+            if (r < 0) {
+                fprintf(stderr,
+                        "rdma migration: qemu_savevm_state_iterate"
+                        " sync polling error!\n");
+                return -EIO;
same here ...
+            }
+        }
+#endif
          return ret;
      }
      ret = qemu_file_get_error(f);
@@ -1703,8 +1729,30 @@ int qemu_savevm_state_complete(QEMUFile *f)
          trace_savevm_section_end(se->section_id);
      }
+ /*
+     * We use two "sync" InfiniBand messages during migration.
+     * One at the beginning and one at the end, just to be thorough.
+     * This is the second one.
+     */
+    if (migrate_rdma_enabled()) {
+        if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_SYNC)) {
+            fprintf(stderr, "rdma migration: error posting send sync!\n");
+            return -EIO;
Can't this be moved to ram_save_complete?
Regards,
Orit
+        }
+    }
+
      qemu_put_byte(f, QEMU_VM_EOF);
+ /* wait for RDMA sync message to complete */
+    if (migrate_rdma_enabled()) {
+        int ret = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_SYNC);
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: qemu_savevm_state_full"
+                            " sync polling error!\n");
+            return -EIO;
...
+        }
+     }
+
      return qemu_file_get_error(f);
  }
@@ -2014,6 +2062,15 @@ int qemu_loadvm_state(QEMUFile *f) cpu_synchronize_all_post_init(); + /* wait for RDMA sync message */
+    if (migrate_rdma_enabled()) {
+        ret = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_RECV_SYNC);
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: qemu_loadvm_state_no_header"
+                            " sync polling error!\n");
+            goto out;
+        }
+    }
      ret = 0;
out:





reply via email to

[Prev in Thread] Current Thread [Next in Thread]