qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH COLO-Frame v13 32/39] COLO: Separate the process


From: Hailiang Zhang
Subject: Re: [Qemu-devel] [PATCH COLO-Frame v13 32/39] COLO: Separate the process of saving/loading ram and device state
Date: Sat, 30 Jan 2016 18:23:31 +0800
User-agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Thunderbird/38.5.1

On 2016/1/27 22:14, Dr. David Alan Gilbert wrote:
* zhanghailiang (address@hidden) wrote:
We separate the process of saving/loading ram and device state when doing a
checkpoint,
and we add new helpers for saving/loading ram/device state. With this change, we can
directly transfer ram from master to slave without using QEMUSizedBuffer as an
intermediary, which also reduces the amount of extra memory used during checkpoint.

Besides, we move the colo_flush_ram_cache to the proper position after the
above change.

Signed-off-by: zhanghailiang <address@hidden>
Signed-off-by: Li Zhijian <address@hidden>
---
v13:
- Re-use some existed helper functions to realize saving/loading ram and device.
v11:
- Remove load configuration section in qemu_loadvm_state_begin()
---
  include/sysemu/sysemu.h |  6 +++++
  migration/colo.c        | 47 ++++++++++++++++++++++++++---------
  migration/ram.c         |  5 ----
  migration/savevm.c      | 66 ++++++++++++++++++++++++++++++++++++++++++++++---
  4 files changed, 104 insertions(+), 20 deletions(-)

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 91eeda3..f553f8a 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -133,7 +133,13 @@ void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, 
const char *name,
                                             uint64_t *start_list,
                                             uint64_t *length_list);

+void qemu_savevm_live_state(QEMUFile *f);
+int qemu_save_device_state(QEMUFile *f);
+
  int qemu_loadvm_state(QEMUFile *f);
+int qemu_loadvm_state_begin(QEMUFile *f);
+int qemu_load_ram_state(QEMUFile *f);
+int qemu_load_device_state(QEMUFile *f);

  typedef enum DisplayType
  {
diff --git a/migration/colo.c b/migration/colo.c
index fd87f34..4a99819 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -290,21 +290,34 @@ static int colo_do_checkpoint_transaction(MigrationState 
*s,
          goto out;
      }

+    colo_put_cmd(s->to_dst_file, COLO_COMMAND_VMSTATE_SEND, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
      /* Disable block migration */
      s->params.blk = 0;
      s->params.shared = 0;
-    qemu_savevm_state_header(trans);
-    qemu_savevm_state_begin(trans, &s->params);
-    qemu_mutex_lock_iothread();
-    qemu_savevm_state_complete_precopy(trans, false);
-    qemu_mutex_unlock_iothread();
-
-    qemu_fflush(trans);
+    qemu_savevm_state_begin(s->to_dst_file, &s->params);
+    ret = qemu_file_get_error(s->to_dst_file);
+    if (ret < 0) {
+        error_report("save vm state begin error");
+        goto out;
+    }

-    colo_put_cmd(s->to_dst_file, COLO_COMMAND_VMSTATE_SEND, &local_err);
-    if (local_err) {


+    qemu_mutex_lock_iothread();
+    /* Only save VM's live state, which not including device state */
+    qemu_savevm_live_state(s->to_dst_file);

I'm a little worried that this is done with the lock held;
if the destination hung during this point and there was
enough data to fill the network buffers, couldn't we hang
and then we wouldn't be able to issue the colo_lost_heartbeat
command?


Yes, we may encounter this problem here. I'm wondering whether it is
safe to save the remaining pages without holding this lock.
Do you know why we must hold this lock during the completion stage
of migration, while the VM is stopped?

+    /* Note: device state is saved into buffer */
+    ret = qemu_save_device_state(trans);
+    if (ret < 0) {
+        error_report("save device state error\n");
+        qemu_mutex_unlock_iothread();
          goto out;
      }
+    qemu_fflush(trans);
+    qemu_mutex_unlock_iothread();

You can save an unlock by doing:
    ret = ...
    qemu_mutex_unlock_iothread();
    if (ret < 0) {


I will fix it in next version.


      /* we send the total size of the vmstate first */
      size = qsb_get_length(buffer);
      colo_put_cmd_value(s->to_dst_file, COLO_COMMAND_VMSTATE_SIZE,
@@ -573,6 +586,16 @@ void *colo_process_incoming_thread(void *opaque)
              goto out;
          }

+        ret = qemu_loadvm_state_begin(mis->from_src_file);
+        if (ret < 0) {
+            error_report("load vm state begin error, ret=%d", ret);
+            goto out;
+        }
+        ret = qemu_load_ram_state(mis->from_src_file);
+        if (ret < 0) {
+            error_report("load ram state error");
+            goto out;
+        }
          /* read the VM state total size first */
          value = colo_get_cmd_value(mis->from_src_file,
                                   COLO_COMMAND_VMSTATE_SIZE, &local_err);
@@ -605,8 +628,10 @@ void *colo_process_incoming_thread(void *opaque)
          qemu_mutex_lock_iothread();
          qemu_system_reset(VMRESET_SILENT);
          vmstate_loading = true;
-        if (qemu_loadvm_state(fb) < 0) {
-            error_report("COLO: loadvm failed");
+        colo_flush_ram_cache();
+        ret = qemu_load_device_state(fb);
+        if (ret < 0) {
+            error_report("COLO: load device state failed\n");

Unneeded \n


[...]

              vmstate_loading = false;
              qemu_mutex_unlock_iothread();
              goto out;
diff --git a/migration/ram.c b/migration/ram.c
index 8ff7f7c..45d9332 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2458,7 +2458,6 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
       * be atomic
       */
      bool postcopy_running = postcopy_state_get() >= 
POSTCOPY_INCOMING_LISTENING;
-    bool need_flush = false;

      seq_iter++;

@@ -2493,7 +2492,6 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
              /* After going into COLO, we should load the Page into colo_cache 
*/
              if (ram_cache_enable) {
                  host = colo_cache_from_block_offset(block, addr);
-                need_flush = true;
              } else {
                  host = host_from_ram_block_offset(block, addr);
              }
@@ -2588,9 +2586,6 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)

      rcu_read_unlock();

-    if (!ret  && ram_cache_enable && need_flush) {
-        colo_flush_ram_cache();
-    }
      DPRINTF("Completed load of VM with exit code %d seq iteration "
              "%" PRIu64 "\n", ret, seq_iter);
      return ret;
diff --git a/migration/savevm.c b/migration/savevm.c
index 7ceec2d..7892070 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -50,6 +50,7 @@
  #include "qemu/iov.h"
  #include "block/snapshot.h"
  #include "block/qapi.h"
+#include "migration/colo.h"


  #ifndef ETH_P_RARP
@@ -923,6 +924,10 @@ void qemu_savevm_state_begin(QEMUFile *f,
              break;
          }
      }
+    if (migration_in_colo_state()) {
+        qemu_put_byte(f, QEMU_VM_EOF);
+        qemu_fflush(f);
+    }
  }

  /*
@@ -1192,13 +1197,20 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
      return ret;
  }

-static int qemu_save_device_state(QEMUFile *f)
+void qemu_savevm_live_state(QEMUFile *f)
  {
-    SaveStateEntry *se;
+    /* save QEMU_VM_SECTION_END section */
+    qemu_savevm_state_complete_precopy(f, true);
+    qemu_put_byte(f, QEMU_VM_EOF);
+}

-    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
-    qemu_put_be32(f, QEMU_VM_FILE_VERSION);
+int qemu_save_device_state(QEMUFile *f)
+{
+    SaveStateEntry *se;

+    if (!migration_in_colo_state()) {
+        qemu_savevm_state_header(f);
+    }
      cpu_synchronize_all_states();

      QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
@@ -1957,6 +1969,52 @@ int qemu_loadvm_state(QEMUFile *f)
      return ret;
  }

+int qemu_loadvm_state_begin(QEMUFile *f)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    Error *local_err = NULL;
+    int ret;
+
+    if (qemu_savevm_state_blocked(&local_err)) {
+        error_report_err(local_err);
+        return -EINVAL;
+    }
+    /* Load QEMU_VM_SECTION_START section */
+    ret = qemu_loadvm_state_main(f, mis);
+    if (ret < 0) {
+        error_report("Failed to loadvm begin work: %d", ret);
+    }
+    return ret;
+}
+
+int qemu_load_ram_state(QEMUFile *f)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    int ret;
+
+    ret = qemu_loadvm_state_main(f, mis);
+    if (ret < 0) {
+        error_report("Failed to load ram state: %d", ret);
+    }
+    return ret;
+}

This function does very little; why not just make qemu_loadvm_state_main
non-static and call that directly?


Hmm, nothing special; I will fix it in the next version.

Thanks,
Hailiang


+int qemu_load_device_state(QEMUFile *f)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    int ret;
+
+    /* Load QEMU_VM_SECTION_FULL section */
+    ret = qemu_loadvm_state_main(f, mis);
+    if (ret < 0) {
+        error_report("Failed to load device state: %d", ret);
+        return ret;
+    }
+
+    cpu_synchronize_all_post_init();
+    return 0;
+}
+
  void hmp_savevm(Monitor *mon, const QDict *qdict)
  {
      BlockDriverState *bs, *bs1;
--
1.8.3.1


--
Dr. David Alan Gilbert / address@hidden / Manchester, UK

.






reply via email to

[Prev in Thread] Current Thread [Next in Thread]