qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH 1/6] Migration: Reconnect network in case of network failure during pc migration (source)


From: Md Haris Iqbal
Subject: [Qemu-devel] [PATCH 1/6] Migration: Reconnect network in case of network failure during pc migration (source)
Date: Mon, 22 Aug 2016 02:28:47 +0530

Signed-off-by: Md Haris Iqbal <address@hidden>
---
 hmp-commands.hx               |  20 +++---
 hmp.c                         |   4 +-
 include/migration/migration.h |   4 ++
 migration/migration.c         | 153 +++++++++++++++++++++++++++++++++++++-----
 qapi-schema.json              |  16 ++++-
 qmp-commands.hx               |   3 +-
 vl.c                          |   4 ++
 7 files changed, 174 insertions(+), 30 deletions(-)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index 848efee..8f765fd 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -894,23 +894,25 @@ ETEXI
 
     {
         .name       = "migrate",
-        .args_type  = "detach:-d,blk:-b,inc:-i,uri:s",
-        .params     = "[-d] [-b] [-i] uri",
+        .args_type  = "detach:-d,recover:-r,blk:-b,inc:-i,uri:s",
+        .params     = "[-d] [-r] [-b] [-i] uri",
         .help       = "migrate to URI (using -d to not wait for completion)"
-                     "\n\t\t\t -b for migration without shared storage with"
-                     " full copy of disk\n\t\t\t -i for migration without "
-                     "shared storage with incremental copy of disk "
-                     "(base image shared between src and destination)",
+                     "\n\t\t\t -r to recover from a broken migration\n\t\t\t"
+                     " -b for migration without shared storage with"
+                     " full copy of disk\n\t\t\t -i for migration without "
+                     "shared storage with incremental copy of disk "
+                     "(base image shared between src and destination)",
         .mhandler.cmd = hmp_migrate,
     },
 
 
 STEXI
-@item migrate [-d] [-b] [-i] @var{uri}
+@item migrate [-d] [-r] [-b] [-i] @var{uri}
 @findex migrate
 Migrate to @var{uri} (using -d to not wait for completion).
-       -b for migration with full copy of disk
-       -i for migration with incremental copy of disk (base image is shared)
+       -r to recover from a broken migration
+       -b for migration with full copy of disk
+       -i for migration with incremental copy of disk (base image is shared)
 ETEXI
 
     {
diff --git a/hmp.c b/hmp.c
index cc2056e..02ed457 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1563,12 +1563,14 @@ static void hmp_migrate_status_cb(void *opaque)
 void hmp_migrate(Monitor *mon, const QDict *qdict)
 {
     bool detach = qdict_get_try_bool(qdict, "detach", false);
+    bool recover = qdict_get_try_bool(qdict, "recover", false);
     bool blk = qdict_get_try_bool(qdict, "blk", false);
     bool inc = qdict_get_try_bool(qdict, "inc", false);
     const char *uri = qdict_get_str(qdict, "uri");
     Error *err = NULL;
 
-    qmp_migrate(uri, !!blk, blk, !!inc, inc, false, false, &err);
+    qmp_migrate(uri, !!recover, recover, !!blk, blk, !!inc, inc, false, false,
+                &err);
     if (err) {
         error_report_err(err);
         return;
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 3c96623..bcaf55d 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -142,6 +142,7 @@ struct MigrationState
     int state;
     /* Old style params from 'migrate' command */
     MigrationParams params;
+    bool in_recovery;
 
     /* State related to return path */
     struct {
@@ -351,6 +352,9 @@ void flush_page_queue(MigrationState *ms);
 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
                          ram_addr_t start, ram_addr_t len);
 
+int qemu_migrate_postcopy_outgoing_recovery(MigrationState *ms);
+int qemu_migrate_postcopy_incoming_recovery(QEMUFile **f,MigrationIncomingState* mis);
+
 PostcopyState postcopy_state_get(void);
 /* Set the state and return the old state */
 PostcopyState postcopy_state_set(PostcopyState new_state);
diff --git a/migration/migration.c b/migration/migration.c
index 955d5ee..6ed2e82 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -709,6 +709,33 @@ MigrationInfo *qmp_query_migrate(Error **errp)
     case MIGRATION_STATUS_CANCELLED:
         info->has_status = true;
         break;
+    case MIGRATION_STATUS_POSTCOPY_RECOVERY:
+        info->has_status = true;
+        info->has_total_time = true;
+        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+        info->has_ram = true;
+        info->ram = g_malloc0(sizeof(*info->ram));
+        info->ram->transferred = ram_bytes_transferred();
+        info->ram->remaining = ram_bytes_remaining();
+        info->ram->total = ram_bytes_total();
+        info->ram->duplicate = dup_mig_pages_transferred();
+        info->ram->skipped = skipped_mig_pages_transferred();
+        info->ram->normal = norm_mig_pages_transferred();
+        info->ram->normal_bytes = norm_mig_bytes_transferred();
+        info->ram->dirty_pages_rate = s->dirty_pages_rate;
+        info->ram->mbps = s->mbps;
+        info->ram->dirty_sync_count = s->dirty_sync_count;
+
+        if (blk_mig_active()) {
+            info->has_disk = true;
+            info->disk = g_malloc0(sizeof(*info->disk));
+            info->disk->transferred = blk_mig_bytes_transferred();
+            info->disk->remaining = blk_mig_bytes_remaining();
+            info->disk->total = blk_mig_bytes_total();
+        }
+
+        get_xbzrle_cache_stats(info);
     }
     info->status = s->state;
 
@@ -993,6 +1020,7 @@ MigrationState *migrate_init(const MigrationParams *params)
     s->xfer_limit = 0;
     s->cleanup_bh = 0;
     s->to_dst_file = NULL;
+    s->in_recovery = false;
     s->state = MIGRATION_STATUS_NONE;
     s->params = *params;
     s->rp_state.from_dst_file = NULL;
@@ -1069,13 +1097,14 @@ bool migration_is_blocked(Error **errp)
     return false;
 }
 
-void qmp_migrate(const char *uri, bool has_blk, bool blk,
-                 bool has_inc, bool inc, bool has_detach, bool detach,
+void qmp_migrate(const char *uri, bool in_recover, bool recover, bool has_blk,
+                 bool blk, bool has_inc, bool inc, bool has_detach, bool detach,
                  Error **errp)
 {
     Error *local_err = NULL;
     MigrationState *s = migrate_get_current();
     MigrationParams params;
+    bool recovery = in_recover && recover;
     const char *p;
 
     params.blk = has_blk && blk;
@@ -1095,7 +1124,39 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
         return;
     }
 
-    s = migrate_init(&params);
+    if (recovery ^ atomic_mb_read(&s->in_recovery)) {
+        if (recovery) {
+            /* No VM is waiting for recovery and
+             * recovery option was set
+             */
+
+            error_setg(errp, "No VM to recover");
+            return;
+        } else {
+            /* A VM is waiting for recovery and
+             * no recovery option is set
+             */
+
+            error_setg(errp, "A migration is in recovery state");
+            return;
+        }
+    } else {
+        if (!recovery) {
+            /* No VM is waiting for recovery and
+             * no recovery option is set
+             */
+            s = migrate_init(&params);
+        } else {
+            /* A VM is waiting for recovery and
+             * recovery option was set
+             */
+            s->to_dst_file = NULL;
+            if (s->rp_state.from_dst_file) {
+                /* shutdown the rp socket, so causing the rp thread to shutdown */
+                qemu_file_shutdown(s->rp_state.from_dst_file);
+            }
+        }
+    }
 
     if (strstart(uri, "tcp:", &p)) {
         tcp_start_outgoing_migration(s, p, &local_err);
@@ -1336,6 +1397,8 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
  */
 static void *source_return_path_thread(void *opaque)
 {
+    fprintf(stderr, "Return path started on source\n");
+
     MigrationState *ms = opaque;
     QEMUFile *rp = ms->rp_state.from_dst_file;
     uint16_t header_len, header_type;
@@ -1439,8 +1502,8 @@ static void *source_return_path_thread(void *opaque)
 
     trace_source_return_path_thread_end();
 out:
-    ms->rp_state.from_dst_file = NULL;
     qemu_fclose(rp);
+    fprintf(stderr, "Return path failed on source\n");
     return NULL;
 }
 
@@ -1714,6 +1777,7 @@ static void *migration_thread(void *opaque)
     bool entered_postcopy = false;
     /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
     enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
+    int ret;
 
     rcu_register_thread();
 
@@ -1781,7 +1845,26 @@ static void *migration_thread(void *opaque)
             }
         }
 
-        if (qemu_file_get_error(s->to_dst_file)) {
+        if ((ret = qemu_file_get_error(s->to_dst_file))) {
+            /*  This check is based on how the error is set during the network
+             *  recv(). When recv() returns 0 (i.e. no data to read), the error
+             *  is set to -EIO. For all other network errors, it is set
+             *  according to the return value received.
+             */
+            if (ret == -EIO && s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+                /* Network Failure during postcopy */
+
+                current_active_state = MIGRATION_STATUS_POSTCOPY_RECOVERY;
+                runstate_set(RUN_STATE_POSTMIGRATE_RECOVERY);
+                ret = qemu_migrate_postcopy_outgoing_recovery(s);
+                if(ret == 0) {
+                    current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE;
+                    runstate_set(RUN_STATE_FINISH_MIGRATE);
+                    qemu_file_clear_error(s->to_dst_file);
+                    continue;
+                }
+
+            }
             migrate_set_state(&s->state, current_active_state,
                               MIGRATION_STATUS_FAILED);
             trace_migration_thread_file_err();
@@ -1852,17 +1935,6 @@ static void *migration_thread(void *opaque)
 
 void migrate_fd_connect(MigrationState *s)
 {
-    /* This is a best 1st approximation. ns to ms */
-    s->expected_downtime = max_downtime/1000000;
-    s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
-
-    qemu_file_set_blocking(s->to_dst_file, true);
-    qemu_file_set_rate_limit(s->to_dst_file,
-                             s->bandwidth_limit / XFER_LIMIT_RATIO);
-
-    /* Notify before starting migration thread */
-    notifier_list_notify(&migration_state_notifiers, s);
-
     /*
      * Open the return path; currently for postcopy but other things might
      * also want it.
@@ -1877,12 +1949,61 @@ void migrate_fd_connect(MigrationState *s)
         }
     }
 
+    qemu_file_set_blocking(s->to_dst_file, true);
+    qemu_file_set_rate_limit(s->to_dst_file,
+                             s->bandwidth_limit / XFER_LIMIT_RATIO);
+
+    if (atomic_mb_read(&s->in_recovery)) {
+        qemu_mutex_lock(&migration_recovery_mutex);
+        atomic_mb_set(&s->in_recovery, false);
+        qemu_cond_signal(&migration_recovery_cond);
+        qemu_mutex_unlock(&migration_recovery_mutex);
+
+        fprintf(stderr, "recovered\n");
+        return;
+    }
+
+    /* This is a best 1st approximation. ns to ms */
+    s->expected_downtime = max_downtime/1000000;
+    s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
+
+
+    /* Notify before starting migration thread */
+    notifier_list_notify(&migration_state_notifiers, s);
+
     migrate_compress_threads_create();
     qemu_thread_create(&s->thread, "migration", migration_thread, s,
                        QEMU_THREAD_JOINABLE);
     s->migration_thread_running = true;
 }
 
+int qemu_migrate_postcopy_outgoing_recovery(MigrationState* ms)
+{
+    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+                                  MIGRATION_STATUS_POSTCOPY_RECOVERY);
+
+    atomic_mb_set(&ms->in_recovery, true);
+    /* Code for network recovery to be added here */
+    qemu_mutex_lock(&migration_recovery_mutex);
+    while(atomic_mb_read(&ms->in_recovery) == true) {
+        fprintf(stderr, "Under recovery, not letting it fail %p\n", ms->to_dst_file);
+        qemu_cond_wait(&migration_recovery_cond, &migration_recovery_mutex);
+    }
+    qemu_mutex_unlock(&migration_recovery_mutex);
+
+    if(ms->to_dst_file != NULL) {
+        /* Recovery successful */
+        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_RECOVERY,
+                                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
+
+        qemu_savevm_send_open_return_path(ms->to_dst_file);
+        return 0;
+    }
+
+    return -1;
+
+}
+
 PostcopyState  postcopy_state_get(void)
 {
     return atomic_mb_read(&incoming_postcopy_state);
diff --git a/qapi-schema.json b/qapi-schema.json
index 5658723..a658462 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -154,12 +154,15 @@
 # @watchdog: the watchdog action is configured to pause and has been triggered
 #
 # @guest-panicked: guest has been panicked as a result of guest OS panic
+#
+# @postmigrate-recovery: guest is paused for recovery after a network failure
+# (since 2.7)
 ##
 { 'enum': 'RunState',
   'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused',
             'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm',
             'running', 'save-vm', 'shutdown', 'suspended', 'watchdog',
-            'guest-panicked' ] }
+            'guest-panicked', 'postmigrate-recovery' ] }
 
 ##
 # @StatusInfo:
@@ -438,12 +441,15 @@
 #
 # @failed: some error occurred during migration process.
 #
+# @postcopy-recovery: in recovery mode, after a network failure. (since 2.7)
+#
 # Since: 2.3
 #
 ##
 { 'enum': 'MigrationStatus',
   'data': [ 'none', 'setup', 'cancelling', 'cancelled',
-            'active', 'postcopy-active', 'completed', 'failed' ] }
+            'active', 'postcopy-active', 'completed', 'failed',
+            'postcopy-recovery' ] }
 
 ##
 # @MigrationInfo
@@ -2119,6 +2125,8 @@
 #
 # @uri: the Uniform Resource Identifier of the destination VM
 #
+# @recover: #optional recover from a broken migration (since 2.7)
+#
 # @blk: #optional do block migration (full disk copy)
 #
 # @inc: #optional incremental disk copy migration
@@ -2131,7 +2139,7 @@
 # Since: 0.14.0
 ##
 { 'command': 'migrate',
-  'data': {'uri': 'str', '*blk': 'bool', '*inc': 'bool', '*detach': 'bool' } }
+  'data': {'uri': 'str', '*recover': 'bool', '*blk': 'bool', '*inc': 'bool', '*detach': 'bool' } }
 
 ##
 # @migrate-incoming
@@ -2142,6 +2150,8 @@
 # @uri: The Uniform Resource Identifier identifying the source or
 #       address to listen on
 #
+# @recover: #optional recover from a broken migration (since 2.7)
+#
 # Returns: nothing on success
 #
 # Since: 2.3
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 6866264..dd727bf 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -639,7 +639,7 @@ EQMP
 
     {
         .name       = "migrate",
-        .args_type  = "detach:-d,blk:-b,inc:-i,uri:s",
+        .args_type  = "detach:-d,recover:-r,blk:-b,inc:-i,uri:s",
         .mhandler.cmd_new = qmp_marshal_migrate,
     },
 
@@ -651,6 +651,7 @@ Migrate to URI.
 
 Arguments:
 
+- "recover": recover migration (json-bool, optional)
 - "blk": block migration, full disk copy (json-bool, optional)
 - "inc": incremental disk copy (json-bool, optional)
 - "uri": Destination URI (json-string)
diff --git a/vl.c b/vl.c
index b3c80d5..f702886 100644
--- a/vl.c
+++ b/vl.c
@@ -597,6 +597,10 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH },
+    { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE_RECOVERY },
+
+    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_FINISH_MIGRATE },
+    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_SHUTDOWN },
 
     { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING },
     { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
-- 
2.7.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]