qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [Qemu-devel [RFC] [WIP] v2] Keeping the Source side ali


From: Dr. David Alan Gilbert
Subject: Re: [Qemu-devel] [Qemu-devel [RFC] [WIP] v2] Keeping the Source side alive incase of network failure (Migration recovery from network failure)
Date: Wed, 15 Jun 2016 14:02:50 +0100
User-agent: Mutt/1.6.1 (2016-04-27)

* Md Haris Iqbal (address@hidden) wrote:
> ---
>  include/migration/migration.h |  1 +
>  migration/migration.c         | 76 
> ++++++++++++++++++++++++++++++++++++++++---
>  qapi-schema.json              | 11 +++++--
>  vl.c                          |  4 +++
>  4 files changed, 85 insertions(+), 7 deletions(-)

You need to pull this pair of patches together as a set of patches rather
than two individual patches; it just gets too confusing (like with the
version numbers).
Pull them together in git, and then we can also get the patches split
up into smaller parts and you can keep upto date with current qemu.

> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index 4a3201b..59e26e6 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -326,6 +326,7 @@ void global_state_store_running(void);
>  void flush_page_queue(MigrationState *ms);
>  int ram_save_queue_pages(MigrationState *ms, const char *rbname,
>                           ram_addr_t start, ram_addr_t len);
> +int qemu_migrate_postcopy_outgoing_recovery(MigrationState *ms);
>  
>  PostcopyState postcopy_state_get(void);
>  /* Set the state and return the old state */
> diff --git a/migration/migration.c b/migration/migration.c
> index a77f62e..41c28e1 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -676,6 +676,33 @@ MigrationInfo *qmp_query_migrate(Error **errp)
>      case MIGRATION_STATUS_CANCELLED:
>          info->has_status = true;
>          break;
> +    case MIGRATION_STATUS_POSTCOPY_RECOVERY:
> +        info->has_status = true;
> +        info->has_total_time = true;
> +        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +
> +        info->has_ram = true;
> +        info->ram = g_malloc0(sizeof(*info->ram));
> +        info->ram->transferred = ram_bytes_transferred();
> +        info->ram->remaining = ram_bytes_remaining();
> +        info->ram->total = ram_bytes_total();
> +        info->ram->duplicate = dup_mig_pages_transferred();
> +        info->ram->skipped = skipped_mig_pages_transferred();
> +        info->ram->normal = norm_mig_pages_transferred();
> +        info->ram->normal_bytes = norm_mig_bytes_transferred();
> +        info->ram->dirty_pages_rate = s->dirty_pages_rate;
> +        info->ram->mbps = s->mbps;
> +        info->ram->dirty_sync_count = s->dirty_sync_count;

If you update to the current qemu git you'll see there's a
populate_ram_info function that avoids a lot of this duplication.

> +        if (blk_mig_active()) {
> +            info->has_disk = true;
> +            info->disk = g_malloc0(sizeof(*info->disk));
> +            info->disk->transferred = blk_mig_bytes_transferred();
> +            info->disk->remaining = blk_mig_bytes_remaining();
> +            info->disk->total = blk_mig_bytes_total();
> +        }
> +
> +        get_xbzrle_cache_stats(info);
>      }
>      info->status = s->state;
>  
> @@ -1660,6 +1687,8 @@ static void *migration_thread(void *opaque)
>      /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
>      enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
>  
> +    int ret;
> +
>      rcu_register_thread();
>  
>      qemu_savevm_state_header(s->to_dst_file);
> @@ -1726,11 +1755,32 @@ static void *migration_thread(void *opaque)
>              }
>          }
>  
> -        if (qemu_file_get_error(s->to_dst_file)) {
> -            migrate_set_state(&s->state, current_active_state,
> -                              MIGRATION_STATUS_FAILED);
> -            trace_migration_thread_file_err();
> -            break;
> +        if ((ret = qemu_file_get_error(s->to_dst_file))) {
> +            fprintf(stderr, "1 : Error %s %d\n", strerror(-ret), -ret);
> +
> +            /*  This check is based on how the error is set during the 
> network
> +             *  recv(). When recv() returns 0 (i.e. no data to read), the 
> error
> +             *  is set to -EIO. For all other network errors, it is set
> +             *  according to the return value received.
> +             */
> +            if (ret != -EIO && s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) 
> {

shouldn't that be   ret == -EIO ?

> +                /* Network Failure during postcopy */
> +
> +                current_active_state = MIGRATION_STATUS_POSTCOPY_RECOVERY;
> +                runstate_set(RUN_STATE_POSTMIGRATE_RECOVERY);
> +                fprintf(stderr, "1.1 : Error %s %d\n", strerror(-ret), -ret);

All the fprintf's need to come out for the final version; use traces instaed.

> +                ret = qemu_migrate_postcopy_outgoing_recovery(s);
> +                if(ret < 0) {
> +                    break;
> +                }
> +
> +            } else {
> +                migrate_set_state(&s->state, current_active_state,
> +                                             MIGRATION_STATUS_FAILED);
> +                fprintf(stderr, "1.2 : Error %s %d\n", strerror(-ret), -ret);
> +                trace_migration_thread_file_err();
> +                break;
> +            }
>          }
>          current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>          if (current_time >= initial_time + BUFFER_DELAY) {
> @@ -1831,6 +1881,22 @@ void migrate_fd_connect(MigrationState *s)
>      s->migration_thread_running = true;
>  }
>  
> +int qemu_migrate_postcopy_outgoing_recovery(MigrationState* ms)
> +{
> +    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> +                                  MIGRATION_STATUS_POSTCOPY_RECOVERY);
> +
> +    ms->in_recovery = true;
> +    /* Code for network recovery to be added here */
> +    while(atomic_mb_read(&ms->in_recovery) == true) {
> +        fprintf(stderr, "Not letting it fail %p\n", ms->to_dst_file);
> +        sleep(5);
> +    }
> +
> +    return -1;
> +
> +}
> +
>  PostcopyState  postcopy_state_get(void)
>  {
>      return atomic_mb_read(&incoming_postcopy_state);
> diff --git a/qapi-schema.json b/qapi-schema.json
> index 1b7b1e1..613f8d2 100644
> --- a/qapi-schema.json
> +++ b/qapi-schema.json
> @@ -154,12 +154,14 @@
>  # @watchdog: the watchdog action is configured to pause and has been 
> triggered
>  #
>  # @guest-panicked: guest has been panicked as a result of guest OS panic
> +#
> +# @postmigrate-recovery: guest is paused for recovery after a network failure
>  ##
>  { 'enum': 'RunState',
>    'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused',
>              'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm',
>              'running', 'save-vm', 'shutdown', 'suspended', 'watchdog',
> -            'guest-panicked' ] }
> +            'guest-panicked', 'postmigrate-recovery' ] }

Would it be possible to have that as postcopy-recovery rather than postmigrate?

>  ##
>  # @StatusInfo:
> @@ -434,12 +436,15 @@
>  #
>  # @failed: some error occurred during migration process.
>  #
> +# @postcopy-recovery: in recovery mode, after a network failure.
> +#
>  # Since: 2.3
>  #
>  ##
>  { 'enum': 'MigrationStatus',
>    'data': [ 'none', 'setup', 'cancelling', 'cancelled',
> -            'active', 'postcopy-active', 'completed', 'failed' ] }
> +            'active', 'postcopy-active', 'completed', 'failed',
> +            'postcopy-recovery' ] }
>  
>  ##
>  # @MigrationInfo
> @@ -2058,6 +2063,8 @@
>  #
>  # @uri: the Uniform Resource Identifier of the destination VM
>  #
> +# @recover: #optional recover from a broken migration
> +#

That looks like it belongs in a different patch.

>  # @blk: #optional do block migration (full disk copy)
>  #
>  # @inc: #optional incremental disk copy migration
> diff --git a/vl.c b/vl.c
> index 5fd22cb..c237140 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -618,6 +618,10 @@ static const RunStateTransition 
> runstate_transitions_def[] = {
>      { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
>      { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
>      { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH },
> +    { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE_RECOVERY },
> +
> +    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_FINISH_MIGRATE },
> +    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_SHUTDOWN },
>  
>      { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING },
>      { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
> -- 
> 2.7.4
> 

Dave

--
Dr. David Alan Gilbert / address@hidden / Manchester, UK



reply via email to

[Prev in Thread] Current Thread [Next in Thread]