qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 11/17] pseries: savevm support for pseries machine


From: Anthony Liguori
Subject: Re: [Qemu-devel] [PATCH 11/17] pseries: savevm support for pseries machine
Date: Mon, 08 Jul 2013 13:45:05 -0500
User-agent: Notmuch/0.15.2+202~g0c4b8aa (http://notmuchmail.org) Emacs/23.3.1 (x86_64-pc-linux-gnu)

Alexey Kardashevskiy <address@hidden> writes:

> From: David Gibson <address@hidden>
>
> This adds the necessary pieces to implement savevm / migration for the
> pseries machine.  The most complex part here is migrating the hash
> table - for the paravirtualized pseries machine the guest's hash page
> table is not stored within guest memory, but externally and the guest
> accesses it via hypercalls.
>
> This patch uses a hypervisor reserved bit of the HPTE as a dirty bit
> (tracking changes to the HPTE itself, not the page it references).
> This is used to implement a live migration style incremental save and
> restore of the hash table contents.
>
> In addition it adds VMStateDescription information to save and restore
> the (few) remaining pieces of state information needed by the pseries
> machine.
>
> Signed-off-by: David Gibson <address@hidden>
> Signed-off-by: Alexey Kardashevskiy <address@hidden>

I vaguely recall making the suggestion to use a live section like this.
How large is the HTAB typically?

Regards,

Anthony Liguori

> ---
>  hw/ppc/spapr.c         |  269 +++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/ppc/spapr_hcall.c   |    8 +-
>  include/hw/ppc/spapr.h |   12 ++-
>  3 files changed, 281 insertions(+), 8 deletions(-)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index def3505..f989a22 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -32,6 +32,7 @@
>  #include "sysemu/cpus.h"
>  #include "sysemu/kvm.h"
>  #include "kvm_ppc.h"
> +#include "mmu-hash64.h"
>  
>  #include "hw/boards.h"
>  #include "hw/ppc/ppc.h"
> @@ -667,7 +668,7 @@ static void spapr_cpu_reset(void *opaque)
>  
>      env->spr[SPR_HIOR] = 0;
>  
> -    env->external_htab = spapr->htab;
> +    env->external_htab = (uint8_t *)spapr->htab;
>      env->htab_base = -1;
>      env->htab_mask = HTAB_SIZE(spapr) - 1;
>      env->spr[SPR_SDR1] = (target_ulong)spapr->htab |
> @@ -719,6 +720,268 @@ static int spapr_vga_init(PCIBus *pci_bus)
>      }
>  }
>  
> +static const VMStateDescription vmstate_spapr = {
> +    .name = "spapr",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .fields      = (VMStateField []) {
> +        VMSTATE_UINT32(next_irq, sPAPREnvironment),
> +
> +        /* RTC offset */
> +        VMSTATE_UINT64(rtc_offset, sPAPREnvironment),
> +
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
> +#define HPTE(_table, _i)   (void *)(((uint64_t *)(_table)) + ((_i) * 2))
> +#define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
> +#define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
> +#define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))
> +
> +static int htab_save_setup(QEMUFile *f, void *opaque)
> +{
> +    sPAPREnvironment *spapr = opaque;
> +
> +    spapr->htab_save_index = 0;
> +    spapr->htab_first_pass = true;
> +
> +    /* "Iteration" header */
> +    qemu_put_be32(f, spapr->htab_shift);
> +
> +    return 0;
> +}
> +
> +#define MAX_ITERATION_NS    5000000 /* 5 ms */
> +
> +static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr,
> +                                 int64_t max_ns)
> +{
> +    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
> +    int index = spapr->htab_save_index;
> +    int64_t starttime = qemu_get_clock_ns(rt_clock);
> +
> +    assert(spapr->htab_first_pass);
> +
> +    do {
> +        int chunkstart;
> +
> +        /* Consume invalid HPTEs */
> +        while ((index < htabslots)
> +               && !HPTE_VALID(HPTE(spapr->htab, index))) {
> +            index++;
> +            CLEAN_HPTE(HPTE(spapr->htab, index));
> +        }
> +
> +        /* Consume valid HPTEs */
> +        chunkstart = index;
> +        while ((index < htabslots)
> +               && HPTE_VALID(HPTE(spapr->htab, index))) {
> +            index++;
> +            CLEAN_HPTE(HPTE(spapr->htab, index));
> +        }
> +
> +        if (index > chunkstart) {
> +            int n_valid = index - chunkstart;
> +
> +            qemu_put_be32(f, chunkstart);
> +            qemu_put_be16(f, n_valid);
> +            qemu_put_be16(f, 0);
> +            qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
> +                            HASH_PTE_SIZE_64 * n_valid);
> +
> +            if ((qemu_get_clock_ns(rt_clock) - starttime) > max_ns) {
> +                break;
> +            }
> +        }
> +    } while ((index < htabslots) && !qemu_file_rate_limit(f));
> +
> +    if (index >= htabslots) {
> +        assert(index == htabslots);
> +        index = 0;
> +        spapr->htab_first_pass = false;
> +    }
> +    spapr->htab_save_index = index;
> +}
> +
> +static bool htab_save_later_pass(QEMUFile *f, sPAPREnvironment *spapr,
> +                                 int64_t max_ns)
> +{
> +    bool final = max_ns < 0;
> +    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
> +    int examined = 0, sent = 0;
> +    int index = spapr->htab_save_index;
> +    int64_t starttime = qemu_get_clock_ns(rt_clock);
> +
> +    assert(!spapr->htab_first_pass);
> +
> +    do {
> +        int chunkstart, invalidstart;
> +
> +        /* Consume non-dirty HPTEs */
> +        while ((index < htabslots)
> +               && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
> +            index++;
> +            examined++;
> +        }
> +
> +        chunkstart = index;
> +        /* Consume valid dirty HPTEs */
> +        while ((index < htabslots)
> +               && HPTE_DIRTY(HPTE(spapr->htab, index))
> +               && HPTE_VALID(HPTE(spapr->htab, index))) {
> +            CLEAN_HPTE(HPTE(spapr->htab, index));
> +            index++;
> +            examined++;
> +        }
> +
> +        invalidstart = index;
> +        /* Consume invalid dirty HPTEs */
> +        while ((index < htabslots)
> +               && HPTE_DIRTY(HPTE(spapr->htab, index))
> +               && !HPTE_VALID(HPTE(spapr->htab, index))) {
> +            CLEAN_HPTE(HPTE(spapr->htab, index));
> +            index++;
> +            examined++;
> +        }
> +
> +        if (index > chunkstart) {
> +            int n_valid = invalidstart - chunkstart;
> +            int n_invalid = index - invalidstart;
> +
> +            qemu_put_be32(f, chunkstart);
> +            qemu_put_be16(f, n_valid);
> +            qemu_put_be16(f, n_invalid);
> +            qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
> +                            HASH_PTE_SIZE_64 * n_valid);
> +            sent += index - chunkstart;
> +
> +            if (!final && (qemu_get_clock_ns(rt_clock) - starttime) > max_ns) {
> +                break;
> +            }
> +        }
> +
> +        if (examined >= htabslots) {
> +            break;
> +        }
> +
> +        if (index >= htabslots) {
> +            assert(index == htabslots);
> +            index = 0;
> +        }
> +    } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));
> +
> +    if (index >= htabslots) {
> +        assert(index == htabslots);
> +        index = 0;
> +    }
> +
> +    spapr->htab_save_index = index;
> +
> +    return (examined >= htabslots) && (sent == 0);
> +}
> +
> +static int htab_save_iterate(QEMUFile *f, void *opaque)
> +{
> +    sPAPREnvironment *spapr = opaque;
> +    bool nothingleft = false;;
> +
> +    /* Iteration header */
> +    qemu_put_be32(f, 0);
> +
> +    if (spapr->htab_first_pass) {
> +        htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
> +    } else {
> +        nothingleft = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
> +    }
> +
> +    /* End marker */
> +    qemu_put_be32(f, 0);
> +    qemu_put_be16(f, 0);
> +    qemu_put_be16(f, 0);
> +
> +    return nothingleft ? 1 : 0;
> +}
> +
> +static int htab_save_complete(QEMUFile *f, void *opaque)
> +{
> +    sPAPREnvironment *spapr = opaque;
> +
> +    /* Iteration header */
> +    qemu_put_be32(f, 0);
> +
> +    htab_save_later_pass(f, spapr, -1);
> +
> +    /* End marker */
> +    qemu_put_be32(f, 0);
> +    qemu_put_be16(f, 0);
> +    qemu_put_be16(f, 0);
> +
> +    return 0;
> +}
> +
> +static int htab_load(QEMUFile *f, void *opaque, int version_id)
> +{
> +    sPAPREnvironment *spapr = opaque;
> +    uint32_t section_hdr;
> +
> +    if (version_id < 1 || version_id > 1) {
> +        fprintf(stderr, "htab_load() bad version\n");
> +        return -EINVAL;
> +    }
> +
> +    section_hdr = qemu_get_be32(f);
> +
> +    if (section_hdr) {
> +        /* First section, just the hash shift */
> +        if (spapr->htab_shift != section_hdr) {
> +            return -EINVAL;
> +        }
> +        return 0;
> +    }
> +
> +    while (true) {
> +        uint32_t index;
> +        uint16_t n_valid, n_invalid;
> +
> +        index = qemu_get_be32(f);
> +        n_valid = qemu_get_be16(f);
> +        n_invalid = qemu_get_be16(f);
> +
> +        if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
> +            /* End of Stream */
> +            break;
> +        }
> +
> +        if ((index + n_valid + n_invalid) >=
> +            (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
> +            /* Bad index in stream */
> +            fprintf(stderr, "htab_load() bad index %d (%hd+%hd entries) "
> +                    "in htab stream\n", index, n_valid, n_invalid);
> +            return -EINVAL;
> +        }
> +
> +        if (n_valid) {
> +            qemu_get_buffer(f, HPTE(spapr->htab, index),
> +                            HASH_PTE_SIZE_64 * n_valid);
> +        }
> +        if (n_invalid) {
> +            memset(HPTE(spapr->htab, index + n_valid), 0,
> +                   HASH_PTE_SIZE_64 * n_invalid);
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static SaveVMHandlers savevm_htab_handlers = {
> +    .save_live_setup = htab_save_setup,
> +    .save_live_iterate = htab_save_iterate,
> +    .save_live_complete = htab_save_complete,
> +    .load_state = htab_load,
> +};
> +
>  static struct icp_state *try_create_xics(const char *type, int nr_servers,
>                                           int nr_irqs)
>  {
> @@ -987,6 +1250,10 @@ static void ppc_spapr_init(QEMUMachineInitArgs *args)
>  
>      spapr->entry_point = 0x100;
>  
> +    vmstate_register(NULL, 0, &vmstate_spapr, spapr);
> +    register_savevm_live(NULL, "spapr/htab", -1, 1,
> +                         &savevm_htab_handlers, spapr);
> +
>      /* Prepare the device tree */
>      spapr->fdt_skel = spapr_create_fdt_skel(cpu_model,
>                                              initrd_base, initrd_size,
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index e6f321d..7ca984e 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -115,7 +115,7 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>      }
>      ppc_hash64_store_hpte1(env, hpte, ptel);
>      /* eieio();  FIXME: need some sort of barrier for smp? */
> -    ppc_hash64_store_hpte0(env, hpte, pteh);
> +    ppc_hash64_store_hpte0(env, hpte, pteh | HPTE64_V_HPTE_DIRTY);
>  
>      args[0] = pte_index + i;
>      return H_SUCCESS;
> @@ -152,7 +152,7 @@ static target_ulong remove_hpte(CPUPPCState *env, target_ulong ptex,
>      }
>      *vp = v;
>      *rp = r;
> -    ppc_hash64_store_hpte0(env, hpte, 0);
> +    ppc_hash64_store_hpte0(env, hpte, HPTE64_V_HPTE_DIRTY);
>      rb = compute_tlbie_rb(v, r, ptex);
>      ppc_tlb_invalidate_one(env, rb);
>      return REMOVE_SUCCESS;
> @@ -282,11 +282,11 @@ static target_ulong h_protect(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>      r |= (flags << 48) & HPTE64_R_KEY_HI;
>      r |= flags & (HPTE64_R_PP | HPTE64_R_N | HPTE64_R_KEY_LO);
>      rb = compute_tlbie_rb(v, r, pte_index);
> -    ppc_hash64_store_hpte0(env, hpte, v & ~HPTE64_V_VALID);
> +    ppc_hash64_store_hpte0(env, hpte, (v & ~HPTE64_V_VALID) | HPTE64_V_HPTE_DIRTY);
>      ppc_tlb_invalidate_one(env, rb);
>      ppc_hash64_store_hpte1(env, hpte, r);
>      /* Don't need a memory barrier, due to qemu's global lock */
> -    ppc_hash64_store_hpte0(env, hpte, v);
> +    ppc_hash64_store_hpte0(env, hpte, v | HPTE64_V_HPTE_DIRTY);
>      return H_SUCCESS;
>  }
>  
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 09c4570..4cfe449 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -9,6 +9,8 @@ struct sPAPRPHBState;
>  struct sPAPRNVRAM;
>  struct icp_state;
>  
> +#define HPTE64_V_HPTE_DIRTY     0x0000000000000040ULL
> +
>  typedef struct sPAPREnvironment {
>      struct VIOsPAPRBus *vio_bus;
>      QLIST_HEAD(, sPAPRPHBState) phbs;
> @@ -17,20 +19,24 @@ typedef struct sPAPREnvironment {
>  
>      hwaddr ram_limit;
>      void *htab;
> -    long htab_shift;
> +    uint32_t htab_shift;
>      hwaddr rma_size;
>      int vrma_adjust;
>      hwaddr fdt_addr, rtas_addr;
>      long rtas_size;
>      void *fdt_skel;
>      target_ulong entry_point;
> -    int next_irq;
> -    int rtc_offset;
> +    uint32_t next_irq;
> +    uint64_t rtc_offset;
>      char *cpu_model;
>      bool has_graphics;
>  
>      uint32_t epow_irq;
>      Notifier epow_notifier;
> +
> +    /* Migration state */
> +    int htab_save_index;
> +    bool htab_first_pass;
>  } sPAPREnvironment;
>  
>  #define H_SUCCESS         0
> -- 
> 1.7.10.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]