qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH qemu v9 09/13] spapr_pci: Enable vfio-pci hotplu


From: Alexey Kardashevskiy
Subject: Re: [Qemu-devel] [PATCH qemu v9 09/13] spapr_pci: Enable vfio-pci hotplug
Date: Fri, 3 Jul 2015 21:39:24 +1000
User-agent: Mozilla/5.0 (X11; Linux i686 on x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.0.1

Oops, I have not added Mike in cc:, especially for this patch.


On 07/03/2015 09:28 PM, Alexey Kardashevskiy wrote:
sPAPR IOMMU is managing two copies of an TCE table:
1) a guest view of the table - this is what emulated devices use and
this is where H_GET_TCE reads from;
2) a hardware TCE table - only present if there is at least one vfio-pci
device on a PHB; it is updated via a memory listener on a PHB address
space which forwards map/unmap requests to vfio-pci IOMMU host driver.

At the moment presence of vfio-pci devices on a bus affect the way
the guest view table is allocated. If there is no vfio-pci on a PHB
and the host kernel supports KVM acceleration of H_PUT_TCE, a table
is allocated in KVM. However, if there is vfio-pci and we do yet not
support KVM acceleration for these, the table has to be allocated
by the userspace.

When vfio-pci device is hotplugged and there were no vfio-pci devices
already, the guest view table could have been allocated by KVM which
means that H_PUT_TCE is handled by the host kernel and since we
do not support vfio-pci in KVM, the hardware table will not be updated.

This reallocates the guest view table in QEMU if the first vfio-pci
device has just been plugged. spapr_tce_realloc_userspace() handles this.

This replays all the mappings to make sure that the tables are in sync.
This will not have a visible effect though as for a new device
the guest kernel will allocate-and-map new addresses and therefore
existing mappings from emulated devices will not be used by vfio-pci
devices.

This adds calls to spapr_phb_dma_capabilities_update() in PCI hotplug
hooks.

Signed-off-by: Alexey Kardashevskiy <address@hidden>
---
Changes:
v9:
* spapr_phb_hotplug_dma_sync() enumerates TCE tables explicitely rather than
via object_child_foreach()
* spapr_phb_hotplug_dma_sync() does memory_region_del_subregion() +
memory_region_add_subregion() as otherwise vfio_listener_region_del() is not
called and we end up with vfio_iommu_map_notify registered twice (comments 
welcome!)
if we do hotplug+hotunplug+hotplug of the same device.
* moved spapr_phb_hotplug_dma_sync() on unplug event to rcu as before calling
spapr_phb_hotplug_dma_sync(), we need VFIO to release the container, otherwise
spapr_phb_dma_capabilities_update() will decide that the PHB still has VFIO 
device.
Actual VFIO PCI device release happens from rcu and since we add ours later,
it gets executed later and we are good.
---
  hw/ppc/spapr_iommu.c        | 51 ++++++++++++++++++++++++++++++++++++++++++---
  hw/ppc/spapr_pci.c          | 49 +++++++++++++++++++++++++++++++++++++++++++
  include/hw/pci-host/spapr.h |  1 +
  include/hw/ppc/spapr.h      |  2 ++
  trace-events                |  2 ++
  5 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 45c00d8..2d99c3b 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -78,12 +78,13 @@ static uint64_t *spapr_tce_alloc_table(uint32_t liobn,
                                         uint32_t nb_table,
                                         uint32_t page_shift,
                                         int *fd,
-                                       bool vfio_accel)
+                                       bool vfio_accel,
+                                       bool force_userspace)
  {
      uint64_t *table = NULL;
      uint64_t window_size = (uint64_t)nb_table << page_shift;

-    if (kvm_enabled() && !(window_size >> 32)) {
+    if (kvm_enabled() && !force_userspace && !(window_size >> 32)) {
          table = kvmppc_create_spapr_tce(liobn, window_size, fd, vfio_accel);
      }

@@ -222,7 +223,8 @@ static void spapr_tce_table_do_enable(sPAPRTCETable *tcet, 
bool vfio_accel)
                                          tcet->nb_table,
                                          tcet->page_shift,
                                          &tcet->fd,
-                                        vfio_accel);
+                                        vfio_accel,
+                                        false);

      memory_region_set_size(&tcet->iommu,
                             (uint64_t)tcet->nb_table << tcet->page_shift);
@@ -495,6 +497,49 @@ int spapr_dma_dt(void *fdt, int node_off, const char 
*propname,
      return 0;
  }

+static int spapr_tce_do_replay(sPAPRTCETable *tcet, uint64_t *table)
+{
+    target_ulong ioba = tcet->bus_offset, pgsz = (1ULL << tcet->page_shift);
+    long i, ret = 0;
+
+    for (i = 0; i < tcet->nb_table; ++i, ioba += pgsz) {
+        ret = put_tce_emu(tcet, ioba, table[i]);
+        if (ret) {
+            break;
+        }
+    }
+
+    return ret;
+}
+
+int spapr_tce_replay(sPAPRTCETable *tcet)
+{
+    return spapr_tce_do_replay(tcet, tcet->table);
+}
+
+int spapr_tce_realloc_userspace(sPAPRTCETable *tcet, bool replay)
+{
+    int ret = 0, oldfd;
+    uint64_t *oldtable;
+
+    oldtable = tcet->table;
+    oldfd = tcet->fd;
+    tcet->table = spapr_tce_alloc_table(tcet->liobn,
+                                        tcet->nb_table,
+                                        tcet->page_shift,
+                                        &tcet->fd,
+                                        false,
+                                        true); /* force_userspace */
+
+    if (replay) {
+        ret = spapr_tce_do_replay(tcet, oldtable);
+    }
+
+    spapr_tce_free_table(oldtable, oldfd, tcet->nb_table);
+
+    return ret;
+}
+
  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
                        sPAPRTCETable *tcet)
  {
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 76c988f..dca747f 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -827,6 +827,45 @@ int spapr_phb_dma_reset(sPAPRPHBState *sphb)
      return 0;
  }

+static int spapr_phb_hotplug_dma_sync(sPAPRPHBState *sphb)
+{
+    int ret = 0, i;
+    bool had_vfio = sphb->has_vfio;
+    sPAPRTCETable *tcet;
+
+    spapr_phb_dma_capabilities_update(sphb);
+
+    /* We only update DMA config if there was no VFIO and now we got one */
+    if (had_vfio || !sphb->has_vfio) {
+        return 0;
+    }
+
+    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
+        tcet = spapr_tce_find_by_liobn(SPAPR_PCI_LIOBN(sphb->index, i));
+        if (!tcet || !tcet->enabled) {
+            continue;
+        }
+        if (tcet->fd >= 0) {
+            /*
+             * We got first vfio-pci device on accelerated table.
+             * VFIO acceleration is not possible.
+             * Reallocate table in userspace and replay mappings.
+             */
+            ret = spapr_tce_realloc_userspace(tcet, true);
+            trace_spapr_pci_dma_realloc_update(tcet->liobn, ret);
+        } else {
+            /* There was no acceleration, so just replay mappings. */
+            ret = spapr_tce_replay(tcet);
+            trace_spapr_pci_dma_update(tcet->liobn, ret);
+        }
+        if (ret) {
+            break;
+        }
+    }
+
+    return ret;
+}
+
  /* Macros to operate with address in OF binding to PCI */
  #define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
  #define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
@@ -1106,6 +1145,7 @@ static void spapr_phb_add_pci_device(sPAPRDRConnector 
*drc,
              error_setg(errp, "Failed to create pci child device tree node");
              goto out;
          }
+        spapr_phb_hotplug_dma_sync(phb);
      }

      drck->attach(drc, DEVICE(pdev),
@@ -1116,6 +1156,12 @@ out:
      }
  }

+static void spapr_phb_remove_sync_dma(struct rcu_head *head)
+{
+    sPAPRPHBState *sphb = container_of(head, sPAPRPHBState, rcu);
+    spapr_phb_hotplug_dma_sync(sphb);
+}
+
  static void spapr_phb_remove_pci_device_cb(DeviceState *dev, void *opaque)
  {
      /* some version guests do not wait for completion of a device
@@ -1130,6 +1176,9 @@ static void spapr_phb_remove_pci_device_cb(DeviceState 
*dev, void *opaque)
       */
      pci_device_reset(PCI_DEVICE(dev));
      object_unparent(OBJECT(dev));
+
+    /* Actual VFIO device release happens from RCU so postpone DMA update */
+    call_rcu1(&((sPAPRPHBState *)opaque)->rcu, spapr_phb_remove_sync_dma);
  }

  static void spapr_phb_remove_pci_device(sPAPRDRConnector *drc,
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index bf66315..8b007aa 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -61,6 +61,7 @@ typedef struct spapr_pci_msi_mig {

  struct sPAPRPHBState {
      PCIHostState parent_obj;
+    struct rcu_head rcu;

      uint32_t index;
      uint64_t buid;
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index e32e787..4645f16 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -588,6 +588,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char 
*propname,
                   uint32_t liobn, uint64_t window, uint32_t size);
  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
                        sPAPRTCETable *tcet);
+int spapr_tce_replay(sPAPRTCETable *tcet);
+int spapr_tce_realloc_userspace(sPAPRTCETable *tcet, bool replay);
  void spapr_pci_switch_vga(bool big_endian);
  void spapr_hotplug_req_add_event(sPAPRDRConnector *drc);
  void spapr_hotplug_req_remove_event(sPAPRDRConnector *drc);
diff --git a/trace-events b/trace-events
index a93af9a..a994019 100644
--- a/trace-events
+++ b/trace-events
@@ -1300,6 +1300,8 @@ spapr_pci_rtas_ibm_query_interrupt_source_number(unsigned ioa, 
unsigned intr) "q
  spapr_pci_msi_write(uint64_t addr, uint64_t data, uint32_t dt_irq) 
"@%"PRIx64"<=%"PRIx64" IRQ %u"
  spapr_pci_lsi_set(const char *busname, int pin, uint32_t irq) "%s PIN%d IRQ 
%u"
  spapr_pci_msi_retry(unsigned config_addr, unsigned req_num, unsigned max_irqs) 
"Guest device at %x asked %u, have only %u"
+spapr_pci_dma_update(uint64_t liobn, long ret) "liobn=%"PRIx64" ret=%ld"
+spapr_pci_dma_realloc_update(uint64_t liobn, long ret) "liobn=%"PRIx64" 
tcet=%ld"

  # hw/pci/pci.c
  pci_update_mappings_del(void *d, uint32_t bus, uint32_t func, uint32_t slot, int bar, uint64_t 
addr, uint64_t size) "d=%p %02x:%02x.%x %d,%#"PRIx64"+%#"PRIx64



--
Alexey



reply via email to

[Prev in Thread] Current Thread [Next in Thread]