qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 2/2] vfio-powerpc: added VFIO support


From: Alexey Kardashevskiy
Subject: Re: [Qemu-devel] [PATCH 2/2] vfio-powerpc: added VFIO support
Date: Wed, 11 Jul 2012 12:54:45 +1000
User-agent: Mozilla/5.0 (X11; Linux i686; rv:13.0) Gecko/20120614 Thunderbird/13.0.1

On 11/07/12 02:55, Alex Williamson wrote:
> On Tue, 2012-07-10 at 15:51 +1000, Alexey Kardashevskiy wrote:
>> The patch enables VFIO on POWER.
>>
>> It literally does the following:
>>
>> 1. POWERPC IOMMU support (the kernel counterpart is required)
>>
>> 2. Added #ifdef TARGET_PPC64 for EOI handlers initialisation.
>>
>> 3. Added vfio_get_container_fd() to VFIO in order to initialize 1).
>>
>> 4. Makefile fixed and "is_vfio" flag added into sPAPR PHB - required to
>> distinguish VFIO's DMA context from the emulated one.
>>
>> WIth the pathes posted today a bit earlier, this patch fully supports
>> VFIO what includes MSIX as well,
>>
>>
>> Signed-off-by: Alexey Kardashevskiy <address@hidden>
>> ---
>>  hw/ppc/Makefile.objs |    3 ++
>>  hw/spapr.h           |    4 +++
>>  hw/spapr_iommu.c     |   87 
>> ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  hw/spapr_pci.c       |   23 ++++++++++++-
>>  hw/spapr_pci.h       |    2 ++
>>  hw/vfio_pci.c        |   76 +++++++++++++++++++++++++++++++++++++++++--
>>  hw/vfio_pci.h        |    2 ++
>>  7 files changed, 193 insertions(+), 4 deletions(-)
>>
>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>> index f573a95..c46a049 100644
>> --- a/hw/ppc/Makefile.objs
>> +++ b/hw/ppc/Makefile.objs
>> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>>  # Xilinx PPC peripherals
>>  obj-y += xilinx_ethlite.o
>>  
>> +# VFIO PCI device assignment
>> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
>> +
>>  obj-y := $(addprefix ../,$(obj-y))
>> diff --git a/hw/spapr.h b/hw/spapr.h
>> index b37f337..9dca704 100644
>> --- a/hw/spapr.h
>> +++ b/hw/spapr.h
>> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char 
>> *propname,
>>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>>                        DMAContext *dma);
>>  
>> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
>> +                         uint64_t *dma32_window_start,
>> +                         uint64_t *dma32_window_size);
>> +
>>  #endif /* !defined (__HW_SPAPR_H__) */
>> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
>> index 50c288d..0a194e8 100644
>> --- a/hw/spapr_iommu.c
>> +++ b/hw/spapr_iommu.c
>> @@ -16,6 +16,8 @@
>>   * You should have received a copy of the GNU Lesser General Public
>>   * License along with this library; if not, see 
>> <http://www.gnu.org/licenses/>.
>>   */
>> +#include <sys/ioctl.h>
>> +
>>  #include "hw.h"
>>  #include "kvm.h"
>>  #include "qdev.h"
>> @@ -23,6 +25,7 @@
>>  #include "dma.h"
>>  
>>  #include "hw/spapr.h"
>> +#include "hw/linux-vfio.h"
> 
> I really need to move this into linux-headers.
> 
>>  
>>  #include <libfdt.h>
>>  
>> @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong 
>> ioba, target_ulong tce)
>>      return 0;
>>  }
>>  
>> +/* -------- API for POWERPC IOMMU -------- */
>> +
>> +#define POWERPC_IOMMU           2
>> +
>> +struct tce_iommu_info {
>> +    __u32 argsz;
>> +    __u32 dma32_window_start;
>> +    __u32 dma32_window_size;
>> +};
>> +
>> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>> +
>> +struct tce_iommu_dma_map {
>> +    __u32 argsz;
>> +    __u64 va;
>> +    __u64 dmaaddr;
>> +};
>> +
>> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
>> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> 
> I assume this would eventually go into the kernel vfio.h with a VFIO_
> prefix.  Add a flags field to the structures or it'll be hard to extend
> them later.


We can always define another type of IOMMU :) But yes, I'll extend both map and 
info structures.



>> +typedef struct sPAPRVFIOTable {
>> +    int fd;
>> +    uint32_t liobn;
>> +    QLIST_ENTRY(sPAPRVFIOTable) list;
>> +} sPAPRVFIOTable;
>> +
>> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
>> +
>> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
>> +                         uint64_t *dma32_window_start,
>> +                         uint64_t *dma32_window_size)
>> +{
>> +    sPAPRVFIOTable *t;
>> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
>> +
>> +    if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) {
>> +        fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno);
>> +        return;
>> +    }
>> +    *dma32_window_start = info.dma32_window_start;
>> +    *dma32_window_size = info.dma32_window_size;
>> +
>> +    t = g_malloc0(sizeof(*t));
>> +    t->fd = fd;
>> +    t->liobn = liobn;
>> +
>> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
>> +}
>> +
>> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
>> +{
>> +    sPAPRVFIOTable *t;
>> +    struct tce_iommu_dma_map map = {
>> +        .argsz = sizeof(map),
>> +        .va = 0,
>> +        .dmaaddr = ioba,
>> +    };
>> +
>> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
>> +        if (t->liobn != liobn) {
>> +            continue;
>> +        }
>> +        if (tce) {
>> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & 
>> ~SPAPR_TCE_PAGE_MASK);
>> +            if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) {
>> +                fprintf(stderr, "TCE_MAP_DMA: %d\n", errno);
>> +                return H_PARAMETER;
>> +            }
>> +        } else {
>> +            if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) {
>> +                fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno);
>> +                return H_PARAMETER;
>> +            }
>> +        }
>> +        return H_SUCCESS;
>> +    }
>> +    return H_CONTINUE; /* positive non-zero value */
>> +}
>> +
> 
> I wish you could do this through a MemoryListener like we do on x86.


What is the point? Map the entire RAM to the guest? And It will still use our 
own IOMMU ioctls as it
is completely our IOMMU implementaiton.


>>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>                                target_ulong opcode, target_ulong *args)
>>  {
>> @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, 
>> sPAPREnvironment *spapr,
>>      if (0 >= ret) {
>>          return ret ? H_PARAMETER : H_SUCCESS;
>>      }
>> +    ret = put_tce_vfio(liobn, ioba, tce);
>> +    if (0 >= ret) {
>> +        return ret ? H_PARAMETER : H_SUCCESS;
>> +    }
>>  #ifdef DEBUG_TCE
>>      fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>>              "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx " ret=%d\n",
>> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
>> index 5f89003..3375c3f 100644
>> --- a/hw/spapr_pci.c
>> +++ b/hw/spapr_pci.c
>> @@ -29,6 +29,7 @@
>>  #include "pci_host.h"
>>  #include "hw/spapr.h"
>>  #include "hw/spapr_pci.h"
>> +#include "hw/vfio_pci.h"
>>  #include "exec-memory.h"
>>  #include <libfdt.h>
>>  #include "trace.h"
>> @@ -440,6 +441,12 @@ static void pci_spapr_set_irq(void *opaque, int 
>> irq_num, int level)
>>                   level);
>>  }
>>  
>> +static int pci_spapr_get_irq(void *opaque, int irq_num)
>> +{
>> +    sPAPRPHBState *phb = opaque;
>> +    return phb->lsi_table[irq_num].dt_irq;
>> +}
>> +
>>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>>                                unsigned size)
>>  {
>> @@ -567,7 +574,8 @@ static int spapr_phb_init(SysBusDevice *s)
>>  
>>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>>                             phb->busname ? phb->busname : phb->dtbusname,
>> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
>> +                           pci_spapr_set_irq, pci_spapr_get_irq,
>> +                           pci_spapr_map_irq, phb,
>>                             &phb->memspace, &phb->iospace,
>>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>>      phb->host_state.bus = bus;
>> @@ -596,6 +604,7 @@ static Property spapr_phb_properties[] = {
>>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
>> +    DEFINE_PROP_UINT8("vfio", sPAPRPHBState, is_vfio, 0),
>>      DEFINE_PROP_END_OF_LIST(),
>>  };
>>  
>> @@ -639,6 +648,18 @@ void spapr_create_phb(sPAPREnvironment *spapr,
>>  /* Finalize PCI setup, called when all devices are already created */
>>  int spapr_finalize_pci_setup(sPAPRPHBState *phb)
>>  {
>> +    if (phb->is_vfio) {
>> +        int fd = vfio_get_container_fd(phb->host_state.bus);
>> +
>> +        if (fd < 0) {
>> +            return fd;
>> +        }
>> +        spapr_vfio_init_dma(fd, phb->dma_liobn,
>> +                            &phb->dma_window_start,
>> +                            &phb->dma_window_size);
>> +        return 0;
>> +    }
>> +
>>      phb->dma_window_start = 0;
>>      phb->dma_window_size = 0x40000000;
>>      phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
>> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
>> index 3aae273..a4f031b 100644
>> --- a/hw/spapr_pci.h
>> +++ b/hw/spapr_pci.h
>> @@ -57,6 +57,8 @@ typedef struct sPAPRPHBState {
>>          int nvec;
>>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>>  
>> +    uint8_t is_vfio;
>> +
>>      QLIST_ENTRY(sPAPRPHBState) list;
>>  } sPAPRPHBState;
>>  
>> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
>> index 1ac287f..cc0b974 100644
>> --- a/hw/vfio_pci.c
>> +++ b/hw/vfio_pci.c
>> @@ -21,7 +21,6 @@
>>  #include <dirent.h>
>>  #include <stdio.h>
>>  #include <unistd.h>
>> -#include <sys/io.h>
>>  #include <sys/ioctl.h>
>>  #include <sys/mman.h>
>>  #include <sys/types.h>
>> @@ -44,6 +43,17 @@
>>  #include "vfio_pci.h"
>>  #include "linux-vfio.h"
>>  
>> +#ifndef TARGET_PPC64
>> +#include <sys/io.h>
>> +#define VFIO_IOMMU_EXTENSION    VFIO_X86_IOMMU
>> +#else
>> +#include "hw/pci_internals.h"
>> +#include "hw/xics.h"
>> +#include "hw/spapr.h"
>> +#define POWERPC_IOMMU           2
>> +#define VFIO_IOMMU_EXTENSION    POWERPC_IOMMU
>> +#endif
>> +
> 
> VFIO_IOMMU_EXTENSION never gets used, POWER_IOMMU is redefined below.


Yes, a bit messy. Was not sure about the name so I postponed it.


>>  //#define DEBUG_VFIO
>>  #ifdef DEBUG_VFIO
>>  #define DPRINTF(fmt, ...) \
>> @@ -235,6 +245,7 @@ struct vfio_irq_set_fd {
>>  
>>  static void vfio_enable_intx_kvm(VFIODevice *vdev)
>>  {
>> +#ifndef TARGET_PPC64
> 
> Why do you need this, aren't the extension checks sufficient for this to
> be a nop for you?


It uses ioapic_remove_gsi_eoi_notifier() so it needs some #ifdef anyway. And as 
we do not support
kvm_irqchip_in_kernel(), there is no point in fixing it and I disabled it all.
When we make eoi notifiers a platform independent, then yes, it will be nop.


>>  #ifdef CONFIG_KVM
>>      struct vfio_irq_set_fd irq_set_fd = {
>>      .irq_set = {
>> @@ -298,10 +309,12 @@ fail:
>>      qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
>>      vfio_unmask_intx(vdev);
>>  #endif
>> +#endif
>>  }
>>  
>>  static void vfio_disable_intx_kvm(VFIODevice *vdev)
>>  {
>> +#ifndef TARGET_PPC64
> 
> Same

Same :)

> 
>>  #ifdef CONFIG_KVM
>>      struct vfio_irq_set_fd irq_set_fd = {
>>      .irq_set = {
>> @@ -350,8 +363,10 @@ static void vfio_disable_intx_kvm(VFIODevice *vdev)
>>      DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n", __FUNCTION__,
>>              vdev->host.seg, vdev->host.bus, vdev->host.dev, 
>> vdev->host.func);
>>  #endif
>> +#endif
>>  }
>>  
>> +#ifndef TARGET_PPC64
>>  static void vfio_update_irq(Notifier *notify, void *data)
>>  {
>>      VFIODevice *vdev = container_of(notify, VFIODevice, intx.update_irq);
>> @@ -381,6 +396,7 @@ static void vfio_update_irq(Notifier *notify, void *data)
>>      /* Re-enable the interrupt in cased we missed an EOI */
>>      vfio_eoi(&vdev->intx.eoi, NULL);
>>  }
>> +#endif
>>  
>>  static int vfio_enable_intx(VFIODevice *vdev)
>>  {
>> @@ -404,10 +420,14 @@ static int vfio_enable_intx(VFIODevice *vdev)
>>      vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
>>      vdev->intx.irq = pci_get_irq(&vdev->pdev, vdev->intx.pin);
>>      vdev->intx.eoi.notify = vfio_eoi;
>> +#ifndef TARGET_PPC64
>>      ioapic_add_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
> 
> This is really only a place holder for x86 too, I don't think my eoi
> notifier as written is acceptable upstream.  We really need some common
> infrastructure here.  I'm hoping to get the kvm acceleration in place
> which would make vfio usable on x86 with kvm (the common case), then
> work towards a generic eoi notifier.
> 
>>  
>>      vdev->intx.update_irq.notify = vfio_update_irq;
>>      pci_add_irq_update_notifier(&vdev->pdev, &vdev->intx.update_irq);
> 
> Can't you stub this out to make it safe to do on POWER too?


I could even simply enable it (not sure if it is going to be called ever though 
but anyway) once we
get unified eoi notifiers.


>> +#else
>> +    xics_add_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
>> +#endif
>>  
>>      if (event_notifier_init(&vdev->intx.interrupt, 0)) {
>>          error_report("vfio: Error: event_notifier_init failed\n");
>> @@ -440,8 +460,12 @@ static void vfio_disable_intx(VFIODevice *vdev)
>>      vfio_disable_intx_kvm(vdev);
>>      vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
>>  
>> +#ifndef TARGET_PPC64
>>      pci_remove_irq_update_notifier(&vdev->intx.update_irq);
>>      ioapic_remove_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
>> +#else
>> +    xics_remove_eoi_notifier(&vdev->intx.eoi);
>> +#endif
>>  
>>      fd = event_notifier_get_fd(&vdev->intx.interrupt);
>>      qemu_set_fd_handler(fd, NULL, NULL, vdev);
>> @@ -543,7 +567,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
>>      }
>>  
>>      fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
>> -
>> +#ifndef TARGET_PPC64
>>      vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state, 
>> msg);
>>      if (vdev->msi_vectors[vector].virq < 0 || 
>>          kvm_irqchip_add_irqfd(kvm_state, fd,
>> @@ -551,7 +575,11 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
>>          qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>>                              &vdev->msi_vectors[vector]);
>>      }
>> -
>> +#else
>> +    vdev->msi_vectors[vector].virq = -1;
>> +    qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>> +                        &vdev->msi_vectors[vector]);
>> +#endif
> 
> This shouldn't be necessary once the abort is removed from
> kvm_irqchip_add_msi_route.  It'll be merged next time the kvm uq tree
> merges into qemu.


True, I just did not pick up your very last changes. Updating is always 
painful, and now it is even
worse then usual as pci_get_irq has been renamed to something else :) Will do 
though.



>>      if (vdev->nr_vectors < vector + 1) {
>>          int i;
>>  
>> @@ -692,6 +720,7 @@ retry:
>>          fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
>>  
>>          msg = msi_get_msg(&vdev->pdev, i);
>> +#ifndef TARGET_PPC64
>>          vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state, 
>> msg);
>>          if (vdev->msi_vectors[i].virq < 0 || 
>>              kvm_irqchip_add_irqfd(kvm_state, fd,
>> @@ -699,6 +728,12 @@ retry:
>>              qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>>                                  &vdev->msi_vectors[i]);
>>          }
>> +#else
>> +        vdev->msi_vectors[i].virq = -1;
>> +        qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>> +                            &vdev->msi_vectors[i]);
>> +        msg = msg;
>> +#endif
> 
> Same here
> 
>>      }
>>      
>>      ret = vfio_enable_vectors(vdev, false);
>> @@ -1581,6 +1616,25 @@ static int vfio_connect_container(VFIOGroup *group)
>>  
>>          memory_listener_register(&container->listener, get_system_memory());
>>  
>> +#define POWERPC_IOMMU           2
> 
> Assume this will go in the kernel vfio.h at some point.  You may want to
> pick a different name if there's a possibility of other powerpc iommu
> implementations... thus the crappy type1 name for x86.
> 
>> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) {
>> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
>> +        if (ret) {
>> +            error_report("vfio: failed to set group container: %s\n",
>> +                         strerror(errno));
>> +            g_free(container);
>> +            close(fd);
>> +            return -1;
>> +        }
>> +
>> +        ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU);
>> +        if (ret) {
>> +            error_report("vfio: failed to set iommu for container: %s\n",
>> +                         strerror(errno));
>> +            g_free(container);
>> +            close(fd);
>> +            return -1;
>> +        }
>>      } else {
>>          error_report("vfio: No available IOMMU models\n");
>>          g_free(container);
>> @@ -2005,3 +2059,19 @@ static void register_vfio_pci_dev_type(void)
>>  }
>>  
>>  type_init(register_vfio_pci_dev_type)
>> +
>> +int vfio_get_container_fd(struct PCIBus *pbus)
>> +{
>> +    BusChild *kid1st = QTAILQ_FIRST(&pbus->qbus.children);
>> +    VFIODevice *vdev1st;
>> +
>> +    if (!kid1st) {
>> +        printf("No device registered on PCI bus \"%s\", no DMA enabled\n",
>> +               pbus->qbus.name);
>> +        return -1;
>> +    }
>> +    vdev1st = container_of(kid1st->child, VFIODevice, pdev.qdev);
>> +
>> +    return vdev1st->group->container->fd;
>> +}
>> +
> 
> This is not a generic implementation.  x86 won't have all devices on a
> bus be vfio devices and even if it did, there's no guarantee they all
> belong to the same container.  This should probably at least take a
> PCIDevice and some kind of POWER specific code will need to know that
> the container is the same for the whole bus.  Thanks,


This is a workaround, true. x86 does not need this call at all. And on powerpc 
VFIO devices won't
share PCI bus with emulated devices. I just need some API to get this fd.

Well I probably can add MemoryListener for the DMA window and move all 
power-specific map/unmap code
to VFIO but it does not look much better. I would rather prefer separating 
IOMMU code from vfio_pci
somehow (more or less as it is now for powerpc). While doing it, we could think 
of the API to get
this fd which we need anyway in order to setup the DMA window which is per 
group (which QEMU does
not understand) but not per device.



> Alex
> 
>> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
>> index 226607c..0d13341 100644
>> --- a/hw/vfio_pci.h
>> +++ b/hw/vfio_pci.h
>> @@ -105,4 +105,6 @@ typedef struct VFIOGroup {
>>  #define VFIO_FLAG_IOMMU_SHARED_BIT 0
>>  #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
>>  
>> +int vfio_get_container_fd(struct PCIBus *pbus);
>> +
>>  #endif /* __VFIO_H__ */
> 
> 
> 


-- 
Alexey





reply via email to

[Prev in Thread] Current Thread [Next in Thread]