qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 7/8] spapr vfio: add spapr-pci-vfio-host-bridge


From: Alexander Graf
Subject: Re: [Qemu-devel] [PATCH 7/8] spapr vfio: add spapr-pci-vfio-host-bridge to support vfio
Date: Tue, 27 Aug 2013 13:08:01 +0200

On 07.08.2013, at 10:21, Alexey Kardashevskiy wrote:

> The patch adds a spapr-pci-vfio-host-bridge device type
> which is a PCI Host Bridge with VFIO support. The new device
> inherits from the spapr-pci-host-bridge device and adds
> the following properties:
>       iommu - IOMMU group ID which represents a Partitionable
>               Endpoint, QEMU/ppc64 uses a separate PHB for
>               an IOMMU group so the guest kernel has to have
>               PCI Domain support enabled.
>       forceaddr (optional, 0 by default) - forces QEMU to copy
>               device:function from the host address as
>               certain guest drivers expect devices to appear in
>               particular locations;
>       mf (optional, 0 by default) - forces multifunction bit for
>               the function #0 of a found device, only makes sense
>               for multifunction devices and only with the forceaddr
>               property set. It would not be required if there
>               was a way to know in advance whether a device is
>               multifunctional or not.
>       scan (optional, 1 by default) - if non-zero, the new PHB walks
>               through all non-bridge devices in the group and tries
>               adding them to the PHB; if zero, all devices in the group
>               have to be configured manually via the QEMU command line.
> 
> The patch also adds a VFIO IOMMU type support to the existing
> sPAPR TCE list in spapr_iommu.c.
> 
> The patch also uses the host kernel support of a new KVM_CAP_SPAPR_TCE_IOMMU
> capability and KVM_CREATE_SPAPR_TCE_IOMMU ioctl which let QEMU tell
> the host what LIOBN is used for an IOMMU group. This ioctl turns on real-mode
> handling of TCE requests, which increases actual throughput by 2.5-5 times.
> 
> Examples:
> 1) Scan and add all devices from IOMMU group with ID=1 to QEMU's PHB #6:
>       -device spapr-pci-vfio-host-bridge,id=DEVICENAME,iommu=1,index=6
> 
> 2) Configure and add 3 functions of a multifunction device to QEMU
> (the NEC PCI USB card is used as an example here):
>       -device spapr-pci-vfio-host-bridge,id=USB,iommu=4,scan=0,index=7 \
>       -device vfio-pci,host=4:0:1.0,addr=1.0,bus=USB,multifunction=true \
>       -device vfio-pci,host=4:0:1.1,addr=1.1,bus=USB \
>       -device vfio-pci,host=4:0:1.2,addr=1.2,bus=USB
> 
> Cc: David Gibson <address@hidden>
> Signed-off-by: Alexey Kardashevskiy <address@hidden>
> ---
> hw/ppc/spapr_iommu.c        | 176 ++++++++++++++++++++++++++++++++-----
> hw/ppc/spapr_pci.c          | 209 +++++++++++++++++++++++++++++++++++++++++---
> include/hw/pci-host/spapr.h |  12 +++
> include/hw/ppc/spapr.h      |  19 ++++
> target-ppc/kvm.c            |  33 +++++++
> target-ppc/kvm_ppc.h        |  12 +++
> trace-events                |   4 +
> 7 files changed, 429 insertions(+), 36 deletions(-)
> 
> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> index 22b09be..096b6a9 100644
> --- a/hw/ppc/spapr_iommu.c
> +++ b/hw/ppc/spapr_iommu.c
> @@ -16,12 +16,14 @@
>  * You should have received a copy of the GNU Lesser General Public
>  * License along with this library; if not, see 
> <http://www.gnu.org/licenses/>.
>  */
> +
> #include "hw/hw.h"
> #include "sysemu/kvm.h"
> #include "hw/qdev.h"
> #include "kvm_ppc.h"
> #include "sysemu/dma.h"
> #include "exec/address-spaces.h"
> +#include "trace.h"
> 
> #include "hw/ppc/spapr.h"
> 
> @@ -244,6 +246,74 @@ static target_ulong put_tce_emu(sPAPRTCETable *tcet, 
> target_ulong ioba,
>     return H_SUCCESS;
> }
> 
> +static IOMMUTLBEntry spapr_vfio_translate_iommu(MemoryRegion *iommu, hwaddr 
> addr)
> +{
> +    IOMMUTLBEntry entry;
> +    /* Must never be called */
> +    assert(0);
> +    return entry;
> +}
> +
> +static MemoryRegionIOMMUOps spapr_vfio_iommu_ops = {
> +    .translate = spapr_vfio_translate_iommu,
> +};
> +
> +static int spapr_tce_table_vfio_realize(DeviceState *dev)
> +{
> +    sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev);
> +
> +    memory_region_init_iommu(&tcet->iommu, NULL, &spapr_vfio_iommu_ops,
> +                             "iommu-vfio-spapr", (uint64_t)INT64_MAX+1);
> +
> +    QLIST_INSERT_HEAD(&spapr_tce_tables, tcet, list);
> +
> +    return 0;
> +}
> +
> +sPAPRTCETable *spapr_vfio_new_table(DeviceState *owner, uint32_t liobn,
> +                                    int group_fd)
> +{
> +    sPAPRTCETable *tcet;
> +    int fd;
> +
> +    if (spapr_tce_find_by_liobn(liobn)) {
> +        fprintf(stderr, "Attempted to create TCE table with duplicate"
> +                " LIOBN 0x%x\n", liobn);
> +        return NULL;
> +    }
> +
> +    fd = kvmppc_create_spapr_tce_iommu(liobn, group_fd);
> +
> +    tcet = SPAPR_TCE_TABLE(object_new(TYPE_SPAPR_TCE_TABLE_VFIO));
> +    tcet->liobn = liobn;
> +    tcet->fd = fd;
> +    object_property_add_child(OBJECT(owner), "tce-table", OBJECT(tcet), 
> NULL);
> +
> +    qdev_init_nofail(DEVICE(tcet));
> +
> +    return tcet;
> +}
> +
> +static target_ulong put_tce_vfio(sPAPRTCETable *tcet, target_ulong ioba,
> +                                 target_ulong tce)
> +{
> +    IOMMUTLBEntry entry;
> +
> +    entry.iova = ioba & ~SPAPR_TCE_PAGE_MASK;
> +    entry.translated_addr = tce & ~SPAPR_TCE_PAGE_MASK;
> +    entry.addr_mask = SPAPR_TCE_PAGE_MASK;
> +    entry.perm = 0;
> +    if ((tce & SPAPR_TCE_RO) == SPAPR_TCE_RO) {
> +        entry.perm |= IOMMU_RO;
> +    }
> +    if ((tce & SPAPR_TCE_WO) == SPAPR_TCE_WO) {
> +        entry.perm |= IOMMU_WO;
> +    }
> +    memory_region_notify_iommu(&tcet->iommu, entry);
> +
> +    return H_SUCCESS;
> +}
> +
> static target_ulong h_put_tce_indirect(PowerPCCPU *cpu,
>                                        sPAPREnvironment *spapr,
>                                        target_ulong opcode, target_ulong 
> *args)
> @@ -255,18 +325,36 @@ static target_ulong h_put_tce_indirect(PowerPCCPU *cpu,
>     target_ulong npages = args[3];
>     target_ulong ret = 0;
>     sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> +    sPAPRTCETableClass *info;
> 
> -    if (tcet) {
> -        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
> -            target_ulong tce = ldq_phys((tce_list & ~SPAPR_TCE_PAGE_MASK) +
> -                                        i * sizeof(target_ulong));
> -            ret = put_tce_emu(tcet, ioba, tce);
> -            if (ret) {
> -                break;
> -            }
> +    if (!tcet) {
> +        return H_PARAMETER;
> +    }
> +
> +    info = SPAPR_TCE_TABLE_GET_CLASS(tcet);
> +    if (!info || !info->put_tce) {
> +        return H_PARAMETER;
> +    }
> +
> +    if ((tce_list & SPAPR_TCE_PAGE_MASK) || (npages > 512)) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (liobn & 0xFFFFFFFF00000000ULL) {
> +        hcall_dprintf("spapr_vio_put_tce on out-of-boundsw LIOBN "
> +                      TARGET_FMT_lx "\n", liobn);
> +        return H_PARAMETER;
> +    }
> +
> +    for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
> +        target_ulong tce = ldq_phys((tce_list & ~SPAPR_TCE_PAGE_MASK) +
> +                                    i * sizeof(target_ulong));
> +        ret = info->put_tce(tcet, ioba, tce);
> +        if (ret) {
> +            break;
>         }
> -        return ret;
>     }
> +
> #ifdef DEBUG_TCE
>     fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>             "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
> @@ -274,7 +362,7 @@ static target_ulong h_put_tce_indirect(PowerPCCPU *cpu,
>             __func__, liobn, ioba, tce_list, ret);
> #endif
> 
> -    return H_PARAMETER;
> +    return ret;
> }
> 
> static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
> @@ -287,17 +375,30 @@ static target_ulong h_stuff_tce(PowerPCCPU *cpu, 
> sPAPREnvironment *spapr,
>     target_ulong npages = args[3];
>     target_ulong ret = 0;
>     sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> +    sPAPRTCETableClass *info;
> +
> +    if (!tcet) {
> +        return H_PARAMETER;
> +    }
> +
> +    info = SPAPR_TCE_TABLE_GET_CLASS(tcet);
> +    if (!info || !info->put_tce) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (liobn & 0xFFFFFFFF00000000ULL) {
> +        hcall_dprintf("spapr_vio_put_tce on out-of-boundsw LIOBN "
> +                      TARGET_FMT_lx "\n", liobn);
> +        return H_PARAMETER;
> +    }
> 
>     ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
> 
> -    if (tcet) {
> -        for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
> -            ret = put_tce_emu(tcet, ioba, tce_value);
> -            if (ret) {
> -                break;
> -            }
> +    for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) {
> +        ret = info->put_tce(tcet, ioba, tce_value);
> +        if (ret) {
> +            break;
>         }
> -        return ret;
>     }
> #ifdef DEBUG_TCE
>     fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
> @@ -306,7 +407,7 @@ static target_ulong h_stuff_tce(PowerPCCPU *cpu, 
> sPAPREnvironment *spapr,
>             __func__, liobn, ioba, tce_value, ret);
> #endif
> 
> -    return H_PARAMETER;
> +    return ret;
> }
> 
> static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr,
> @@ -316,12 +417,21 @@ static target_ulong h_put_tce(PowerPCCPU *cpu, 
> sPAPREnvironment *spapr,
>     target_ulong ioba = args[1];
>     target_ulong tce = args[2];
>     sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> +    target_ulong ret;
> +    sPAPRTCETableClass *info;
> +
> +    if (!tcet) {
> +        return H_PARAMETER;
> +    }
> +
> +    info = SPAPR_TCE_TABLE_GET_CLASS(tcet);
> +    if (!info || !info->put_tce) {
> +        return H_PARAMETER;
> +    }
> 
>     ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
> 
> -    if (tcet) {
> -        return put_tce_emu(tcet, ioba, tce);
> -    }
> +    ret = info->put_tce(tcet, ioba, tce);
> #ifdef DEBUG_TCE
>     fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>             "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx
> @@ -329,7 +439,7 @@ static target_ulong h_put_tce(PowerPCCPU *cpu, 
> sPAPREnvironment *spapr,
>             __func__, liobn, ioba, tce, ret);
> #endif
> 
> -    return H_PARAMETER;
> +    return ret;
> }
> 
> int spapr_dma_dt(void *fdt, int node_off, const char *propname,
> @@ -376,9 +486,12 @@ int spapr_tcet_dma_dt(void *fdt, int node_off, const 
> char *propname,
> static void spapr_tce_table_class_init(ObjectClass *klass, void *data)
> {
>     DeviceClass *dc = DEVICE_CLASS(klass);
> +    sPAPRTCETableClass *k = SPAPR_TCE_TABLE_CLASS(klass);
> +
>     dc->vmsd = &vmstate_spapr_tce_table;
>     dc->init = spapr_tce_table_realize;
>     dc->reset = spapr_tce_reset;
> +    k->put_tce = put_tce_emu;
> 
>     QLIST_INIT(&spapr_tce_tables);
> 
> @@ -393,12 +506,31 @@ static TypeInfo spapr_tce_table_info = {
>     .parent = TYPE_DEVICE,
>     .instance_size = sizeof(sPAPRTCETable),
>     .class_init = spapr_tce_table_class_init,
> +    .class_size = sizeof(sPAPRTCETableClass),
>     .instance_finalize = spapr_tce_table_finalize,
> };
> 
> +static void spapr_tce_table_vfio_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    sPAPRTCETableClass *k = SPAPR_TCE_TABLE_CLASS(klass);
> +
> +    dc->init = spapr_tce_table_vfio_realize;
> +    k->put_tce = put_tce_vfio;
> +}
> +
> +static TypeInfo spapr_tce_table_vfio_info = {
> +    .name = TYPE_SPAPR_TCE_TABLE_VFIO,
> +    .parent = TYPE_SPAPR_TCE_TABLE,
> +    .instance_size = sizeof(sPAPRTCETable),
> +    .class_init = spapr_tce_table_vfio_class_init,
> +    .class_size = sizeof(sPAPRTCETableClass),
> +};
> +
> static void register_types(void)
> {
>     type_register_static(&spapr_tce_table_info);
> +    type_register_static(&spapr_tce_table_vfio_info);
> }
> 
> type_init(register_types);
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 869ca43..3f37cac 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c

I think we should move the vfio phb into a separate file and make it a 
proper subclass, without even the chance of randomly calling normal spapr pci 
functions ;).

Andreas, could you please check through this and see if you can spot a way to 
isolate it out?


Alex

> @@ -22,6 +22,9 @@
>  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>  * THE SOFTWARE.
>  */
> +#include <sys/types.h>
> +#include <dirent.h>
> +
> #include "hw/hw.h"
> #include "hw/pci/pci.h"
> #include "hw/pci/msi.h"
> @@ -32,6 +35,7 @@
> #include "exec/address-spaces.h"
> #include <libfdt.h>
> #include "trace.h"
> +#include "hw/misc/vfio.h"
> 
> #include "hw/pci/pci_bus.h"
> 
> @@ -496,7 +500,11 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, 
> void *opaque, int devfn)
>     return &phb->iommu_as;
> }
> 
> -static int spapr_phb_init(SysBusDevice *s)
> +/*
> + * This is the common initialization part for both emulated and VFIO PHBs
> + * which includes everything but DMA and device scan (optional, VFIO only).
> + */
> +static int _spapr_phb_init(SysBusDevice *s)
> {
>     DeviceState *dev = DEVICE(s);
>     sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s);
> @@ -610,19 +618,6 @@ static int spapr_phb_init(SysBusDevice *s)
>                            PCI_DEVFN(0, 0), PCI_NUM_PINS, TYPE_PCI_BUS);
>     phb->bus = bus;
> 
> -    sphb->dma_window_start = 0;
> -    sphb->dma_window_size = 0x40000000;
> -    sphb->tcet = spapr_tce_new_table(dev, sphb->dma_liobn,
> -                                     sphb->dma_window_size);
> -    if (!sphb->tcet) {
> -        fprintf(stderr, "Unable to create TCE table for %s\n", 
> sphb->dtbusname);
> -        return -1;
> -    }
> -    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
> -                       sphb->dtbusname);
> -
> -    pci_setup_iommu(bus, spapr_pci_dma_iommu, sphb);
> -
>     QLIST_INSERT_HEAD(&spapr->phbs, sphb, list);
> 
>     /* Initialize the LSI table */
> @@ -641,6 +636,30 @@ static int spapr_phb_init(SysBusDevice *s)
>     return 0;
> }
> 
> +static int spapr_phb_init(SysBusDevice *s)
> +{
> +    sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s);
> +    int ret;
> +
> +    ret = _spapr_phb_init(s);
> +    if (ret)
> +        return ret;
> +
> +    sphb->dma_window_start = 0;
> +    sphb->dma_window_size = 0x40000000;
> +    sphb->tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn,
> +                                     sphb->dma_window_size);
> +    if (!sphb->tcet) {
> +        fprintf(stderr, "Unable to create TCE table for %s\n", 
> sphb->dtbusname);
> +        return -1;
> +    }
> +    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
> +                       sphb->dtbusname);
> +    pci_setup_iommu(sphb->parent_obj.bus, spapr_pci_dma_iommu, sphb);
> +
> +    return 0;
> +}
> +
> static void spapr_phb_reset(DeviceState *qdev)
> {
>     SysBusDevice *s = SYS_BUS_DEVICE(qdev);
> @@ -749,6 +768,163 @@ PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, 
> int index)
>     return PCI_HOST_BRIDGE(dev);
> }
> 
> +/* sPAPR VFIO */
> +static Property spapr_phb_vfio_properties[] = {
> +    DEFINE_PROP_INT32("iommu", sPAPRPHBVFIOState, iommugroupid, -1),
> +    DEFINE_PROP_UINT8("scan", sPAPRPHBVFIOState, scan, 1),
> +    DEFINE_PROP_UINT8("mf", sPAPRPHBVFIOState, enable_multifunction, 0),
> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBVFIOState, force_addr, 0),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static int spapr_pci_vfio_scan(sPAPRPHBVFIOState *svphb)
> +{
> +    PCIHostState *phb = PCI_HOST_BRIDGE(svphb);
> +    char *iommupath;
> +    DIR *dirp;
> +    struct dirent *entry;
> +
> +    if (!svphb->scan) {
> +        trace_spapr_pci("autoscan disabled for ", svphb->phb.dtbusname);
> +        return 0;
> +    }
> +
> +    iommupath = g_strdup_printf("/sys/kernel/iommu_groups/%d/devices/",
> +                                svphb->iommugroupid);
> +    if (!iommupath) {
> +        return -ENOMEM;
> +    }
> +
> +    dirp = opendir(iommupath);
> +    if (!dirp) {
> +        fprintf(stderr, "failed to scan group=%d\n", svphb->iommugroupid);
> +        g_free(iommupath);
> +        return -1;
> +    }
> +
> +    while ((entry = readdir(dirp)) != NULL) {
> +        Error *err = NULL;
> +        char *tmp;
> +        FILE *deviceclassfile;
> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
> +        char addr[32];
> +        DeviceState *dev;
> +
> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
> +                   &domainid, &busid, &devid, &fnid) != 4) {
> +            continue;
> +        }
> +
> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
> +        trace_spapr_pci("Reading device class from ", tmp);
> +
> +        deviceclassfile = fopen(tmp, "r");
> +        if (deviceclassfile) {
> +            int ret = fscanf(deviceclassfile, "%x", &deviceclass);
> +            fclose(deviceclassfile);
> +            if (ret != 1) {
> +                continue;
> +            }
> +        }
> +        g_free(tmp);
> +
> +        if (!deviceclass) {
> +            continue;
> +        }
> +        if ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8)) {
> +            /* Skip bridges */
> +            continue;
> +        }
> +        trace_spapr_pci("Creating device from ", entry->d_name);
> +
> +        dev = qdev_create(&phb->bus->qbus, "vfio-pci");
> +        if (!dev) {
> +            fprintf(stderr, "failed to create vfio-pci\n");
> +            continue;
> +        }
> +        qdev_prop_parse(dev, "host", entry->d_name, &err);
> +        if (err != NULL) {
> +            continue;
> +        }
> +        if (svphb->force_addr) {
> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
> +            err = NULL;
> +            qdev_prop_parse(dev, "addr", addr, &err);
> +            if (err != NULL) {
> +                continue;
> +            }
> +        }
> +        if (svphb->enable_multifunction) {
> +            qdev_prop_set_bit(dev, "multifunction", 1);
> +        }
> +        qdev_init_nofail(dev);
> +    }
> +    closedir(dirp);
> +    g_free(iommupath);
> +
> +    return 0;
> +}
> +
> +static int spapr_phb_vfio_init(SysBusDevice *s)
> +{
> +    sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(s);
> +    sPAPRPHBState *sphb = &svphb->phb;
> +    struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
> +    int ret, group_fd;
> +
> +    if (svphb->iommugroupid == -1) {
> +        fprintf(stderr, "Wrong IOMMU group ID %d\n", svphb->iommugroupid);
> +        return -1;
> +    }
> +
> +    ret = _spapr_phb_init(s);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    ret = vfio_container_spapr_get_info(&svphb->phb.iommu_as,
> +                                        svphb->iommugroupid,
> +                                        &info, &group_fd);
> +    if (ret)
> +        return ret;
> +
> +    svphb->phb.dma_window_start = info.dma32_window_start;
> +    svphb->phb.dma_window_size = info.dma32_window_size;
> +    svphb->phb.tcet = spapr_vfio_new_table(DEVICE(sphb), 
> svphb->phb.dma_liobn,
> +                                           group_fd);
> +
> +    address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet),
> +                       sphb->dtbusname);
> +    pci_setup_iommu(sphb->parent_obj.bus, spapr_pci_dma_iommu, sphb);
> +
> +    ret = spapr_pci_vfio_scan(svphb);
> +
> +    return ret;
> +}
> +
> +static void spapr_phb_vfio_reset(DeviceState *qdev)
> +{
> +    /* Do nothing */
> +}
> +
> +static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data)
> +{
> +    SysBusDeviceClass *sdc = SYS_BUS_DEVICE_CLASS(klass);
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    sdc->init = spapr_phb_vfio_init;
> +    dc->props = spapr_phb_vfio_properties;
> +    dc->reset = spapr_phb_vfio_reset;
> +    dc->vmsd = &vmstate_spapr_pci;
> +}
> +
> +static const TypeInfo spapr_phb_vfio_info = {
> +    .name          = TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE,
> +    .parent        = TYPE_SPAPR_PCI_HOST_BRIDGE,
> +    .instance_size = sizeof(sPAPRPHBVFIOState),
> +    .class_init    = spapr_phb_vfio_class_init,
> +};
> +
> /* Macros to operate with address in OF binding to PCI */
> #define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
> #define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
> @@ -839,6 +1015,10 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>     _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
>                      sizeof(interrupt_map)));
> 
> +    if (!phb->dma_window_size) {
> +        fprintf(stderr, "Unexpected error: DMA window is zero, exiting\n");
> +        exit(1);
> +    }
>     spapr_dma_dt(fdt, bus_off, "ibm,dma-window",
>                  phb->dma_liobn, phb->dma_window_start,
>                  phb->dma_window_size);
> @@ -862,6 +1042,7 @@ void spapr_pci_rtas_init(void)
> static void spapr_pci_register_types(void)
> {
>     type_register_static(&spapr_phb_info);
> +    type_register_static(&spapr_phb_vfio_info);
> }
> 
> type_init(spapr_pci_register_types)
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index 970b4a9..fab18e5 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -30,10 +30,14 @@
> #define SPAPR_MSIX_MAX_DEVS 32
> 
> #define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge"
> +#define TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE "spapr-pci-vfio-host-bridge"
> 
> #define SPAPR_PCI_HOST_BRIDGE(obj) \
>     OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE)
> 
> +#define SPAPR_PCI_VFIO_HOST_BRIDGE(obj) \
> +    OBJECT_CHECK(sPAPRPHBVFIOState, (obj), TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE)
> +
> typedef struct sPAPRPHBState {
>     PCIHostState parent_obj;
> 
> @@ -64,6 +68,14 @@ typedef struct sPAPRPHBState {
>     QLIST_ENTRY(sPAPRPHBState) list;
> } sPAPRPHBState;
> 
> +typedef struct sPAPRPHBVFIOState {
> +    sPAPRPHBState phb;
> +
> +    struct VFIOContainer *container;
> +    int32_t iommugroupid;
> +    uint8_t scan, enable_multifunction, force_addr;
> +} sPAPRPHBVFIOState;
> +
> #define SPAPR_PCI_BASE_BUID          0x800000020000000ULL
> 
> #define SPAPR_PCI_WINDOW_BASE        0x10000000000ULL
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 2dc3d06..a64e58a 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -353,12 +353,29 @@ int spapr_rtas_device_tree_setup(void *fdt, hwaddr 
> rtas_addr,
> 
> #define RTAS_ERROR_LOG_MAX      2048
> 
> +typedef struct sPAPRTCETableClass sPAPRTCETableClass;
> typedef struct sPAPRTCETable sPAPRTCETable;
> 
> #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table"
> #define SPAPR_TCE_TABLE(obj) \
>     OBJECT_CHECK(sPAPRTCETable, (obj), TYPE_SPAPR_TCE_TABLE)
> 
> +#define TYPE_SPAPR_TCE_TABLE_VFIO "spapr-tce-table-vfio"
> +#define SPAPR_TCE_TABLE_VFIO(obj) \
> +    OBJECT_CHECK(sPAPRTCETable, (obj), TYPE_SPAPR_TCE_TABLE_VFIO)
> +
> +#define SPAPR_TCE_TABLE_CLASS(klass) \
> +     OBJECT_CLASS_CHECK(sPAPRTCETableClass, (klass), TYPE_SPAPR_TCE_TABLE)
> +#define SPAPR_TCE_TABLE_GET_CLASS(obj) \
> +     OBJECT_GET_CLASS(sPAPRTCETableClass, (obj), TYPE_SPAPR_TCE_TABLE)
> +
> +struct sPAPRTCETableClass {
> +    DeviceClass parent_class;
> +
> +    target_ulong (*put_tce)(sPAPRTCETable *tcet, target_ulong ioba,
> +                            target_ulong tce);
> +};
> +
> struct sPAPRTCETable {
>     DeviceState parent;
>     uint32_t liobn;
> @@ -375,6 +392,8 @@ void spapr_events_init(sPAPREnvironment *spapr);
> void spapr_events_fdt_skel(void *fdt, uint32_t epow_irq);
> sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn,
>                                    size_t window_size);
> +sPAPRTCETable *spapr_vfio_new_table(DeviceState *owner, uint32_t liobn,
> +                                    int group_fd);
> MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet);
> void spapr_tce_set_bypass(sPAPRTCETable *tcet, bool bypass);
> int spapr_dma_dt(void *fdt, int node_off, const char *propname,
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 3d0e398..eb59d7d 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -61,6 +61,7 @@ static int cap_ppc_smt;
> static int cap_ppc_rma;
> static int cap_spapr_tce;
> static int cap_spapr_multitce;
> +static int cap_spapr_tce_iommu;
> static int cap_hior;
> static int cap_one_reg;
> static int cap_epr;
> @@ -98,6 +99,7 @@ int kvm_arch_init(KVMState *s)
>     cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA);
>     cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
>     cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);
> +    cap_spapr_tce_iommu = kvm_check_extension(s, KVM_CAP_SPAPR_TCE_IOMMU);
>     cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
>     cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
>     cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
> @@ -1669,6 +1671,37 @@ int kvmppc_remove_spapr_tce(void *table, int fd, 
> uint32_t window_size)
>     return 0;
> }
> 
> +int kvmppc_create_spapr_tce_iommu(uint32_t liobn, int group_fd)
> +{
> +    int fd = 0;
> +    struct kvm_create_spapr_tce_iommu args = {
> +        .liobn = liobn,
> +        .fd = group_fd
> +    };
> +
> +    if (!kvm_enabled() || !cap_spapr_tce_iommu) {
> +        fprintf(stderr, "KVM VFIO: TCE IOMMU capability is not present, DMA 
> may be slow\n");
> +        return -1;
> +    }
> +
> +    fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE_IOMMU, &args);
> +    if (fd < 0) {
> +        fprintf(stderr, "KVM VFIO: Failed to create TCE table for liobn 
> 0x%x, ret = %d, DMA may be slow\n",
> +                liobn, fd);
> +    }
> +
> +    return fd;
> +}
> +
> +int kvmppc_remove_spapr_tce_iommu(int fd)
> +{
> +    if (fd < 0) {
> +        return -1;
> +    }
> +
> +    return close(fd);
> +}
> +
> int kvmppc_reset_htab(int shift_hint)
> {
>     uint32_t shift = shift_hint;
> diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
> index a2a903f..a223e63 100644
> --- a/target-ppc/kvm_ppc.h
> +++ b/target-ppc/kvm_ppc.h
> @@ -34,6 +34,8 @@ off_t kvmppc_alloc_rma(const char *name, MemoryRegion 
> *sysmem);
> bool kvmppc_spapr_use_multitce(void);
> void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd);
> int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size);
> +int kvmppc_create_spapr_tce_iommu(uint32_t liobn, int group_fd);
> +int kvmppc_remove_spapr_tce_iommu(int fd);
> int kvmppc_reset_htab(int shift_hint);
> uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift);
> #endif /* !CONFIG_USER_ONLY */
> @@ -144,6 +146,16 @@ static inline int kvmppc_remove_spapr_tce(void *table, 
> int pfd,
>     return -1;
> }
> 
> +static inline int kvmppc_create_spapr_tce_iommu(uint32_t liobn, uint32_t 
> iommu_id)
> +{
> +    return -1;
> +}
> +
> +static inline int kvmppc_remove_spapr_tce_iommu(int fd)
> +{
> +    return -1;
> +}
> +
> static inline int kvmppc_reset_htab(int shift_hint)
> {
>     return -1;
> diff --git a/trace-events b/trace-events
> index 3856b5c..d1e54ad 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1113,6 +1113,7 @@ qxl_render_guest_primary_resized(int32_t width, int32_t 
> height, int32_t stride,
> qxl_render_update_area_done(void *cookie) "%p"
> 
> # hw/ppc/spapr_pci.c
> +spapr_pci(const char *msg1, const char *msg2) "%s%s"
> spapr_pci_msi(const char *msg, uint32_t n, uint32_t ca) "%s (device#%d, 
> cfg=%x)"
> spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) 
> "dev\"%s\" vector %u, addr=%"PRIx64
> spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req) "func %u, 
> requested %u"
> @@ -1133,6 +1134,9 @@ xics_ics_write_xive(int nr, int srcno, int server, 
> uint8_t priority) "ics_write_
> xics_ics_reject(int nr, int srcno) "reject irq %#x [src %d]"
> xics_ics_eoi(int nr) "ics_eoi: irq %#x"
> 
> +# hw/ppc/spapr_iommu.c
> +spapr_iommu(const char *op, uint32_t liobn, uint64_t ioba, uint64_t tce, int 
> ret) "%s %x ioba=%"PRIx64" tce=%"PRIx64" ret=%d"
> +
> # util/hbitmap.c
> hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned 
> long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx"
> hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, 
> uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64
> -- 
> 1.8.3.2
> 




reply via email to

[Prev in Thread] Current Thread [Next in Thread]