[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [RFC v2 3/6] vfio: add vfio-platform support
From: |
Alex Williamson |
Subject: |
Re: [Qemu-devel] [RFC v2 3/6] vfio: add vfio-platform support |
Date: |
Fri, 18 Apr 2014 08:54:21 -0600 |
On Wed, 2014-04-09 at 16:33 +0100, Eric Auger wrote:
> From: Kim Phillips <address@hidden>
>
> Functions which PCI and platform device support share are moved
> into common.c. The common vfio_{get,put}_group() get an additional
> argument, a pointer to a vfio_reset_handler(), which is passed on to
> qemu_register_reset, but only if it exists (the platform device code
> currently passes a NULL as its reset_handler).
>
> For the platform device code, we basically use SysBusDevice
> instead of PCIDevice. Since realize() returns void, unlike
> PCIDevice's initfn, error codes are moved into the
> error message text with %m.
>
> Currently only MMIO access is supported.
>
> The perceived path for future QEMU development is:
>
> - add support for interrupts
> - verify and test platform dev unmap path
> - test existing PCI path for any regressions
> - add support for creating platform devices on the qemu command line
> - currently device address specification is hardcoded for test
> development on Calxeda Midway's fff51000.ethernet device
> - reset is not supported and registration of reset functions is
> bypassed for platform devices.
> - there is no standard means of resetting a platform device,
> unsure if it suffices to be handled at device--VFIO binding time
>
> Signed-off-by: Kim Phillips <address@hidden>
>
> [1] http://www.spinics.net/lists/kvm-arm/msg08195.html
> ---
> hw/vfio/Makefile.objs | 2 +
> hw/vfio/common.c | 486
> ++++++++++++++++++++++++++++++++++++++++++++++++++
> hw/vfio/pci.c | 480 ++-----------------------------------------------
> hw/vfio/platform.c | 381 +++++++++++++++++++++++++++++++++++++++
> hw/vfio/vfio-common.h | 55 ++++++
> 5 files changed, 937 insertions(+), 467 deletions(-)
> create mode 100644 hw/vfio/common.c
> create mode 100644 hw/vfio/platform.c
> create mode 100644 hw/vfio/vfio-common.h
>
> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
> index 31c7dab..c5c76fe 100644
> --- a/hw/vfio/Makefile.objs
> +++ b/hw/vfio/Makefile.objs
> @@ -1,3 +1,5 @@
> ifeq ($(CONFIG_LINUX), y)
> +obj-$(CONFIG_SOFTMMU) += common.o
> obj-$(CONFIG_PCI) += pci.o
> +obj-$(CONFIG_SOFTMMU) += platform.o
> endif
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> new file mode 100644
> index 0000000..9d1f723
> --- /dev/null
> +++ b/hw/vfio/common.c
> @@ -0,0 +1,486 @@
> +/*
> + * vfio based device assignment support
> + *
> + * Copyright Red Hat, Inc. 2012
> + *
> + * Authors:
> + * Alex Williamson <address@hidden>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2. See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on qemu-kvm device-assignment:
> + * Adapted for KVM by Qumranet.
> + * Copyright (c) 2007, Neocleus, Alex Novik (address@hidden)
> + * Copyright (c) 2007, Neocleus, Guy Zana (address@hidden)
> + * Copyright (C) 2008, Qumranet, Amit Shah (address@hidden)
> + * Copyright (C) 2008, Red Hat, Amit Shah (address@hidden)
> + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (address@hidden)
> + */
> +
> +#include <dirent.h>
> +#include <linux/vfio.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +#include <unistd.h>
> +
> +#include "config.h"
> +#include "exec/address-spaces.h"
> +#include "exec/memory.h"
> +#include "hw/pci/msi.h"
> +#include "hw/pci/msix.h"
> +#include "hw/pci/pci.h"
I expect these pci includes aren't really needed
> +#include "qemu-common.h"
> +#include "qemu/error-report.h"
> +#include "qemu/event_notifier.h"
> +#include "qemu/queue.h"
> +#include "qemu/range.h"
> +#include "sysemu/kvm.h"
> +#include "sysemu/sysemu.h"
> +
> +#include "vfio-common.h"
> +
> +#define DEBUG_VFIO
> +#ifdef DEBUG_VFIO
> +#define DPRINTF(fmt, ...) \
> + do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
> +#else
> +#define DPRINTF(fmt, ...) \
> + do { } while (0)
> +#endif
DEBUG_VFIO should probably be in vfio-common.h
> +
> +static QLIST_HEAD(, VFIOContainer)
> + container_list = QLIST_HEAD_INITIALIZER(container_list);
> +
> +QLIST_HEAD(, VFIOGroup)
> + group_list = QLIST_HEAD_INITIALIZER(group_list);
> +
> +
> +struct VFIODevice;
I don't see where this is needed.
> +
> +#ifdef CONFIG_KVM
> +/*
> + * We have a single VFIO pseudo device per KVM VM. Once created it lives
> + * for the life of the VM. Closing the file descriptor only drops our
> + * reference to it and the device's reference to kvm. Therefore once
> + * initialized, this file descriptor is only released on QEMU exit and
> + * we'll re-use it should another vfio device be attached before then.
> + */
> +static int vfio_kvm_device_fd = -1;
> +#endif
> +
> +/*
> + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
> + */
> +static int vfio_dma_unmap(VFIOContainer *container,
> + hwaddr iova, ram_addr_t size)
> +{
> + struct vfio_iommu_type1_dma_unmap unmap = {
> + .argsz = sizeof(unmap),
> + .flags = 0,
> + .iova = iova,
> + .size = size,
> + };
> +
> + if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
> + DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
> + return -errno;
> + }
> +
> + return 0;
> +}
> +
> +static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> + ram_addr_t size, void *vaddr, bool readonly)
> +{
> + struct vfio_iommu_type1_dma_map map = {
> + .argsz = sizeof(map),
> + .flags = VFIO_DMA_MAP_FLAG_READ,
> + .vaddr = (__u64)(uintptr_t)vaddr,
> + .iova = iova,
> + .size = size,
> + };
> +
> + if (!readonly) {
> + map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
> + }
> +
> + /*
> + * Try the mapping, if it fails with EBUSY, unmap the region and try
> + * again. This shouldn't be necessary, but we sometimes see it in
> + * the VGA ROM space.
> + */
> + if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
> + (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
> + ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
> + return 0;
> + }
> +
> + DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
> + return -errno;
> +}
> +
> +static bool vfio_listener_skipped_section(MemoryRegionSection *section)
> +{
> + return !memory_region_is_ram(section->mr) ||
> + /*
> + * Sizing an enabled 64-bit BAR can cause spurious mappings to
> + * addresses in the upper part of the 64-bit address space. These
> + * are never accessed by the CPU and beyond the address width of
> + * some IOMMU hardware. TODO: VFIO should tell us the IOMMU
> width.
> + */
> + section->offset_within_address_space & (1ULL << 63);
> +}
> +
> +static void vfio_listener_region_add(MemoryListener *listener,
> + MemoryRegionSection *section)
> +{
> + VFIOContainer *container = container_of(listener, VFIOContainer,
> + iommu_data.type1.listener);
> + hwaddr iova, end;
> + void *vaddr;
> + int ret;
> +
> + assert(!memory_region_is_iommu(section->mr));
> +
> + if (vfio_listener_skipped_section(section)) {
> + DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
> + section->offset_within_address_space,
> + section->offset_within_address_space +
> + int128_get64(int128_sub(section->size, int128_one())));
> + return;
> + }
> +
> + if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK)
> !=
> + (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> + error_report("%s received unaligned region", __func__);
> + return;
> + }
> +
> + iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> + end = (section->offset_within_address_space +
> int128_get64(section->size)) &
> + TARGET_PAGE_MASK;
> +
> + if (iova >= end) {
> + return;
> + }
> +
> + vaddr = memory_region_get_ram_ptr(section->mr) +
> + section->offset_within_region +
> + (iova - section->offset_within_address_space);
> +
> + DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
> + iova, end - 1, vaddr);
> +
> + memory_region_ref(section->mr);
> + ret = vfio_dma_map(container, iova, end - iova, vaddr,
> section->readonly);
> + if (ret) {
> + error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
> + "0x%"HWADDR_PRIx", %p) = %d (%m)",
> + container, iova, end - iova, vaddr, ret);
> +
> + /*
> + * On the initfn path, store the first error in the container so we
> + * can gracefully fail. Runtime, there's not much we can do other
> + * than throw a hardware error.
> + */
> + if (!container->iommu_data.type1.initialized) {
> + if (!container->iommu_data.type1.error) {
> + container->iommu_data.type1.error = ret;
> + }
> + } else {
> + hw_error("vfio: DMA mapping failed, unable to continue");
> + }
> + }
> +}
> +
> +static void vfio_listener_region_del(MemoryListener *listener,
> + MemoryRegionSection *section)
> +{
> + VFIOContainer *container = container_of(listener, VFIOContainer,
> + iommu_data.type1.listener);
> + hwaddr iova, end;
> + int ret;
> +
> + if (vfio_listener_skipped_section(section)) {
> + DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
> + section->offset_within_address_space,
> + section->offset_within_address_space +
> + int128_get64(int128_sub(section->size, int128_one())));
> + return;
> + }
> +
> + if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK)
> !=
> + (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> + error_report("%s received unaligned region", __func__);
> + return;
> + }
> +
> + iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> + end = (section->offset_within_address_space +
> int128_get64(section->size)) &
> + TARGET_PAGE_MASK;
> +
> + if (iova >= end) {
> + return;
> + }
> +
> + DPRINTF("region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
> + iova, end - 1);
> +
> + ret = vfio_dma_unmap(container, iova, end - iova);
> + memory_region_unref(section->mr);
> + if (ret) {
> + error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> + "0x%"HWADDR_PRIx") = %d (%m)",
> + container, iova, end - iova, ret);
> + }
> +}
> +
> +static MemoryListener vfio_memory_listener = {
> + .region_add = vfio_listener_region_add,
> + .region_del = vfio_listener_region_del,
> +};
> +
> +static void vfio_listener_release(VFIOContainer *container)
> +{
> + memory_listener_unregister(&container->iommu_data.type1.listener);
> +}
> +
> +static void vfio_kvm_device_add_group(VFIOGroup *group)
> +{
> +#ifdef CONFIG_KVM
> + struct kvm_device_attr attr = {
> + .group = KVM_DEV_VFIO_GROUP,
> + .attr = KVM_DEV_VFIO_GROUP_ADD,
> + .addr = (uint64_t)(unsigned long)&group->fd,
> + };
> +
> + if (!kvm_enabled()) {
> + return;
> + }
> +
> + if (vfio_kvm_device_fd < 0) {
> + struct kvm_create_device cd = {
> + .type = KVM_DEV_TYPE_VFIO,
> + };
> +
> + if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
> + DPRINTF("KVM_CREATE_DEVICE: %m\n");
> + return;
> + }
> +
> + vfio_kvm_device_fd = cd.fd;
> + }
> +
> + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
> + error_report("Failed to add group %d to KVM VFIO device: %m",
> + group->groupid);
> + }
> +#endif
> +}
> +
> +static void vfio_kvm_device_del_group(VFIOGroup *group)
> +{
> +#ifdef CONFIG_KVM
> + struct kvm_device_attr attr = {
> + .group = KVM_DEV_VFIO_GROUP,
> + .attr = KVM_DEV_VFIO_GROUP_DEL,
> + .addr = (uint64_t)(unsigned long)&group->fd,
> + };
> +
> + if (vfio_kvm_device_fd < 0) {
> + return;
> + }
> +
> + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
> + error_report("Failed to remove group %d from KVM VFIO device: %m",
> + group->groupid);
> + }
> +#endif
> +}
> +
> +static int vfio_connect_container(VFIOGroup *group)
> +{
> + VFIOContainer *container;
> + int ret, fd;
> +
> + if (group->container) {
> + return 0;
> + }
> +
> + QLIST_FOREACH(container, &container_list, next) {
> + if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
> + group->container = container;
> + QLIST_INSERT_HEAD(&container->group_list, group, container_next);
> + return 0;
> + }
> + }
> +
> + fd = qemu_open("/dev/vfio/vfio", O_RDWR);
> + if (fd < 0) {
> + error_report("vfio: failed to open /dev/vfio/vfio: %m");
> + return -errno;
> + }
> +
> + ret = ioctl(fd, VFIO_GET_API_VERSION);
> + if (ret != VFIO_API_VERSION) {
> + error_report("vfio: supported vfio version: %d, "
> + "reported version: %d", VFIO_API_VERSION, ret);
> + close(fd);
> + return -EINVAL;
> + }
> +
> + container = g_malloc0(sizeof(*container));
> + container->fd = fd;
> +
> + if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
> + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> + if (ret) {
> + error_report("vfio: failed to set group container: %m");
> + g_free(container);
> + close(fd);
> + return -errno;
> + }
> +
> + ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
> + if (ret) {
> + error_report("vfio: failed to set iommu for container: %m");
> + g_free(container);
> + close(fd);
> + return -errno;
> + }
> +
> + container->iommu_data.type1.listener = vfio_memory_listener;
> + container->iommu_data.release = vfio_listener_release;
> +
> + memory_listener_register(&container->iommu_data.type1.listener,
> + &address_space_memory);
> +
> + if (container->iommu_data.type1.error) {
> + ret = container->iommu_data.type1.error;
> + vfio_listener_release(container);
> + g_free(container);
> + close(fd);
> + error_report("vfio: memory listener initialization failed for
> container");
> + return ret;
> + }
> +
> + container->iommu_data.type1.initialized = true;
> +
> + } else {
> + error_report("vfio: No available IOMMU models");
> + g_free(container);
> + close(fd);
> + return -EINVAL;
> + }
> +
> + QLIST_INIT(&container->group_list);
> + QLIST_INSERT_HEAD(&container_list, container, next);
> +
> + group->container = container;
> + QLIST_INSERT_HEAD(&container->group_list, group, container_next);
> +
> + return 0;
> +}
> +
> +static void vfio_disconnect_container(VFIOGroup *group)
> +{
> + VFIOContainer *container = group->container;
> +
> + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
> + error_report("vfio: error disconnecting group %d from container",
> + group->groupid);
> + }
> +
> + QLIST_REMOVE(group, container_next);
> + group->container = NULL;
> +
> + if (QLIST_EMPTY(&container->group_list)) {
> + if (container->iommu_data.release) {
> + container->iommu_data.release(container);
> + }
> + QLIST_REMOVE(container, next);
> + DPRINTF("vfio_disconnect_container: close container->fd\n");
> + close(container->fd);
> + g_free(container);
> + }
> +}
> +
> +VFIOGroup *vfio_get_group(int groupid, QEMUResetHandler *reset_handler)
> +{
> + VFIOGroup *group;
> + char path[32];
> + struct vfio_group_status status = { .argsz = sizeof(status) };
> +
> + QLIST_FOREACH(group, &group_list, next) {
> + if (group->groupid == groupid) {
> + return group;
> + }
> + }
> +
> + group = g_malloc0(sizeof(*group));
> +
> + snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
> + group->fd = qemu_open(path, O_RDWR);
> + if (group->fd < 0) {
> + error_report("vfio: error opening %s: %m", path);
> + g_free(group);
> + return NULL;
> + }
> +
> + if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
> + error_report("vfio: error getting group status: %m");
> + close(group->fd);
> + g_free(group);
> + return NULL;
> + }
> +
> + if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
> + error_report("vfio: error, group %d is not viable, please ensure "
> + "all devices within the iommu_group are bound to their "
> + "vfio bus driver.", groupid);
> + close(group->fd);
> + g_free(group);
> + return NULL;
> + }
> +
> + group->groupid = groupid;
> + QLIST_INIT(&group->device_list);
> +
> + if (vfio_connect_container(group)) {
> + error_report("vfio: failed to setup container for group %d",
> groupid);
> + close(group->fd);
> + g_free(group);
> + return NULL;
> + }
> +
> + if (QLIST_EMPTY(&group_list) && reset_handler) {
> + qemu_register_reset(reset_handler, NULL);
> + }
> +
> + QLIST_INSERT_HEAD(&group_list, group, next);
> +
> + vfio_kvm_device_add_group(group);
> +
> + return group;
> +}
> +
> +void vfio_put_group(VFIOGroup *group, QEMUResetHandler *reset_handler)
> +{
> + if (!QLIST_EMPTY(&group->device_list)) {
> + return;
> + }
> +
> + vfio_kvm_device_del_group(group);
> + vfio_disconnect_container(group);
> + QLIST_REMOVE(group, next);
> + DPRINTF("vfio_put_group: close group->fd\n");
> + close(group->fd);
> + g_free(group);
> +
> + if (QLIST_EMPTY(&group_list) && reset_handler) {
> + qemu_unregister_reset(reset_handler, NULL);
> + }
The reset_handler stuff needs work. We can theoretically support both
PCI and platform devices simultaneously, but this would only allow one
to register a reset handler and one to remove it (which may not be the
same one).
> +}
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 9cf5b84..9e70d68 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1,5 +1,5 @@
> /*
> - * vfio based device assignment support
> + * vfio based device assignment support - PCI devices
> *
> * Copyright Red Hat, Inc. 2012
> *
> @@ -40,6 +40,8 @@
> #include "sysemu/kvm.h"
> #include "sysemu/sysemu.h"
There were probably some includes that could be cleaned out with the
split.
> +#include "vfio-common.h"
> +
> /* #define DEBUG_VFIO */
> #ifdef DEBUG_VFIO
> #define DPRINTF(fmt, ...) \
DEBUG_VFIO is duplicated here... move to vfio-common.h
> @@ -55,6 +57,8 @@
> #define VFIO_ALLOW_KVM_MSI 1
> #define VFIO_ALLOW_KVM_MSIX 1
>
> +extern QLIST_HEAD(, VFIOGroup) group_list;
Define in vfio-common.h?
> +
> struct VFIODevice;
>
> typedef struct VFIOQuirk {
> @@ -135,25 +139,6 @@ enum {
>
> struct VFIOGroup;
>
> -typedef struct VFIOType1 {
> - MemoryListener listener;
> - int error;
> - bool initialized;
> -} VFIOType1;
> -
> -typedef struct VFIOContainer {
> - int fd; /* /dev/vfio/vfio, empowered by the attached groups */
> - struct {
> - /* enable abstraction to support various iommu backends */
> - union {
> - VFIOType1 type1;
> - };
> - void (*release)(struct VFIOContainer *);
> - } iommu_data;
> - QLIST_HEAD(, VFIOGroup) group_list;
> - QLIST_ENTRY(VFIOContainer) next;
> -} VFIOContainer;
> -
> /* Cache of MSI-X setup plus extra mmap and memory region for split BAR map
> */
> typedef struct VFIOMSIXInfo {
> uint8_t table_bar;
> @@ -200,15 +185,6 @@ typedef struct VFIODevice {
> bool rom_read_failed;
> } VFIODevice;
>
> -typedef struct VFIOGroup {
> - int fd;
> - int groupid;
> - VFIOContainer *container;
> - QLIST_HEAD(, VFIODevice) device_list;
> - QLIST_ENTRY(VFIOGroup) next;
> - QLIST_ENTRY(VFIOGroup) container_next;
> -} VFIOGroup;
> -
> typedef struct VFIORomBlacklistEntry {
> uint16_t vendor_id;
> uint16_t device_id;
> @@ -234,23 +210,6 @@ static const VFIORomBlacklistEntry romblacklist[] = {
>
> #define MSIX_CAP_LENGTH 12
>
> -static QLIST_HEAD(, VFIOContainer)
> - container_list = QLIST_HEAD_INITIALIZER(container_list);
> -
> -static QLIST_HEAD(, VFIOGroup)
> - group_list = QLIST_HEAD_INITIALIZER(group_list);
> -
> -#ifdef CONFIG_KVM
> -/*
> - * We have a single VFIO pseudo device per KVM VM. Once created it lives
> - * for the life of the VM. Closing the file descriptor only drops our
> - * reference to it and the device's reference to kvm. Therefore once
> - * initialized, this file descriptor is only released on QEMU exit and
> - * we'll re-use it should another vfio device be attached before then.
> - */
> -static int vfio_kvm_device_fd = -1;
> -#endif
> -
> static void vfio_disable_interrupts(VFIODevice *vdev);
> static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int
> len);
> static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
> @@ -2180,183 +2139,6 @@ static void vfio_pci_write_config(PCIDevice *pdev,
> uint32_t addr,
> }
>
> /*
> - * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
> - */
> -static int vfio_dma_unmap(VFIOContainer *container,
> - hwaddr iova, ram_addr_t size)
> -{
> - struct vfio_iommu_type1_dma_unmap unmap = {
> - .argsz = sizeof(unmap),
> - .flags = 0,
> - .iova = iova,
> - .size = size,
> - };
> -
> - if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
> - DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
> - return -errno;
> - }
> -
> - return 0;
> -}
> -
> -static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> - ram_addr_t size, void *vaddr, bool readonly)
> -{
> - struct vfio_iommu_type1_dma_map map = {
> - .argsz = sizeof(map),
> - .flags = VFIO_DMA_MAP_FLAG_READ,
> - .vaddr = (__u64)(uintptr_t)vaddr,
> - .iova = iova,
> - .size = size,
> - };
> -
> - if (!readonly) {
> - map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
> - }
> -
> - /*
> - * Try the mapping, if it fails with EBUSY, unmap the region and try
> - * again. This shouldn't be necessary, but we sometimes see it in
> - * the the VGA ROM space.
> - */
> - if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
> - (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
> - ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
> - return 0;
> - }
> -
> - DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
> - return -errno;
> -}
> -
> -static bool vfio_listener_skipped_section(MemoryRegionSection *section)
> -{
> - return !memory_region_is_ram(section->mr) ||
> - /*
> - * Sizing an enabled 64-bit BAR can cause spurious mappings to
> - * addresses in the upper part of the 64-bit address space. These
> - * are never accessed by the CPU and beyond the address width of
> - * some IOMMU hardware. TODO: VFIO should tell us the IOMMU
> width.
> - */
> - section->offset_within_address_space & (1ULL << 63);
> -}
> -
> -static void vfio_listener_region_add(MemoryListener *listener,
> - MemoryRegionSection *section)
> -{
> - VFIOContainer *container = container_of(listener, VFIOContainer,
> - iommu_data.type1.listener);
> - hwaddr iova, end;
> - void *vaddr;
> - int ret;
> -
> - assert(!memory_region_is_iommu(section->mr));
> -
> - if (vfio_listener_skipped_section(section)) {
> - DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
> - section->offset_within_address_space,
> - section->offset_within_address_space +
> - int128_get64(int128_sub(section->size, int128_one())));
> - return;
> - }
> -
> - if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK)
> !=
> - (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> - error_report("%s received unaligned region", __func__);
> - return;
> - }
> -
> - iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> - end = (section->offset_within_address_space +
> int128_get64(section->size)) &
> - TARGET_PAGE_MASK;
> -
> - if (iova >= end) {
> - return;
> - }
> -
> - vaddr = memory_region_get_ram_ptr(section->mr) +
> - section->offset_within_region +
> - (iova - section->offset_within_address_space);
> -
> - DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
> - iova, end - 1, vaddr);
> -
> - memory_region_ref(section->mr);
> - ret = vfio_dma_map(container, iova, end - iova, vaddr,
> section->readonly);
> - if (ret) {
> - error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
> - "0x%"HWADDR_PRIx", %p) = %d (%m)",
> - container, iova, end - iova, vaddr, ret);
> -
> - /*
> - * On the initfn path, store the first error in the container so we
> - * can gracefully fail. Runtime, there's not much we can do other
> - * than throw a hardware error.
> - */
> - if (!container->iommu_data.type1.initialized) {
> - if (!container->iommu_data.type1.error) {
> - container->iommu_data.type1.error = ret;
> - }
> - } else {
> - hw_error("vfio: DMA mapping failed, unable to continue");
> - }
> - }
> -}
> -
> -static void vfio_listener_region_del(MemoryListener *listener,
> - MemoryRegionSection *section)
> -{
> - VFIOContainer *container = container_of(listener, VFIOContainer,
> - iommu_data.type1.listener);
> - hwaddr iova, end;
> - int ret;
> -
> - if (vfio_listener_skipped_section(section)) {
> - DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
> - section->offset_within_address_space,
> - section->offset_within_address_space +
> - int128_get64(int128_sub(section->size, int128_one())));
> - return;
> - }
> -
> - if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK)
> !=
> - (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> - error_report("%s received unaligned region", __func__);
> - return;
> - }
> -
> - iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> - end = (section->offset_within_address_space +
> int128_get64(section->size)) &
> - TARGET_PAGE_MASK;
> -
> - if (iova >= end) {
> - return;
> - }
> -
> - DPRINTF("region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
> - iova, end - 1);
> -
> - ret = vfio_dma_unmap(container, iova, end - iova);
> - memory_region_unref(section->mr);
> - if (ret) {
> - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> - "0x%"HWADDR_PRIx") = %d (%m)",
> - container, iova, end - iova, ret);
> - }
> -}
> -
> -static MemoryListener vfio_memory_listener = {
> - .region_add = vfio_listener_region_add,
> - .region_del = vfio_listener_region_del,
> -};
> -
> -static void vfio_listener_release(VFIOContainer *container)
> -{
> - memory_listener_unregister(&container->iommu_data.type1.listener);
> -}
> -
> -/*
> * Interrupt setup
> */
> static void vfio_disable_interrupts(VFIODevice *vdev)
> @@ -3221,244 +3003,8 @@ static void vfio_pci_reset_handler(void *opaque)
> }
> }
>
> -static void vfio_kvm_device_add_group(VFIOGroup *group)
> -{
> -#ifdef CONFIG_KVM
> - struct kvm_device_attr attr = {
> - .group = KVM_DEV_VFIO_GROUP,
> - .attr = KVM_DEV_VFIO_GROUP_ADD,
> - .addr = (uint64_t)(unsigned long)&group->fd,
> - };
> -
> - if (!kvm_enabled()) {
> - return;
> - }
> -
> - if (vfio_kvm_device_fd < 0) {
> - struct kvm_create_device cd = {
> - .type = KVM_DEV_TYPE_VFIO,
> - };
> -
> - if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
> - DPRINTF("KVM_CREATE_DEVICE: %m\n");
> - return;
> - }
> -
> - vfio_kvm_device_fd = cd.fd;
> - }
> -
> - if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
> - error_report("Failed to add group %d to KVM VFIO device: %m",
> - group->groupid);
> - }
> -#endif
> -}
> -
> -static void vfio_kvm_device_del_group(VFIOGroup *group)
> -{
> -#ifdef CONFIG_KVM
> - struct kvm_device_attr attr = {
> - .group = KVM_DEV_VFIO_GROUP,
> - .attr = KVM_DEV_VFIO_GROUP_DEL,
> - .addr = (uint64_t)(unsigned long)&group->fd,
> - };
> -
> - if (vfio_kvm_device_fd < 0) {
> - return;
> - }
> -
> - if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
> - error_report("Failed to remove group %d from KVM VFIO device: %m",
> - group->groupid);
> - }
> -#endif
> -}
> -
> -static int vfio_connect_container(VFIOGroup *group)
> -{
> - VFIOContainer *container;
> - int ret, fd;
> -
> - if (group->container) {
> - return 0;
> - }
> -
> - QLIST_FOREACH(container, &container_list, next) {
> - if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
> - group->container = container;
> - QLIST_INSERT_HEAD(&container->group_list, group, container_next);
> - return 0;
> - }
> - }
> -
> - fd = qemu_open("/dev/vfio/vfio", O_RDWR);
> - if (fd < 0) {
> - error_report("vfio: failed to open /dev/vfio/vfio: %m");
> - return -errno;
> - }
> -
> - ret = ioctl(fd, VFIO_GET_API_VERSION);
> - if (ret != VFIO_API_VERSION) {
> - error_report("vfio: supported vfio version: %d, "
> - "reported version: %d", VFIO_API_VERSION, ret);
> - close(fd);
> - return -EINVAL;
> - }
> -
> - container = g_malloc0(sizeof(*container));
> - container->fd = fd;
> -
> - if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
> - ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> - if (ret) {
> - error_report("vfio: failed to set group container: %m");
> - g_free(container);
> - close(fd);
> - return -errno;
> - }
> -
> - ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
> - if (ret) {
> - error_report("vfio: failed to set iommu for container: %m");
> - g_free(container);
> - close(fd);
> - return -errno;
> - }
> -
> - container->iommu_data.type1.listener = vfio_memory_listener;
> - container->iommu_data.release = vfio_listener_release;
> -
> - memory_listener_register(&container->iommu_data.type1.listener,
> - &address_space_memory);
> -
> - if (container->iommu_data.type1.error) {
> - ret = container->iommu_data.type1.error;
> - vfio_listener_release(container);
> - g_free(container);
> - close(fd);
> - error_report("vfio: memory listener initialization failed for
> container");
> - return ret;
> - }
> -
> - container->iommu_data.type1.initialized = true;
> -
> - } else {
> - error_report("vfio: No available IOMMU models");
> - g_free(container);
> - close(fd);
> - return -EINVAL;
> - }
> -
> - QLIST_INIT(&container->group_list);
> - QLIST_INSERT_HEAD(&container_list, container, next);
> -
> - group->container = container;
> - QLIST_INSERT_HEAD(&container->group_list, group, container_next);
> -
> - return 0;
> -}
> -
> -static void vfio_disconnect_container(VFIOGroup *group)
> -{
> - VFIOContainer *container = group->container;
> -
> - if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
> - error_report("vfio: error disconnecting group %d from container",
> - group->groupid);
> - }
> -
> - QLIST_REMOVE(group, container_next);
> - group->container = NULL;
> -
> - if (QLIST_EMPTY(&container->group_list)) {
> - if (container->iommu_data.release) {
> - container->iommu_data.release(container);
> - }
> - QLIST_REMOVE(container, next);
> - DPRINTF("vfio_disconnect_container: close container->fd\n");
> - close(container->fd);
> - g_free(container);
> - }
> -}
> -
> -static VFIOGroup *vfio_get_group(int groupid)
> -{
> - VFIOGroup *group;
> - char path[32];
> - struct vfio_group_status status = { .argsz = sizeof(status) };
> -
> - QLIST_FOREACH(group, &group_list, next) {
> - if (group->groupid == groupid) {
> - return group;
> - }
> - }
> -
> - group = g_malloc0(sizeof(*group));
> -
> - snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
> - group->fd = qemu_open(path, O_RDWR);
> - if (group->fd < 0) {
> - error_report("vfio: error opening %s: %m", path);
> - g_free(group);
> - return NULL;
> - }
> -
> - if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
> - error_report("vfio: error getting group status: %m");
> - close(group->fd);
> - g_free(group);
> - return NULL;
> - }
> -
> - if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
> - error_report("vfio: error, group %d is not viable, please ensure "
> - "all devices within the iommu_group are bound to their "
> - "vfio bus driver.", groupid);
> - close(group->fd);
> - g_free(group);
> - return NULL;
> - }
> -
> - group->groupid = groupid;
> - QLIST_INIT(&group->device_list);
> -
> - if (vfio_connect_container(group)) {
> - error_report("vfio: failed to setup container for group %d",
> groupid);
> - close(group->fd);
> - g_free(group);
> - return NULL;
> - }
> -
> - if (QLIST_EMPTY(&group_list)) {
> - qemu_register_reset(vfio_pci_reset_handler, NULL);
> - }
> -
> - QLIST_INSERT_HEAD(&group_list, group, next);
> -
> - vfio_kvm_device_add_group(group);
> -
> - return group;
> -}
> -
> -static void vfio_put_group(VFIOGroup *group)
> -{
> - if (!QLIST_EMPTY(&group->device_list)) {
> - return;
> - }
> -
> - vfio_kvm_device_del_group(group);
> - vfio_disconnect_container(group);
> - QLIST_REMOVE(group, next);
> - DPRINTF("vfio_put_group: close group->fd\n");
> - close(group->fd);
> - g_free(group);
> -
> - if (QLIST_EMPTY(&group_list)) {
> - qemu_unregister_reset(vfio_pci_reset_handler, NULL);
> - }
> -}
> -
> -static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice
> *vdev)
> +static int vfio_get_device(VFIOGroup *group, const char *name,
> + struct VFIODevice *vdev)
Why?
> {
> struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
> struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
> @@ -3485,7 +3031,7 @@ static int vfio_get_device(VFIOGroup *group, const char
> *name, VFIODevice *vdev)
> goto error;
> }
>
> - DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
> + DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name,
???
> dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
>
> if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
> @@ -3768,7 +3314,7 @@ static int vfio_initfn(PCIDevice *pdev)
> DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
> vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
>
> - group = vfio_get_group(groupid);
> + group = vfio_get_group(groupid, vfio_pci_reset_handler);
> if (!group) {
> error_report("vfio: failed to get group %d", groupid);
> return -ENOENT;
> @@ -3785,7 +3331,7 @@ static int vfio_initfn(PCIDevice *pdev)
> pvdev->host.function == vdev->host.function) {
>
> error_report("vfio: error: device %s is already attached", path);
> - vfio_put_group(group);
> + vfio_put_group(group, vfio_pci_reset_handler);
> return -EBUSY;
> }
> }
> @@ -3793,7 +3339,7 @@ static int vfio_initfn(PCIDevice *pdev)
> ret = vfio_get_device(group, path, vdev);
> if (ret) {
> error_report("vfio: failed to get device %s", path);
> - vfio_put_group(group);
> + vfio_put_group(group, vfio_pci_reset_handler);
> return ret;
> }
>
> @@ -3879,7 +3425,7 @@ out_teardown:
> out_put:
> g_free(vdev->emulated_config_bits);
> vfio_put_device(vdev);
> - vfio_put_group(group);
> + vfio_put_group(group, vfio_pci_reset_handler);
> return ret;
> }
>
> @@ -3899,7 +3445,7 @@ static void vfio_exitfn(PCIDevice *pdev)
> g_free(vdev->emulated_config_bits);
> g_free(vdev->rom);
> vfio_put_device(vdev);
> - vfio_put_group(group);
> + vfio_put_group(group, vfio_pci_reset_handler);
> }
>
> static void vfio_pci_reset(DeviceState *dev)
> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
> new file mode 100644
> index 0000000..138fb13
> --- /dev/null
> +++ b/hw/vfio/platform.c
> @@ -0,0 +1,381 @@
> +/*
> + * vfio based device assignment support - platform devices
> + *
> + * Copyright Linaro Limited, 2014
> + *
> + * Authors:
> + * Kim Phillips <address@hidden>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2. See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on vfio based PCI device assignment support:
> + * Copyright Red Hat, Inc. 2012
> + */
> +
> +#include <dirent.h>
> +#include <linux/vfio.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +#include <unistd.h>
> +
> +#include "config.h"
> +#include "exec/address-spaces.h"
> +#include "exec/memory.h"
> +#include "qemu-common.h"
> +#include "qemu/error-report.h"
> +#include "qemu/event_notifier.h"
> +#include "qemu/queue.h"
> +#include "qemu/range.h"
> +#include "sysemu/kvm.h"
> +#include "sysemu/sysemu.h"
> +#include "hw/qdev-properties.h"
> +#include "migration/vmstate.h"
> +#include "hw/hw.h"
> +#include "hw/sysbus.h"
> +
> +#include "vfio-common.h"
> +
> +#define DEBUG_VFIO
> +#ifdef DEBUG_VFIO
> +#define DPRINTF(fmt, ...) \
> + do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
> +#else
> +#define DPRINTF(fmt, ...) \
> + do { } while (0)
> +#endif
> +
> +/* Extra debugging, trap acceleration paths for more logging */
> +#define VFIO_ALLOW_MMAP 1
Maybe there should be a common debug section in vfio-common.h?
> +
> +#define TYPE_VFIO_PLATFORM "vfio-platform"
> +
> +typedef struct VFIORegion {
> + off_t fd_offset; /* offset of region within device fd */
> + int fd; /* device fd, allows us to pass VFIORegion as opaque data */
> + MemoryRegion mem; /* slow, read/write access */
> + MemoryRegion mmap_mem; /* direct mapped access */
> + void *mmap;
> + size_t size;
> + uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
> + uint8_t nr; /* cache the region number for debug */
> +} VFIORegion;
> +
> +typedef struct VFIODevice {
> + SysBusDevice sbdev;
> + int fd;
> + int num_regions;
> + VFIORegion *regions;
> + QLIST_ENTRY(VFIODevice) next;
> + struct VFIOGroup *group;
> + char *name;
> +} VFIODevice;
I suspect we really need:
(in vfio-common.h)
typedef struct VFIODevice {
QLIST_ENTRY(VFIODevice) next;
struct VFIOGroup *group;
enum VFIODeviceType type;
/* maybe some of the other common stuff */
} VFIODevice;
(per pci/platform)
typedef struct VFIOPlatformDevice {
VFIODevice vdev;
/* device specific stuff */
} VFIOPlatformDevice;
Otherwise I don't see how the device_list on VFIOGroup works.
> +
> +static int vfio_mmap_region(VFIODevice *vdev, VFIORegion *region,
> + MemoryRegion *mem, MemoryRegion *submem,
> + void **map, size_t size, off_t offset,
> + const char *name)
> +{
> + int ret = 0;
> +
> + if (VFIO_ALLOW_MMAP && size && region->flags &
> VFIO_REGION_INFO_FLAG_MMAP) {
> + int prot = 0;
> + ret = 0;
> +
> + if (region->flags & VFIO_REGION_INFO_FLAG_READ) {
> + prot |= PROT_READ;
> + }
> +
> + if (region->flags & VFIO_REGION_INFO_FLAG_WRITE) {
> + prot |= PROT_WRITE;
> + }
> +
> + *map = mmap(NULL, size, prot, MAP_SHARED,
> + region->fd, region->fd_offset + offset);
> + if (*map == MAP_FAILED) {
> + ret = -errno;
> + *map = NULL;
> + goto error;
> + }
> +
> + memory_region_init_ram_ptr(submem, OBJECT(vdev), name, size, *map);
> + }
> +
> + memory_region_add_subregion(mem, offset, submem);
> +
> +error:
> + return ret;
> +}
> +
> +/*
> + * IO Port/MMIO - Beware of the endians, VFIO is always little endian
> + */
> +static void vfio_region_write(void *opaque, hwaddr addr,
> + uint64_t data, unsigned size)
> +{
> + VFIORegion *region = opaque;
> + union {
> + uint8_t byte;
> + uint16_t word;
> + uint32_t dword;
> + uint64_t qword;
> + } buf;
> +
> + switch (size) {
> + case 1:
> + buf.byte = data;
> + break;
> + case 2:
> + buf.word = data;
> + break;
> + case 4:
> + buf.dword = data;
> + break;
> + default:
> + hw_error("vfio: unsupported write size, %d bytes\n", size);
> + break;
> + }
> +
> + if (pwrite(region->fd, &buf, size, region->fd_offset + addr) != size) {
> + error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
> + __func__, addr, data, size);
> + }
> +
> + DPRINTF("%s(region %d+0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n",
> + __func__, region->nr, addr, data, size);
> +}
> +
> +static uint64_t vfio_region_read(void *opaque, hwaddr addr, unsigned size)
> +{
> + VFIORegion *region = opaque;
> + union {
> + uint8_t byte;
> + uint16_t word;
> + uint32_t dword;
> + uint64_t qword;
> + } buf;
> + uint64_t data = 0;
> +
> + if (pread(region->fd, &buf, size, region->fd_offset + addr) != size) {
> + error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
> + __func__, addr, size);
> + return (uint64_t)-1;
> + }
> +
> + switch (size) {
> + case 1:
> + data = buf.byte;
> + break;
> + case 2:
> + data = buf.word;
> + break;
> + case 4:
> + data = buf.dword;
> + break;
> + default:
> + hw_error("vfio: unsupported read size, %d bytes\n", size);
> + break;
> + }
> +
> + DPRINTF("%s(region %d+0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n",
> + __func__, region->nr, addr, size, data);
> +
> + return data;
> +}
> +
/*
 * Memory ops for the "slow" access path: every guest access traps and
 * is forwarded through pread()/pwrite() on the region fd.
 * NOTE(review): the comment above vfio_region_write says VFIO is always
 * little endian, yet endianness here is DEVICE_NATIVE_ENDIAN — confirm
 * whether DEVICE_LITTLE_ENDIAN is intended.
 */
static const MemoryRegionOps vfio_region_ops = {
    .read = vfio_region_read,
    .write = vfio_region_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
};
> +
> +static void vfio_map_region(VFIODevice *vdev, int nr)
> +{
> + VFIORegion *region = &vdev->regions[nr];
> + unsigned size = region->size;
> + char name[64];
> +
> + snprintf(name, sizeof(name), "VFIO %s region %d", vdev->name, nr);
> +
> + /* A "slow" read/write mapping underlies all regions */
> + memory_region_init_io(®ion->mem, OBJECT(vdev), &vfio_region_ops,
> + region, name, size);
> +
> + strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
> + if (vfio_mmap_region(vdev, region, ®ion->mem,
> + ®ion->mmap_mem, ®ion->mmap, size, 0, name)) {
> + error_report("%s unsupported. Performance may be slow", name);
> + }
> +}
> +
> +static int vfio_get_device(VFIOGroup *group, const char *name,
> + struct VFIODevice *vdev)
> +{
> + struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
> + struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
> + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
> + int ret, i;
> +
> + ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
> + if (ret < 0) {
> + error_report("vfio: error getting device %s from group %d: %m",
> + name, group->groupid);
> + error_printf("Verify all devices in group %d are bound to the vfio "
> + "platform driver and are not already in use\n",
> + group->groupid);
> + return ret;
> + }
> +
> + vdev->fd = ret;
> + vdev->group = group;
> + QLIST_INSERT_HEAD(&group->device_list, vdev, next);
> +
> + /* Sanity check device */
> + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
> + if (ret) {
> + error_report("vfio: error getting device info: %m");
> + goto error;
> + }
> +
> + DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name,
> + dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
> +
> + vdev->regions = g_malloc0(sizeof(VFIORegion) * dev_info.num_regions);
> + if (!vdev->regions) {
> + error_report("vfio: Error allocating space for %d regions",
> + dev_info.num_regions);
> + ret = -ENOMEM;
> + goto error;
> + }
> +
> + vdev->num_regions = dev_info.num_regions;
> +
> + for (i = 0; i < dev_info.num_regions; i++) {
> + reg_info.index = i;
> +
> + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info);
> + if (ret) {
> + error_report("vfio: Error getting region %d info: %m", i);
> + goto error;
> + }
> +
> + DPRINTF("Device %s region %d:\n", name, i);
> + DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
> + (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
> + (unsigned long)reg_info.flags);
> +
> + vdev->regions[i].flags = reg_info.flags;
> + vdev->regions[i].size = reg_info.size;
> + vdev->regions[i].fd_offset = reg_info.offset;
> + vdev->regions[i].fd = vdev->fd;
> + vdev->regions[i].nr = i;
> + }
Perhaps more of this could be in common.c if we had a generic VFIODevice
embedded in a VFIOPlatformDevice or VFIOPCIDevice. We could fill in
high-level things like number of regions/irqs, fd, name, etc, leaving
interpretation and probing of each region/irq to device specific code.
> +
> +error:
> + if (ret) {
> + g_free(vdev->regions);
> + QLIST_REMOVE(vdev, next);
> + vdev->group = NULL;
> + close(vdev->fd);
> + }
> + return ret;
> +}
> +
> +static void vfio_platform_realize(DeviceState *dev, Error **errp)
> +{
> + SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
> + VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, sbdev, sbdev);
> + VFIOGroup *group;
> + char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
> + ssize_t len;
> + struct stat st;
> + int groupid, i, ret;
> +
> + /* TODO: pass device name on command line */
> + vdev->name = malloc(PATH_MAX);
> + strcpy(vdev->name, "fff51000.ethernet");
> +
> + /* Check that the host device exists */
> + snprintf(path, sizeof(path), "/sys/bus/platform/devices/%s/",
> vdev->name);
> + if (stat(path, &st) < 0) {
> + error_report("vfio: error: no such host device: %s", path);
> + return;
> + }
> +
> + strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
> +
> + len = readlink(path, iommu_group_path, PATH_MAX);
> + if (len <= 0) {
> + error_report("vfio: error no iommu_group for device");
> + return;
> + }
> +
> + iommu_group_path[len] = 0;
> + group_name = basename(iommu_group_path);
> +
> + if (sscanf(group_name, "%d", &groupid) != 1) {
> + error_report("vfio: error reading %s: %m", path);
> + return;
> + }
> +
> + DPRINTF("%s(%s) group %d\n", __func__, vdev->name, groupid);
> +
> + group = vfio_get_group(groupid, NULL);
> + if (!group) {
> + error_report("vfio: failed to get group %d", groupid);
> + return;
> + }
> +
> + snprintf(path, sizeof(path), "%s", vdev->name);
> +
> + QLIST_FOREACH(pvdev, &group->device_list, next) {
> + if (strcmp(pvdev->name, vdev->name) == 0) {
> + error_report("vfio: error: device %s is already attached", path);
> + vfio_put_group(group, NULL);
> + return;
> + }
> + }
> +
> + ret = vfio_get_device(group, path, vdev);
> + if (ret) {
> + error_report("vfio: failed to get device %s", path);
> + vfio_put_group(group, NULL);
> + return;
> + }
> +
> + for (i = 0; i < vdev->num_regions; i++) {
> + vfio_map_region(vdev, i);
> + sysbus_init_mmio(sbdev, &vdev->regions[i].mem);
> + }
> +}
> +
/* Migration of assigned platform devices is not supported */
static const VMStateDescription vfio_platform_vmstate = {
    .name = TYPE_VFIO_PLATFORM,
    .unmigratable = 1,
};
> +
> +static void vfio_platform_dev_class_init(ObjectClass *klass, void *data)
> +{
> + DeviceClass *dc = DEVICE_CLASS(klass);
> +
> + dc->realize = vfio_platform_realize;
> + dc->vmsd = &vfio_platform_vmstate;
> + dc->desc = "VFIO-based platform device assignment";
> + set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> +}
> +
/* QOM type description: a SysBusDevice carrying a VFIODevice instance */
static const TypeInfo vfio_platform_dev_info = {
    .name = TYPE_VFIO_PLATFORM,
    .parent = TYPE_SYS_BUS_DEVICE,
    .instance_size = sizeof(VFIODevice),
    .class_init = vfio_platform_dev_class_init,
};

/* Register the vfio-platform type with QOM at startup */
static void register_vfio_platform_dev_type(void)
{
    type_register_static(&vfio_platform_dev_info);
}

type_init(register_vfio_platform_dev_type)
> diff --git a/hw/vfio/vfio-common.h b/hw/vfio/vfio-common.h
> new file mode 100644
> index 0000000..21148ef
> --- /dev/null
> +++ b/hw/vfio/vfio-common.h
> @@ -0,0 +1,55 @@
> +/*
> + * common header for vfio based device assignment support
> + *
> + * Copyright Red Hat, Inc. 2012
> + *
> + * Authors:
> + * Alex Williamson <address@hidden>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2. See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on qemu-kvm device-assignment:
> + * Adapted for KVM by Qumranet.
> + * Copyright (c) 2007, Neocleus, Alex Novik (address@hidden)
> + * Copyright (c) 2007, Neocleus, Guy Zana (address@hidden)
> + * Copyright (C) 2008, Qumranet, Amit Shah (address@hidden)
> + * Copyright (C) 2008, Red Hat, Amit Shah (address@hidden)
> + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (address@hidden)
> + */
> +
/* Forward declarations: concrete layouts live in the backend files */
struct VFIODevice;

struct VFIOGroup;

/* Per-container state for the Type1 IOMMU backend */
typedef struct VFIOType1 {
    MemoryListener listener; /* tracks guest memory for DMA mapping */
    int error; /* first mapping error seen, if any */
    bool initialized;
} VFIOType1;

/*
 * VFIOContainer: one open /dev/vfio/vfio handle.  Groups attach to a
 * container to share a single IOMMU context.
 */
typedef struct VFIOContainer {
    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
    struct {
        /* enable abstraction to support various iommu backends */
        union {
            VFIOType1 type1;
        };
        void (*release)(struct VFIOContainer *);
    } iommu_data;
    QLIST_HEAD(, VFIOGroup) group_list; /* groups attached to this container */
    QLIST_ENTRY(VFIOContainer) next; /* entry on the global container list */
} VFIOContainer;

/* VFIOGroup: one open /dev/vfio/<groupid> handle */
typedef struct VFIOGroup {
    int fd;
    int groupid;
    VFIOContainer *container;
    QLIST_HEAD(, VFIODevice) device_list; /* devices opened in this group */
    QLIST_ENTRY(VFIOGroup) next; /* entry on the global group list */
    QLIST_ENTRY(VFIOGroup) container_next; /* entry on container's group_list */
} VFIOGroup;


/*
 * Shared group open/close helpers.  reset_handler may be NULL; platform
 * devices currently pass NULL and skip qemu_register_reset().
 */
VFIOGroup *vfio_get_group(int groupid, QEMUResetHandler *reset_handler);
void vfio_put_group(VFIOGroup *group, QEMUResetHandler *reset_handler);