qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [RFC PATCH 5/5] VFIO based device assignment


From: Alex Williamson
Subject: [Qemu-devel] [RFC PATCH 5/5] VFIO based device assignment
Date: Sun, 11 Jul 2010 12:09:42 -0600
User-agent: StGIT/0.14.3

This patch adds qemu device assignment support using the proposed
VFIO/UIOMMU kernel interfaces.  The existing KVM-only device assignment
code makes use of various pci sysfs files for config space, MMIO BAR
mapping, and misc other config items.  It then jumps over to KVM-specific
ioctls for enabling interrupts and assigning devices to IOMMU domains.
Finally, IO-port support uses in/out directly.  This is a messy model
to support and causes numerous issues when we try to allow unprivileged
users to access PCI devices.

VFIO/UIOMMU reduces this to two interfaces, /dev/vfioX and /dev/uiommu.
The VFIO device file provides all the necessary support for accessing
PCI config space, read/write/mmap BARs (including IO-port space),
configuring INTx/MSI/MSI-X interrupts and setting up DMA mapping.  The
UIOMMU interface allows iommu domains to be created, and via vfio,
devices can be bound to a domain.  This provides an easier model to
support (IMHO) and removes the bindings that make current device
assignment only useable for KVM enabled guests.

Usage is similar to KVM device assignment.  Rather than binding the
device to the pci-stub driver, vfio devices need to be bound to the
vfio driver.  From there, it's a simple matter of specifying the
device as:

-device vfio,host=01:00.0

This example requires either root privileges or proper permissions on
/dev/uiommu and /dev/vfioX.  To support unprivileged operation, the
options vfiofd= and uiommufd= are available.  Depending on the usage
of uiommufd, each guest device can be assigned to the same iommu
domain, or to independent iommu domains.  In the example above, each
device is assigned to a separate iommu domain.

As VFIO has no KVM dependencies, this patch works with or without
-enable-kvm.  I have successfully used a couple assigned devices in a
guest without KVM support, however Michael Tsirkin warns that tcg
may not provide atomic operations to memory visible to the passthrough
device, which could result in failures for devices depending on such
for synchronization.

This patch is functional, but hasn't seen a lot of testing.  I've
tested 82576 PFs and VFs, an Intel HDA audio device, and UHCI and EHCI
USB devices (this actually includes INTx/MSI/MSI-X, 4k aligned MMIO
BARs, non-4k aligned MMIO BARs, and IO-Port BARs).

Signed-off-by: Alex Williamson <address@hidden>
---

 Makefile.target |    1 
 hw/linux-vfio.h |  200 ++++++++
 hw/vfio.c       | 1295 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/vfio.h       |   90 ++++
 4 files changed, 1586 insertions(+), 0 deletions(-)
 create mode 100644 hw/linux-vfio.h
 create mode 100644 hw/vfio.c
 create mode 100644 hw/vfio.h

diff --git a/Makefile.target b/Makefile.target
index 0c1b916..4936d96 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -197,6 +197,7 @@ obj-i386-y += vmmouse.o vmport.o hpet.o
 obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
 obj-i386-y += debugcon.o multiboot.o
 obj-i386-y += pc_piix.o
+obj-i386-y += vfio.o
 
 # shared objects
 obj-ppc-y = ppc.o
diff --git a/hw/linux-vfio.h b/hw/linux-vfio.h
new file mode 100644
index 0000000..06bd3f3
--- /dev/null
+++ b/hw/linux-vfio.h
@@ -0,0 +1,200 @@
+/*
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, address@hidden
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Portions derived from drivers/uio/uio.c:
+ * Copyright(C) 2005, Benedikt Spranger <address@hidden>
+ * Copyright(C) 2005, Thomas Gleixner <address@hidden>
+ * Copyright(C) 2006, Hans J. Koch <address@hidden>
+ * Copyright(C) 2006, Greg Kroah-Hartman <address@hidden>
+ *
+ * Portions derived from drivers/uio/uio_pci_generic.c:
+ * Copyright (C) 2009 Red Hat, Inc.
+ * Author: Michael S. Tsirkin <address@hidden>
+ */
+
+/*
+ * VFIO driver - allow mapping and use of certain PCI devices
+ * in unprivileged user processes. (If IOMMU is present)
+ * Especially useful for Virtual Function parts of SR-IOV devices
+ */
+
+#ifdef __KERNEL__
+
+/*
+ * Kernel-side state for a VFIO device; this definition is only visible
+ * when __KERNEL__ is defined, so userspace (qemu) never sees it.
+ * NOTE(review): field semantics beyond the inline comments are not
+ * visible from this header -- consult the kernel driver.
+ */
+struct vfio_dev {
+       struct device   *dev;
+       struct pci_dev  *pdev;
+       u8              *pci_config_map;
+       int             pci_config_size;
+       char            name[8];
+       int             devnum;
+       int             pmaster;
+       void __iomem    *bar[PCI_ROM_RESOURCE+1];
+       spinlock_t      irqlock;        /* guards command register accesses */
+       int             listeners;
+       u32             locked_pages;
+       struct mutex    lgate;          /* listener gate */
+       struct mutex    dgate;          /* dma op gate */
+       struct mutex    igate;          /* intr op gate */
+       struct msix_entry       *msix;
+       int                     nvec;
+       struct uiommu_domain    *udomain;
+       int                     cachec;
+       /* eventfd contexts, one per interrupt type (INTx, MSI, MSI-X array) */
+       struct eventfd_ctx      *ev_irq;
+       struct eventfd_ctx      *ev_msi;
+       struct eventfd_ctx      **ev_msix;
+       struct {
+               u8      intr;
+               u8      bardirty;
+               u8      rombar[4];
+               u8      bar[6*4];
+               u8      msi[24];
+       } vinfo;
+};
+
+/*
+ * Kernel-only bookkeeping that ties a vfio_dev to an mm and an MMU
+ * notifier and anchors a list (dm_list).
+ * NOTE(review): presumably one per open file handle, with dm_list
+ * holding dma_map_page entries -- confirm against the kernel driver.
+ */
+struct vfio_listener {
+       struct vfio_dev *vdev;
+       struct list_head        dm_list;
+       struct mm_struct        *mm;
+       struct mmu_notifier     mmu_notifier;
+};
+
+/*
+ * Structure for keeping track of memory nailed down by the
+ * user for DMA.  Records the pinned page array together with the
+ * DMA address (daddr) and user virtual address (vaddr) it belongs to.
+ */
+struct dma_map_page {
+       struct list_head list;
+       struct page     **pages;
+       dma_addr_t      daddr;
+       unsigned long   vaddr;
+       int             npage;          /* presumably entries in pages[] */
+       int             rdwr;
+};
+
+/*
+ * VFIO class infrastructure: a reference-counted (kref) wrapper around
+ * the driver's struct class, exported as the global vfio_class pointer.
+ */
+struct vfio_class {
+       struct kref kref;
+       struct class *class;
+};
+extern struct vfio_class *vfio_class;
+
+ssize_t vfio_io_readwrite(int, struct vfio_dev *,
+                       char __user *, size_t, loff_t *);
+ssize_t vfio_mem_readwrite(int, struct vfio_dev *,
+                       char __user *, size_t, loff_t *);
+ssize_t vfio_config_readwrite(int, struct vfio_dev *,
+                       char __user *, size_t, loff_t *);
+
+void vfio_disable_msi(struct vfio_dev *);
+void vfio_disable_msix(struct vfio_dev *);
+int vfio_enable_msi(struct vfio_dev *, int);
+int vfio_enable_msix(struct vfio_dev *, int, void __user *);
+
+#ifndef PCI_MSIX_ENTRY_SIZE
+#define        PCI_MSIX_ENTRY_SIZE     16
+#endif
+#ifndef PCI_STATUS_INTERRUPT
+#define        PCI_STATUS_INTERRUPT    0x08
+#endif
+
+struct vfio_dma_map;
+void vfio_dma_unmapall(struct vfio_listener *);
+int vfio_dma_unmap_dm(struct vfio_listener *, struct vfio_dma_map *);
+int vfio_dma_map_common(struct vfio_listener *, unsigned int,
+                       struct vfio_dma_map *);
+int vfio_domain_set(struct vfio_dev *, int);
+void vfio_domain_unset(struct vfio_dev *);
+
+int vfio_class_init(void);
+void vfio_class_destroy(void);
+int vfio_dev_add_attributes(struct vfio_dev *);
+extern struct idr vfio_idr;
+extern struct mutex vfio_minor_lock;
+int vfio_build_config_map(struct vfio_dev *);
+
+irqreturn_t vfio_interrupt(int, void *);
+
+#endif /* __KERNEL__ */
+
+/* Kernel & User level defines for ioctls */
+
+/*
+ * Structure for DMA mapping of user buffers
+ * vaddr, dmaaddr, and size must all be page aligned
+ * buffer may only be larger than 1 page if (a) there is
+ * an iommu in the system, or (b) buffer is part of a huge page
+ */
+/* Argument block for the VFIO_DMA_MAP_IOVA and VFIO_DMA_UNMAP ioctls. */
+struct vfio_dma_map {
+       __u64   vaddr;          /* process virtual addr */
+       __u64   dmaaddr;        /* desired and/or returned dma address */
+       __u64   size;           /* size in bytes */
+       __u64   flags;          /* bool: 0 for r/o; 1 for r/w */
+#define        VFIO_FLAG_WRITE         0x1     /* req writeable DMA mem */
+};
+
+/* map user pages at specific dma address */
+/* requires previous VFIO_DOMAIN_SET */
+#define        VFIO_DMA_MAP_IOVA       _IOWR(';', 101, struct vfio_dma_map)
+
+/* unmap user pages */
+#define        VFIO_DMA_UNMAP          _IOW(';', 102, struct vfio_dma_map)
+
+/* set device DMA mask & master status */
+#define        VFIO_DMA_MASK           _IOW(';', 103, __u64)
+
+/* request IRQ interrupts; use given eventfd */
+#define        VFIO_EVENTFD_IRQ        _IOW(';', 104, int)
+
+/* request MSI interrupts; use given eventfd */
+#define        VFIO_EVENTFD_MSI        _IOW(';', 105, int)
+
+/* Request MSI-X interrupts: arg[0] is #, arg[1-n] are eventfds */
+#define        VFIO_EVENTFDS_MSIX      _IOW(';', 106, int)
+
+/* Get length of a BAR */
+#define        VFIO_BAR_LEN            _IOWR(';', 107, __u32)
+
+/* Set the IOMMU domain - arg is fd from uiommu driver */
+#define        VFIO_DOMAIN_SET         _IOW(';', 108, int)
+
+/* Unset the IOMMU domain */
+#define        VFIO_DOMAIN_UNSET       _IO(';', 109)
+
+/*
+ * Reads, writes, and mmaps determine which PCI BAR (or config space)
+ * from the high level bits of the file offset
+ */
+#define        VFIO_PCI_BAR0_RESOURCE          0x0
+#define        VFIO_PCI_BAR1_RESOURCE          0x1
+#define        VFIO_PCI_BAR2_RESOURCE          0x2
+#define        VFIO_PCI_BAR3_RESOURCE          0x3
+#define        VFIO_PCI_BAR4_RESOURCE          0x4
+#define        VFIO_PCI_BAR5_RESOURCE          0x5
+#define        VFIO_PCI_ROM_RESOURCE           0x6
+#define        VFIO_PCI_CONFIG_RESOURCE        0xF
+#define        VFIO_PCI_SPACE_SHIFT    32
+#define VFIO_PCI_CONFIG_OFF vfio_pci_space_to_offset(VFIO_PCI_CONFIG_RESOURCE)
+
+/*
+ * Recover the 4-bit PCI space selector (one of the VFIO_PCI_*_RESOURCE
+ * values) stored above bit VFIO_PCI_SPACE_SHIFT of a vfio file offset.
+ */
+static inline int vfio_offset_to_pci_space(__u64 off)
+{
+       __u64 space = off >> VFIO_PCI_SPACE_SHIFT;
+
+       return space & 0xF;
+}
+
+/*
+ * Build the base file offset of PCI space @sp by placing the selector
+ * above bit VFIO_PCI_SPACE_SHIFT; add the in-region offset to the result.
+ */
+static inline __u64 vfio_pci_space_to_offset(int sp)
+{
+       __u64 space = (__u64)sp;
+
+       return space << VFIO_PCI_SPACE_SHIFT;
+}
diff --git a/hw/vfio.c b/hw/vfio.c
new file mode 100644
index 0000000..d9ff3d8
--- /dev/null
+++ b/hw/vfio.c
@@ -0,0 +1,1295 @@
+/*
+ * vfio based device assignment support
+ *
+ * Copyright Red Hat, Inc. 2010
+ *
+ * Authors:
+ *  Alex Williamson <address@hidden>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on qemu-kvm device-assignment:
+ *  Adapted for KVM by Qumranet.
+ *  Copyright (c) 2007, Neocleus, Alex Novik (address@hidden)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (address@hidden)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (address@hidden)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (address@hidden)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (address@hidden)
+ */
+
+#include <dirent.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "event_notifier.h"
+#include "hw.h"
+#include "memory.h"
+#include "monitor.h"
+#include "pc.h"
+#include "qemu-error.h"
+#include "vfio.h"
+#include <pci/header.h>
+#include <pci/types.h>
+#include <linux/types.h>
+#include "linux-vfio.h"
+
+/*
+ * Debug tracing.  Uncomment the DEBUG_VFIO define to make DPRINTF() print
+ * "vfio: "-prefixed messages on stdout; otherwise DPRINTF() expands to an
+ * empty statement and its arguments are never seen by the compiler, so
+ * format/argument mistakes only show up in debug builds.
+ */
+//#define DEBUG_VFIO
+#ifdef DEBUG_VFIO
+#define DPRINTF(fmt, ...) \
+    do { printf("vfio: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
+static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
+                                  uint32_t val, int len);
+/*
+ * Generic
+ */
+/*
+ * Walk the capability list in the emulated config space of @pdev and
+ * return the config offset of the first capability whose ID equals @cap.
+ * Returns 0 when the device advertises no capability list, when the
+ * chain ends (pointer below 0x40 or ID 0xff), or after 48 entries.
+ */
+static uint8_t pci_find_cap_offset(PCIDevice *pdev, uint8_t cap)
+{
+    int next_ptr = PCI_CAPABILITY_LIST;
+    int remaining;
+
+    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST)) {
+        return 0;
+    }
+
+    /* Bounded walk so a looping list cannot hang us */
+    for (remaining = 48; remaining > 0; remaining--) {
+        int cap_pos = pdev->config[next_ptr];
+        int cap_id;
+
+        if (cap_pos < 0x40) {
+            return 0;
+        }
+
+        cap_pos &= ~3;
+        cap_id = pdev->config[cap_pos + PCI_CAP_LIST_ID];
+
+        if (cap_id == 0xff) {
+            return 0;
+        }
+        if (cap_id == cap) {
+            return cap_pos;
+        }
+
+        next_ptr = cap_pos + PCI_CAP_LIST_NEXT;
+    }
+    return 0;
+}
+
+/* Convert a short hex field to an int in [0, max]; returns -1 if invalid. */
+static int hostaddr_hex_field(const char *field, long max)
+{
+    char *end;
+    long val = strtol(field, &end, 16);
+
+    if (end == field || *end != '\0' || val < 0 || val > max) {
+        return -1;
+    }
+    return val;
+}
+
+/*
+ * Parse a host PCI address of the form [ssss:]bb:dd.f (segment, bus and
+ * device in hex, function a single digit) into the PCIHostDevice that
+ * backs @prop.  When the segment is omitted it defaults to 0.
+ *
+ * Returns 0 on success, or -1 if @str is malformed, contains trailing
+ * characters, or a field is out of range (bus > 0xff, dev > 0x1f,
+ * func > 7).  The property is only written on success.
+ */
+static int parse_hostaddr(DeviceState *qdev, Property *prop, const char *str)
+{
+    PCIHostDevice *ptr = qdev_get_prop_ptr(qdev, prop);
+    const char *p = str;
+    int n, seg, bus, dev, func;
+    char field[5];
+
+    if (sscanf(p, "%4[^:]%n", field, &n) != 1 || p[n] != ':') {
+        return -1;
+    }
+
+    seg = hostaddr_hex_field(field, 0xffff);
+    p += n + 1;
+
+    if (sscanf(p, "%4[^:]%n", field, &n) != 1) {
+        return -1;
+    }
+
+    if (p[n] == ':') {
+        bus = hostaddr_hex_field(field, 0xff);
+        p += n + 1;
+    } else {
+        /* Only one ':' present: the first field was really the bus */
+        bus = seg;
+        seg = 0;
+    }
+
+    if (sscanf(p, "%4[^.]%n", field, &n) != 1 || p[n] != '.') {
+        return -1;
+    }
+
+    dev = hostaddr_hex_field(field, 0x1f);
+    p += n + 1;
+
+    /* Exactly one function digit, nothing after it */
+    if (!qemu_isdigit(*p) || p[1] != '\0') {
+        return -1;
+    }
+
+    func = *p - '0';
+
+    if (seg < 0 || bus < 0 || bus > 0xff || dev < 0 || func > 7) {
+        return -1;
+    }
+
+    ptr->seg = seg;
+    ptr->bus = bus;
+    ptr->dev = dev;
+    ptr->func = func;
+    return 0;
+}
+
+/*
+ * Format the host PCI address stored in @prop as "ssss:bb:dd.f" into
+ * @dest (at most @len bytes) and return snprintf()'s result.
+ */
+static int print_hostaddr(DeviceState *qdev, Property *prop,
+                          char *dest, size_t len)
+{
+    PCIHostDevice *host = qdev_get_prop_ptr(qdev, prop);
+    int written;
+
+    written = snprintf(dest, len, "%04x:%02x:%02x.%x",
+                       host->seg, host->bus, host->dev, host->func);
+    return written;
+}
+
+/*
+ * MSI-X
+ */
+/*
+ * Read @len bytes from the in-memory MSI-X table (vdev->msix.table).
+ * Only the low 12 bits of @addr are used as the byte offset.
+ * NOTE(review): the copy is not bounds-checked against the table's
+ * allocation, and for len < 4 the result is only right on little-endian
+ * hosts -- confirm both are acceptable.
+ */
+static uint32_t msix_mmio_read(VFIODevice *vdev,
+                               target_phys_addr_t addr, int len)
+{
+    uint8_t *table_bytes = (uint8_t *)vdev->msix.table;
+    unsigned int table_off = addr & 0xfff;
+    uint32_t result = 0;
+
+    memcpy(&result, (void *)&table_bytes[table_off], len);
+    DPRINTF("%s(%04x:%02x:%02x.%x, 0x%lx, 0x%x) = 0x%x\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, addr, len, result);
+    return result;
+}
+
+/* 4-byte read handler for the MSI-X table overlay; @opaque is the VFIODevice */
+static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr)
+{
+    VFIODevice *vdev = opaque;
+
+    return msix_mmio_read(vdev, addr, 4);
+}
+
+/* 2-byte read handler for the MSI-X table overlay */
+static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr)
+{
+    VFIODevice *vdev = opaque;
+
+    return msix_mmio_read(vdev, addr, 2);
+}
+
+/* 1-byte read handler for the MSI-X table overlay */
+static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr)
+{
+    VFIODevice *vdev = opaque;
+
+    return msix_mmio_read(vdev, addr, 1);
+}
+
+/* Read handlers ordered byte, word, long */
+static CPUReadMemoryFunc *msix_mmio_reads[] = {
+    msix_mmio_readb,
+    msix_mmio_readw,
+    msix_mmio_readl
+};
+
+/*
+ * Store @len bytes of @val into the in-memory MSI-X table at the offset
+ * given by the low 12 bits of @addr.  When MSI-X is enabled and the
+ * access starts at byte 0xc of a 16-byte entry (the last dword, which
+ * the error message calls "ctrl"), the same bytes are also written
+ * through to the device's table via the vfio fd.
+ */
+static void msix_mmio_write(VFIODevice *vdev, target_phys_addr_t addr,
+                            uint32_t val, int len)
+{
+    uint8_t *table_bytes = (uint8_t *)vdev->msix.table;
+    unsigned int table_off = addr & 0xfff;
+    uint64_t dev_off;
+
+    memcpy((void *)&table_bytes[table_off], &val, len);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, 0x%lx, 0x%x, 0x%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, addr, val, len);
+
+    if ((table_off & 0xf) != 0xc || !vdev->msix.enabled) {
+        return;
+    }
+
+    dev_off = vdev->msix.bar_offset + table_off +
+              vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE +
+                                       vdev->msix.bar);
+    if (pwrite(vdev->vfiofd, &val, len, dev_off) != len) {
+        fprintf(stderr, "vfio: Error: Failed to update MSIX table ctrl\n");
+    }
+}
+
+/* 4-byte write handler for the MSI-X table overlay; @opaque is the VFIODevice */
+static void msix_mmio_writel(void *opaque,
+                             target_phys_addr_t addr, uint32_t val)
+{
+    VFIODevice *vdev = opaque;
+
+    msix_mmio_write(vdev, addr, val, 4);
+}
+
+/* 2-byte write handler for the MSI-X table overlay */
+static void msix_mmio_writew(void *opaque,
+                             target_phys_addr_t addr, uint32_t val)
+{
+    VFIODevice *vdev = opaque;
+
+    msix_mmio_write(vdev, addr, val, 2);
+}
+
+/* 1-byte write handler for the MSI-X table overlay */
+static void msix_mmio_writeb(void *opaque,
+                             target_phys_addr_t addr, uint32_t val)
+{
+    VFIODevice *vdev = opaque;
+
+    msix_mmio_write(vdev, addr, val, 1);
+}
+
+/* Write handlers ordered byte, word, long */
+static CPUWriteMemoryFunc *msix_mmio_writes[] = {
+    msix_mmio_writeb,
+    msix_mmio_writew,
+    msix_mmio_writel
+};
+
+/*
+ * fd handler for one MSI-X vector's event notifier.  If the notifier
+ * was signalled, deliver the interrupt by storing the table entry's
+ * data word at the 64-bit address programmed in that entry.
+ */
+static void vfio_msix_interrupt(void *opaque)
+{
+    MSIXEvent *event = opaque;
+    uint64_t msg_addr;
+    uint32_t msg_data;
+
+    if (!event_notifier_test_and_clear(&event->notifier)) {
+        return;
+    }
+
+    /* Table fields are stored little-endian */
+    msg_addr = le32_to_cpu(event->entry->upper_addr);
+    msg_addr = (msg_addr << 32) | le32_to_cpu(event->entry->addr);
+    msg_data = le32_to_cpu(event->entry->data);
+    DPRINTF("%s: 0x%x -> 0x%lx\n", __FUNCTION__, msg_data, msg_addr);
+    stl_phys(msg_addr, msg_data);
+}
+
+/*
+ * Enable MSI-X delivery for @vdev.
+ *
+ * Counts the leading table entries with a non-zero address, creates one
+ * event notifier per such vector, hands the eventfds to the vfio driver
+ * (VFIO_EVENTFDS_MSIX: fds[0] is the count, fds[1..n] the eventfds) and
+ * then pushes each entry's ctrl word to the device table.
+ *
+ * On any setup failure every notifier and handler created so far is torn
+ * down again and MSI-X stays disabled, so nothing is leaked.
+ */
+static void vfio_enable_msix(VFIODevice *vdev)
+{
+    int i, vectors, *fds;
+    uint64_t off = vdev->msix.bar_offset +
+                   vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE +
+                                            vdev->msix.bar);
+
+    /* Hmm, it's probably possible for a driver to set up fewer than
+     * the full table of vectors... right?
+     */
+    for (i = 0; i < vdev->msix.table_len; i++) {
+        if (!vdev->msix.table[i].addr) {
+            break;
+        }
+    }
+
+    vectors = i;
+    if (!vectors) {
+        fprintf(stderr, "vfio: Error: no MSIX vectors enabled\n");
+        return;
+    }
+
+    vdev->msix.events = qemu_mallocz(vectors * sizeof(MSIXEvent));
+    vdev->msix.num_events = vectors;
+    fds = qemu_malloc((vectors + 1) * sizeof(int));
+    fds[0] = vectors;
+
+    for (i = 0; i < vectors; i++) {
+        vdev->msix.events[i].entry = &vdev->msix.table[i];
+        if (event_notifier_init(&vdev->msix.events[i].notifier, 0)) {
+            fprintf(stderr, "vfio: Error: event_notifier_init failed\n");
+            goto fail;
+        }
+
+        fds[i + 1] = event_notifier_get_fd(&vdev->msix.events[i].notifier);
+        qemu_set_fd_handler(fds[i + 1], vfio_msix_interrupt, NULL,
+                            &vdev->msix.events[i]);
+    }
+
+    if (ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, fds)) {
+        fprintf(stderr, "vfio: Error: Failed to setup MSIX fds %s\n",
+                strerror(errno));
+        goto fail;
+    }
+
+    qemu_free(fds);
+
+    for (i = 0; i < vectors; i++) {
+        MSIXTableEntry *te = &vdev->msix.table[i];
+        if (pwrite(vdev->vfiofd, &te->ctrl, sizeof(te->ctrl),
+                   off + (i * sizeof(MSIXTableEntry)) +
+                   offsetof(MSIXTableEntry, ctrl)) != sizeof(te->ctrl)) {
+            fprintf(stderr, "vfio: Error: Failed to update MSIX table ctrl\n");
+        }
+    }
+    vdev->msix.enabled = 1;
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func);
+    return;
+
+fail:
+    /* Exactly i notifiers were fully initialised; undo those */
+    while (i--) {
+        qemu_set_fd_handler(fds[i + 1], NULL, NULL, NULL);
+        event_notifier_cleanup(&vdev->msix.events[i].notifier);
+    }
+    qemu_free(fds);
+    qemu_free(vdev->msix.events);
+    vdev->msix.events = NULL;
+    vdev->msix.num_events = 0;
+}
+
+/*
+ * Tear down MSI-X delivery for @vdev; a no-op unless MSI-X is enabled.
+ * Removes the fd handler and cleans up the notifier of every vector,
+ * issues VFIO_EVENTFDS_MSIX with a vector count of 0, then frees the
+ * event array and clears the enabled state.
+ */
+static void vfio_disable_msix(VFIODevice *vdev)
+{
+    uint32_t vectors = 0;
+    int i;
+
+    if (!vdev->msix.enabled) {
+        return;
+    }
+
+    for (i = 0; i < vdev->msix.num_events; i++) {
+        int fd = event_notifier_get_fd(&vdev->msix.events[i].notifier);
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+        event_notifier_cleanup(&vdev->msix.events[i].notifier);
+    }
+
+    /* Return value is ignored: nothing useful can be done on failure here */
+    ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, &vectors);
+    qemu_free(vdev->msix.events);
+    vdev->msix.events = NULL;
+    vdev->msix.num_events = 0;
+    vdev->msix.enabled = 0;
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func);
+}
+
+/*
+ * MSI
+ */
+/*
+ * fd handler for an MSI event notifier.  If the notifier was signalled,
+ * deliver the interrupt by storing the MSI data word at the message
+ * address read from config space (the upper half is used only when the
+ * event has an upper_addr pointer, i.e. for 64-bit capable MSI).
+ */
+static void vfio_msi_interrupt(void *opaque)
+{
+    MSIEvent *event = opaque;
+    uint64_t msg_addr;
+    uint32_t msg_data;
+
+    if (!event_notifier_test_and_clear(&event->notifier)) {
+        return;
+    }
+
+    msg_addr = pci_get_long(event->addr);
+    if (event->upper_addr) {
+        msg_addr |= (uint64_t)pci_get_long(event->upper_addr) << 32;
+    }
+    msg_data = pci_get_word(event->data);
+    DPRINTF("%s: 0x%x -> 0x%lx\n", __FUNCTION__, msg_data, msg_addr);
+    stl_phys(msg_addr, msg_data);
+}
+
+/*
+ * Enable MSI delivery for @vdev.
+ *
+ * Reads the MSI Message Control word to learn how many vectors the guest
+ * configured and whether 64-bit addressing is in use, creates an event
+ * notifier per vector pointing at the address/data fields in the emulated
+ * config space, and registers the eventfd with the vfio driver.
+ *
+ * The vfio driver currently supports a single MSI vector, so a request
+ * for more aborts before any resources are set up.  On any other setup
+ * failure the notifiers and handlers created so far are released and MSI
+ * stays disabled.
+ */
+static void vfio_enable_msi(VFIODevice *vdev)
+{
+    int i, vectors, *fds;
+    uint16_t ctrl = vfio_pci_read_config(&vdev->pdev,
+                                         vdev->msi.pos + PCI_MSI_FLAGS,
+                                         sizeof(ctrl));
+    /*
+     * ctrl is 16 bits wide, so use the 16-bit conversion (as
+     * vfio_unmask_intx() does); the 32-bit one yields 0 on big-endian.
+     * NOTE(review): confirm vfio_pci_read_config() does not already
+     * return a host-endian value, in which case no conversion is needed.
+     */
+    ctrl = le16_to_cpu(ctrl);
+    vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);
+
+    if (vectors > 32) {
+        fprintf(stderr, "vfio: Error: Invalid configured MSI vectors %d\n",
+                vectors);
+        return;
+    }
+
+    /* FIXME: current vfio only supports 1 MSI */
+    if (vectors > 1) {
+        fprintf(stderr, "vfio: Error: only support 1 MSI vector, want %d\n",
+                vectors);
+        abort();
+    }
+
+    vdev->msi.events = qemu_mallocz(vectors * sizeof(MSIEvent));
+    vdev->msi.num_events = vectors;
+    fds = qemu_malloc((vectors + 1) * sizeof(int));
+    fds[0] = vectors;
+
+    for (i = 0; i < vectors; i++) {
+        vdev->msi.events[i].addr = vdev->pdev.config +
+                                   vdev->msi.pos + PCI_MSI_ADDRESS_LO;
+        if (ctrl & PCI_MSI_FLAGS_64BIT) {
+            vdev->msi.events[i].upper_addr = vdev->pdev.config +
+                                             vdev->msi.pos +
+                                             PCI_MSI_ADDRESS_HI;
+            vdev->msi.events[i].data = vdev->pdev.config +
+                                             vdev->msi.pos + PCI_MSI_DATA_64;
+        } else {
+            vdev->msi.events[i].upper_addr = NULL;
+            vdev->msi.events[i].data = vdev->pdev.config +
+                                             vdev->msi.pos + PCI_MSI_DATA_32;
+        }
+
+        if (event_notifier_init(&vdev->msi.events[i].notifier, 0)) {
+            fprintf(stderr, "vfio: Error: event_notifier_init failed\n");
+            goto fail;
+        }
+        fds[i + 1] = event_notifier_get_fd(&vdev->msi.events[i].notifier);
+        qemu_set_fd_handler(fds[i + 1], vfio_msi_interrupt, NULL,
+                            &vdev->msi.events[i]);
+    }
+
+    if (ioctl(vdev->vfiofd, VFIO_EVENTFD_MSI, &fds[1])) {
+        fprintf(stderr, "vfio: Error: Failed to setup MSI fds %s\n",
+                strerror(errno));
+        goto fail;
+    }
+
+    qemu_free(fds);
+    vdev->msi.enabled = 1;
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func);
+    return;
+
+fail:
+    /* Exactly i notifiers were fully initialised; undo those */
+    while (i--) {
+        qemu_set_fd_handler(fds[i + 1], NULL, NULL, NULL);
+        event_notifier_cleanup(&vdev->msi.events[i].notifier);
+    }
+    qemu_free(fds);
+    qemu_free(vdev->msi.events);
+    vdev->msi.events = NULL;
+    vdev->msi.num_events = 0;
+}
+
+/*
+ * Tear down MSI delivery for @vdev; a no-op unless MSI is enabled.
+ * Removes the fd handler and cleans up every notifier, issues
+ * VFIO_EVENTFD_MSI with the value -1, then frees the event array and
+ * clears the enabled state.
+ */
+static void vfio_disable_msi(VFIODevice *vdev)
+{
+    /* presumably -1 tells the driver to drop its eventfd -- TODO confirm */
+    uint32_t vectors = -1;
+    int i;
+
+    if (!vdev->msi.enabled) {
+        return;
+    }
+
+    for (i = 0; i < vdev->msi.num_events; i++) {
+        int fd = event_notifier_get_fd(&vdev->msi.events[i].notifier);
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+        event_notifier_cleanup(&vdev->msi.events[i].notifier);
+    }
+
+    /* Return value is ignored: nothing useful can be done on failure here */
+    ioctl(vdev->vfiofd, VFIO_EVENTFD_MSI, &vectors);
+    qemu_free(vdev->msi.events);
+    vdev->msi.events = NULL;
+    vdev->msi.num_events = 0;
+    vdev->msi.enabled = 0;
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func);
+}
+
+/*
+ * INTx
+ */
+/*
+ * Re-enable INTx on the device by clearing PCI_COMMAND_INTX_DISABLE in
+ * the command register through the vfio config-space accessors.
+ */
+static void vfio_unmask_intx(VFIODevice *vdev)
+{
+    uint16_t raw_cmd, host_cmd;
+
+    raw_cmd = vfio_pci_read_config(&vdev->pdev, PCI_COMMAND, sizeof(raw_cmd));
+    host_cmd = le16_to_cpu(raw_cmd);
+    host_cmd &= ~PCI_COMMAND_INTX_DISABLE;
+    raw_cmd = cpu_to_le16(host_cmd);
+    vfio_pci_write_config(&vdev->pdev, PCI_COMMAND, raw_cmd, sizeof(raw_cmd));
+}
+
+/*
+ * fd handler for the INTx event notifier.  If the notifier was
+ * signalled, mark the interrupt pending and raise the guest's INTx
+ * line; it is lowered again from vfio_eoi().
+ */
+static void vfio_intx_interrupt(void *opaque)
+{
+    VFIODevice *vdev = opaque;
+
+    if (!event_notifier_test_and_clear(&vdev->intx.notifier)) {
+        return;
+    }
+
+    /* DPRINTF takes the format first; a leading stderr breaks debug builds */
+    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, 'A' + vdev->intx.pin);
+
+    vdev->intx.pending = 1;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1);
+}
+
+/*
+ * IOAPIC end-of-interrupt callback for the device's INTx line.  If an
+ * interrupt raised by vfio_intx_interrupt() is pending, clear the
+ * pending flag, lower the guest IRQ and unmask INTx on the device.
+ */
+static void vfio_eoi(ioapic_eoi_client *client)
+{
+    VFIODevice *vdev = container_of(client, VFIODevice, intx.eoi_client);
+
+    /* EOI was not for an interrupt we injected */
+    if (!vdev->intx.pending) {
+        return;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func);
+
+    vdev->intx.pending = 0;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+    /* presumably the host driver masked INTx when it fired -- TODO confirm */
+    vfio_unmask_intx(vdev);
+}
+
+/*
+ * Set up INTx delivery for @vdev.
+ *
+ * Returns 0 if the device has no interrupt pin or INTx was enabled, and
+ * -1 on failure.  On failure everything registered by this function (EOI
+ * client, event notifier, fd handler) is released again.
+ */
+static int vfio_enable_intx(VFIODevice *vdev)
+{
+    int fd;
+
+    vdev->intx.pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
+    if (!vdev->intx.pin) {
+        return 0;
+    }
+
+    vdev->intx.pin--; /* Pin A (1) -> irq[0] */
+    vdev->intx.eoi_client.eoi = vfio_eoi;
+    vdev->intx.eoi_client.irq = pci_get_byte(vdev->pdev.config +
+                                             PCI_INTERRUPT_LINE);
+    ioapic_register_eoi_client(&vdev->intx.eoi_client);
+
+    if (event_notifier_init(&vdev->intx.notifier, 0)) {
+        fprintf(stderr, "vfio: Error: event_notifier_init failed\n");
+        goto err_eoi;
+    }
+
+    fd = event_notifier_get_fd(&vdev->intx.notifier);
+    qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
+    if (ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &fd)) {
+        fprintf(stderr, "vfio: Error: Failed to setup INTx fd %s\n",
+                strerror(errno));
+        goto err_notifier;
+    }
+    vfio_unmask_intx(vdev);
+    vdev->intx.enabled = 1;
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func);
+
+    return 0;
+
+err_notifier:
+    qemu_set_fd_handler(fd, NULL, NULL, NULL);
+    event_notifier_cleanup(&vdev->intx.notifier);
+err_eoi:
+    ioapic_unregister_eoi_client(&vdev->intx.eoi_client);
+    return -1;
+}
+
+/*
+ * Tear down INTx delivery for @vdev; a no-op unless INTx is enabled.
+ * Unregisters the IOAPIC EOI client, removes the fd handler, cleans up
+ * the notifier and issues VFIO_EVENTFD_IRQ with an fd of -1.
+ */
+static void vfio_disable_intx(VFIODevice *vdev)
+{
+    int fd;
+
+    if (!vdev->intx.enabled) {
+        return;
+    }
+
+    ioapic_unregister_eoi_client(&vdev->intx.eoi_client);
+    fd = event_notifier_get_fd(&vdev->intx.notifier);
+    qemu_set_fd_handler(fd, NULL, NULL, NULL);
+    event_notifier_cleanup(&vdev->intx.notifier);
+    /* presumably -1 tells the driver to drop its eventfd -- TODO confirm */
+    fd = -1;
+    ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &fd);
+    vdev->intx.enabled = 0;
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func);
+}
+
+/*
+ * IO Port/MMIO
+ */
+/*
+ * Write the low @len bytes of @val to offset @addr within the BAR
+ * described by @res, using pwrite() on the vfio device file.  Failures
+ * are reported on stderr; the access is otherwise dropped.
+ *
+ * NOTE(review): passing &val with len < 4 writes the intended bytes only
+ * on little-endian hosts -- confirm big-endian hosts are out of scope.
+ */
+static void vfio_resource_write(PCIResource *res, uint32_t addr,
+                                uint32_t val, int len)
+{
+    /* The region selector lives above bit 32, so this must be 64 bits
+     * wide: size_t would truncate it to 0 on 32-bit hosts. */
+    uint64_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE +
+                                               res->bar);
+
+    if (pwrite(res->vfiofd, &val, len, offset + addr) != len) {
+        fprintf(stderr, "%s(,0x%x, 0x%x, %d) failed: %s\n",
+                __FUNCTION__, addr, val, len, strerror(errno));
+    }
+    DPRINTF("%s(BAR%d+0x%x, 0x%x, %d)\n",
+            __FUNCTION__, res->bar, addr, val, len);
+}
+
+/* 1-byte MMIO write handler; @opaque is the BAR's PCIResource */
+static void vfio_resource_writeb(void *opaque, target_phys_addr_t addr,
+                                 uint32_t val)
+{
+    PCIResource *res = opaque;
+
+    vfio_resource_write(res, addr, val, 1);
+}
+
+/* 2-byte MMIO write handler */
+static void vfio_resource_writew(void *opaque, target_phys_addr_t addr,
+                                 uint32_t val)
+{
+    PCIResource *res = opaque;
+
+    vfio_resource_write(res, addr, val, 2);
+}
+
+/* 4-byte MMIO write handler */
+static void vfio_resource_writel(void *opaque, target_phys_addr_t addr,
+                                 uint32_t val)
+{
+    PCIResource *res = opaque;
+
+    vfio_resource_write(res, addr, val, 4);
+}
+
+/* Write handlers ordered byte, word, long */
+static CPUWriteMemoryFunc * const vfio_resource_writes[] = {
+    vfio_resource_writeb,
+    vfio_resource_writew,
+    vfio_resource_writel
+};
+
+/*
+ * I/O port write handlers.  @addr is an absolute port number, so the
+ * BAR's guest base (res->e_phys) is subtracted to get the BAR offset.
+ */
+static void vfio_ioport_writeb(void *opaque, uint32_t addr, uint32_t val)
+{
+    PCIResource *res = opaque;
+    uint32_t bar_off = addr - res->e_phys;
+
+    vfio_resource_write(res, bar_off, val, 1);
+}
+
+static void vfio_ioport_writew(void *opaque, uint32_t addr, uint32_t val)
+{
+    PCIResource *res = opaque;
+    uint32_t bar_off = addr - res->e_phys;
+
+    vfio_resource_write(res, bar_off, val, 2);
+}
+
+static void vfio_ioport_writel(void *opaque, uint32_t addr, uint32_t val)
+{
+    PCIResource *res = opaque;
+    uint32_t bar_off = addr - res->e_phys;
+
+    vfio_resource_write(res, bar_off, val, 4);
+}
+
+/*
+ * Read @len bytes at offset @addr within the BAR described by @res,
+ * using pread() on the vfio device file.  Returns the value read, or
+ * 0xffffffff (after logging to stderr) if the read fails.
+ *
+ * NOTE(review): reading len < 4 bytes into &val yields the intended
+ * value only on little-endian hosts -- confirm big-endian is out of scope.
+ */
+static uint32_t vfio_resource_read(PCIResource *res, uint32_t addr, int len)
+{
+    /* The region selector lives above bit 32, so this must be 64 bits
+     * wide: size_t would truncate it to 0 on 32-bit hosts. */
+    uint64_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE +
+                                               res->bar);
+    /* Zeroed so bytes beyond @len are defined for short reads */
+    uint32_t val = 0;
+
+    if (pread(res->vfiofd, &val, len, offset + addr) != len) {
+        fprintf(stderr, "%s(,0x%x, %d) failed: %s\n",
+                __FUNCTION__, addr, len, strerror(errno));
+        return 0xffffffffU;
+    }
+    DPRINTF("%s(BAR%d+0x%x, %d) = 0x%x\n",
+            __FUNCTION__, res->bar, addr, len, val);
+    return val;
+}
+
+/* 1-byte MMIO read handler; @opaque is the BAR's PCIResource */
+static uint32_t vfio_resource_readb(void *opaque, target_phys_addr_t addr)
+{
+    PCIResource *res = opaque;
+    uint32_t val = vfio_resource_read(res, addr, 1);
+
+    return val & 0xff;
+}
+
+/* 2-byte MMIO read handler */
+static uint32_t vfio_resource_readw(void *opaque, target_phys_addr_t addr)
+{
+    PCIResource *res = opaque;
+    uint32_t val = vfio_resource_read(res, addr, 2);
+
+    return val & 0xffff;
+}
+
+/* 4-byte MMIO read handler */
+static uint32_t vfio_resource_readl(void *opaque, target_phys_addr_t addr)
+{
+    PCIResource *res = opaque;
+
+    return vfio_resource_read(res, addr, 4);
+}
+
+/* Read handlers ordered byte, word, long */
+static CPUReadMemoryFunc * const vfio_resource_reads[] = {
+    vfio_resource_readb,
+    vfio_resource_readw,
+    vfio_resource_readl
+};
+
+/*
+ * I/O port read handlers.  @addr is an absolute port number, so the
+ * BAR's guest base (res->e_phys) is subtracted to get the BAR offset;
+ * sub-dword results are masked to the access width.
+ */
+static uint32_t vfio_ioport_readb(void *opaque, uint32_t addr)
+{
+    PCIResource *res = opaque;
+    uint32_t bar_off = addr - res->e_phys;
+
+    return vfio_resource_read(res, bar_off, 1) & 0xff;
+}
+
+static uint32_t vfio_ioport_readw(void *opaque, uint32_t addr)
+{
+    PCIResource *res = opaque;
+    uint32_t bar_off = addr - res->e_phys;
+
+    return vfio_resource_read(res, bar_off, 2) & 0xffff;
+}
+
+static uint32_t vfio_ioport_readl(void *opaque, uint32_t addr)
+{
+    PCIResource *res = opaque;
+    uint32_t bar_off = addr - res->e_phys;
+
+    return vfio_resource_read(res, bar_off, 4);
+}
+
+/*
+ * BAR map callback for I/O port BARs: record where the guest placed the
+ * BAR and register 1-, 2- and 4-byte port handlers over that range, with
+ * the BAR's PCIResource as their opaque argument.
+ */
+static void vfio_ioport_map(PCIDevice *pdev, int bar,
+                           pcibus_t e_phys, pcibus_t e_size, int type)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    PCIResource *region = &vdev->resources[bar];
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus,
+            vdev->host.dev, vdev->host.func, bar, e_phys, e_size, type);
+
+    region->e_phys = e_phys;
+    region->e_size = e_size;
+
+    /* Writes first, then reads, one handler per access width */
+    register_ioport_write(e_phys, e_size, 1, vfio_ioport_writeb, region);
+    register_ioport_write(e_phys, e_size, 2, vfio_ioport_writew, region);
+    register_ioport_write(e_phys, e_size, 4, vfio_ioport_writel, region);
+    register_ioport_read(e_phys, e_size, 1, vfio_ioport_readb, region);
+    register_ioport_read(e_phys, e_size, 2, vfio_ioport_readw, region);
+    register_ioport_read(e_phys, e_size, 4, vfio_ioport_readl, region);
+}
+
+/*
+ * BAR map callback for memory BARs: record where the guest placed the
+ * BAR and register the backing memory at that guest physical range.
+ *
+ * A BAR flagged res->msix is registered in up to three pieces: the part
+ * before the MSI-X table (memory_index[0]), one page for the table
+ * itself (vdev->msix.index) and the remainder (memory_index[1]).  Other
+ * BARs are registered whole, using res->io_mem when res->slow is set and
+ * memory_index[0] otherwise.  A mapping whose size differs from the BAR
+ * size aborts.
+ */
+static void vfio_iomem_map(PCIDevice *pdev, int bar,
+                           pcibus_t e_phys, pcibus_t e_size, int type)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    PCIResource *res = &vdev->resources[bar];
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus,
+            vdev->host.dev, vdev->host.func, bar, e_phys, e_size, type);
+
+    res->e_phys = e_phys;
+    res->e_size = e_size;
+
+    /* Nothing to register for a zero-sized mapping */
+    if (e_size == 0) {
+        return;
+    }
+
+    if (e_size != res->size) {
+        fprintf(stderr, "vfio: Error: partial BAR map?\n");
+        abort();
+    }
+
+    if (res->msix) {
+        if (res->msix_offset > 0) {
+            cpu_register_physical_memory(e_phys, res->msix_offset,
+                                         res->memory_index[0]);
+        }
+
+        DPRINTF("Overlaying MSI-X table page\n");
+        /* NOTE(review): TARGET_PAGE_SIZE here vs. the literal 0x1000
+         * below -- these only agree for 4k target pages; confirm. */
+        cpu_register_physical_memory(e_phys + res->msix_offset,
+                                     TARGET_PAGE_SIZE, vdev->msix.index);
+
+        if (res->size > res->msix_offset + 0x1000) {
+            cpu_register_physical_memory(e_phys + res->msix_offset + 0x1000,
+                                         res->size - res->msix_offset - 0x1000,
+                                         res->memory_index[1]);
+        }
+    } else {
+        if (!res->slow) {
+            cpu_register_physical_memory(e_phys, e_size, res->memory_index[0]);
+        } else {
+            cpu_register_physical_memory(e_phys, e_size, res->io_mem);
+        }
+    }
+}
+
+/*
+ * PCI config space
+ */
+/*
+ * Config space read hook.  The host device's config space is read through
+ * the vfio fd, but that value is only used for the debug trace; the value
+ * returned to the caller comes from pci_default_read_config().  If the
+ * host read fails, -1 (all ones) is returned instead.
+ */
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    uint32_t host_val = 0;
+    ssize_t nread;
+
+    nread = pread(vdev->vfiofd, &host_val, len, VFIO_PCI_CONFIG_OFF + addr);
+    if (nread != len) {
+        fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %s\n",
+                __FUNCTION__, vdev->host.seg, vdev->host.bus,
+                vdev->host.dev, vdev->host.func, addr, len, strerror(errno));
+        return -1;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) %x\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus,
+            vdev->host.dev, vdev->host.func, addr, len, host_val);
+
+    return pci_default_read_config(pdev, addr, len);
+}
+
+/*
+ * Config space write hook: forward the write to the host device through
+ * the vfio fd, follow MSI / MSI-X enable-bit transitions and
+ * INTERRUPT_LINE updates, then let pci_default_write_config() update the
+ * emulated config space.  A failed host write is reported but does not
+ * stop the rest of the processing.
+ */
+static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
+                                  uint32_t val, int len)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    /*
+     * Config space data read from the vfio fd is treated as little endian
+     * elsewhere in this file (le16/le32_to_cpu), so hand the kernel a
+     * little-endian image of the value.  Without this a write shorter
+     * than four bytes would send the wrong bytes on a big-endian host.
+     */
+    uint32_t val_le = cpu_to_le32(val);
+
+    if (pwrite(vdev->vfiofd, &val_le, len,
+               VFIO_PCI_CONFIG_OFF + addr) != len) {
+        fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %s\n",
+                __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+                vdev->host.func, addr, val, len, strerror(errno));
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x)\n",
+            __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, addr, val, len);
+
+    /* Switch MSI-X on or off when the guest toggles the enable bit. */
+    if (vdev->msix.pos && (addr == vdev->msix.pos + PCI_MSIX_FLAGS)) {
+        if (vdev->msix.enabled) {
+            if (!(val & PCI_MSIX_FLAGS_ENABLE)) {
+                vfio_disable_msix(vdev);
+            }
+        } else {
+            if (val & PCI_MSIX_FLAGS_ENABLE) {
+                vfio_enable_msix(vdev);
+            }
+        }
+    }
+
+    /* Likewise for MSI. */
+    if (vdev->msi.pos && (addr == vdev->msi.pos + PCI_MSI_FLAGS)) {
+        if (vdev->msi.enabled) {
+            if (!(val & PCI_MSI_FLAGS_ENABLE)) {
+                vfio_disable_msi(vdev);
+            }
+        } else {
+            if (val & PCI_MSI_FLAGS_ENABLE) {
+                vfio_enable_msi(vdev);
+            }
+        }
+    }
+
+    if (addr == PCI_INTERRUPT_LINE) {
+        if (len != 1) {
+            fprintf(stderr, "vfio: fixme: INTERRUPT_LINE written as %d bytes\n",
+                    len);
+        }
+        /*
+         * INTERRUPT_LINE is a single byte; a wider write also carries the
+         * neighbouring registers, which must not leak into the irq number.
+         */
+        vdev->intx.eoi_client.irq = val & 0xff;
+    }
+
+    pci_default_write_config(pdev, addr, val, len);
+}
+
+/*
+ * DMA
+ */
+/* Fill in the DMA map/unmap request that describes one guest RAM slot. */
+static void vfio_fill_dma_map(struct vfio_dma_map *dma_map, QemuRamSlot *slot)
+{
+    /* Zero first so no uninitialised stack bytes are handed to the kernel. */
+    memset(dma_map, 0, sizeof(*dma_map));
+    /* Go through uintptr_t so the cast is well defined on 32-bit hosts. */
+    dma_map->vaddr = (uint64_t)(uintptr_t)qemu_get_ram_ptr(slot->offset);
+    dma_map->dmaaddr = slot->start_addr;
+    dma_map->size = slot->size;
+    dma_map->flags = VFIO_FLAG_WRITE;
+}
+
+/*
+ * Map (map != 0) or unmap (map == 0) every slot on the ram_slots list
+ * for DMA through the vfio fd.
+ *
+ * Unmapping is best effort and always returns 0.  Mapping returns -1 as
+ * soon as one slot fails; the slots mapped before the failure are
+ * unmapped again so the caller is not left with a partial mapping.
+ */
+static int vfio_do_map_iommu(VFIODevice *vdev, int map)
+{
+    QemuRamSlot *slot;
+
+    QLIST_FOREACH(slot, &ram_slots.slots, next) {
+        struct vfio_dma_map dma_map;
+
+        vfio_fill_dma_map(&dma_map, slot);
+
+        if (!map) {
+            ioctl(vdev->vfiofd, VFIO_DMA_UNMAP, &dma_map);
+            continue;
+        }
+
+        if (ioctl(vdev->vfiofd, VFIO_DMA_MAP_IOVA, &dma_map)) {
+            QemuRamSlot *undo;
+
+            /* Roll back everything mapped ahead of the failing slot. */
+            QLIST_FOREACH(undo, &ram_slots.slots, next) {
+                if (undo == slot) {
+                    break;
+                }
+                vfio_fill_dma_map(&dma_map, undo);
+                ioctl(vdev->vfiofd, VFIO_DMA_UNMAP, &dma_map);
+            }
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/* Map guest RAM for device DMA; returns vfio_do_map_iommu()'s result. */
+static int vfio_map_iommu(VFIODevice *vdev)
+{
+    const int map = 1;
+
+    return vfio_do_map_iommu(vdev, map);
+}
+
+/* Undo vfio_map_iommu(); the result of the unmap pass is ignored. */
+static void vfio_unmap_iommu(VFIODevice *vdev)
+{
+    const int map = 0;
+
+    (void)vfio_do_map_iommu(vdev, map);
+}
+
+/*
+ * Interrupt setup
+ */
+/*
+ * Locate the MSI and MSI-X capabilities and prepare MSI-X table emulation.
+ *
+ * For MSI only the capability offset is recorded.  For MSI-X the table
+ * size, BAR and offset are read from the host config space, the BAR is
+ * flagged so it is later mapped around the table page, and a zeroed 4k
+ * page plus an I/O memory region are set up to stand in for the table.
+ *
+ * Returns 0 on success (also when neither capability is present) and -1
+ * on error.
+ */
+static int vfio_setup_msi(VFIODevice *vdev)
+{
+    int pos;
+
+    if ((pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSI))) {
+        DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @%d\n", vdev->host.seg,
+                vdev->host.bus, vdev->host.dev, vdev->host.func, pos);
+        vdev->msi.pos = pos;
+    }
+
+    if ((pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSIX))) {
+        uint16_t ctrl;
+        uint32_t table, pba, len;
+
+        if (pread(vdev->vfiofd, &ctrl, sizeof(ctrl),
+                  VFIO_PCI_CONFIG_OFF + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
+            return -1;
+        }
+
+        if (pread(vdev->vfiofd, &table, sizeof(table), VFIO_PCI_CONFIG_OFF +
+                  pos + PCI_MSIX_TABLE) != sizeof(table)) {
+            return -1;
+        }
+
+        if (pread(vdev->vfiofd, &pba, sizeof(pba),
+                  VFIO_PCI_CONFIG_OFF + pos + PCI_MSIX_PBA) != sizeof(pba)) {
+            return -1;
+        }
+
+        ctrl = le16_to_cpu(ctrl);
+        table = le32_to_cpu(table);
+        pba = le32_to_cpu(pba);
+
+        vdev->msix.pos = pos;
+        vdev->msix.table_len = (ctrl & PCI_MSIX_TABSIZE) + 1;
+        vdev->msix.bar = table & PCI_MSIX_BIR;
+        vdev->msix.bar_offset = table & ~PCI_MSIX_BIR;
+        vdev->resources[vdev->msix.bar].msix = 1;
+        vdev->resources[vdev->msix.bar].msix_offset = vdev->msix.bar_offset;
+
+        DPRINTF("%04x:%02x:%02x.%x PCI MSI-X CAP @%d, BAR %d, offset 0x%x\n",
+                vdev->host.seg, vdev->host.bus, vdev->host.dev,
+                vdev->host.func, pos, vdev->msix.bar, vdev->msix.bar_offset);
+
+        /*
+         * Compare page numbers on both sides: the table offset need not
+         * be page aligned, so comparing the PBA's page against the raw
+         * table offset would miss a table and PBA sharing a page.
+         */
+        if ((pba & PCI_MSIX_BIR) == vdev->msix.bar &&
+            ((pba & ~0xfff) == (vdev->msix.bar_offset & ~0xfff))) {
+            fprintf(stderr, "vfio: Error: MSIX Table & PBA reside in the same "
+                    "page, not yet supported\n");
+            return -1;
+        }
+
+        /*
+         * Check if the BAR containing the MSIX table is 4k aligned, if
+         * so we can avoid slow mapping messiness.  This shouldn't fail
+         * for devices that follow the spec recommendations for sizing
+         * and placement. */
+        len = vdev->msix.bar;
+        if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) {
+            fprintf(stderr, "vfio: VFIO_BAR_LEN failed for MSIX BAR\n");
+            return -1;
+        }
+        if (!len || len & 0xfff) {
+            fprintf(stderr, "vfio: MSIX BAR not 4k aligned\n");
+            return -1;
+        }
+
+        /* Anonymous mappings take fd -1; some systems reject anything else. */
+        vdev->msix.table = mmap(NULL, 0x1000, PROT_READ|PROT_WRITE,
+                                MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+        if (vdev->msix.table == MAP_FAILED) {
+            fprintf(stderr, "vfio: Failed to allocate MSIX table page\n");
+            /* Do not leave MAP_FAILED behind for vfio_teardown_msi(). */
+            vdev->msix.table = NULL;
+            return -1;
+        }
+
+        memset(vdev->msix.table, 0, 0x1000);
+        vdev->msix.index = cpu_register_io_memory(msix_mmio_reads,
+                                                  msix_mmio_writes, vdev);
+    }
+    return 0;
+}
+
+/*
+ * Release what vfio_setup_msi() allocated for MSI-X emulation: the 4k
+ * table page and the I/O memory region.  The fields are cleared after
+ * release so a second call is a no-op.
+ */
+static void vfio_teardown_msi(VFIODevice *vdev)
+{
+    if (vdev->msix.table && vdev->msix.table != MAP_FAILED) {
+        munmap(vdev->msix.table, 0x1000);
+    }
+    vdev->msix.table = NULL;
+
+    if (vdev->msix.index) {
+        cpu_unregister_io_memory(vdev->msix.index);
+        vdev->msix.index = 0;
+    }
+}
+
+/*
+ * Resource setup
+ */
+/*
+ * Probe every PCI region of the host device and register a matching BAR
+ * with the emulated device.
+ *
+ * Memory BARs whose length is a multiple of 4k are mmap()ed from the vfio
+ * fd (around the MSI-X table page when the BAR is flagged as holding it);
+ * other memory BARs use the slow read/write I/O callbacks; I/O port BARs
+ * are registered with vfio_ioport_map.  Regions of length zero are
+ * skipped.
+ *
+ * Returns 0 on success, -1 on the first failure.
+ */
+static int vfio_setup_resources(VFIODevice *vdev)
+{
+    int i;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        uint32_t len, bar;
+        PCIResource *res;
+        uint8_t offset;
+        int ret, space;
+
+        res = &vdev->resources[i];
+        res->vfiofd = vdev->vfiofd;
+        res->bar = len = i;
+
+        /* VFIO_BAR_LEN takes the region index in and returns its length. */
+        if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) {
+            fprintf(stderr, "vfio: VFIO_BAR_LEN failed for BAR %d\n", i);
+            return -1;
+        }
+        if (!len) {
+            continue;
+        }
+
+        /*
+         * The expansion ROM register lives at PCI_ROM_ADDRESS, not after
+         * the six standard BARs, so PCI_BASE_ADDRESS_0 + 4 * i would read
+         * an unrelated register for the ROM slot.
+         */
+        if (i == PCI_ROM_SLOT) {
+            offset = PCI_ROM_ADDRESS;
+        } else {
+            offset = PCI_BASE_ADDRESS_0 + (4 * i);
+        }
+        ret = pread(vdev->vfiofd, &bar, sizeof(bar),
+                    VFIO_PCI_CONFIG_OFF + offset);
+        if (ret != sizeof(bar)) {
+            fprintf(stderr, "vfio: Failed to read BAR %d\n", i);
+            return -1;
+        }
+        bar = le32_to_cpu(bar);
+        if (i == PCI_ROM_SLOT) {
+            /*
+             * The ROM register has no BAR type bits (bit 0 is the enable
+             * bit), so treat it as a plain 32-bit, non-prefetchable
+             * memory region rather than decoding its low bits.
+             */
+            bar = PCI_BASE_ADDRESS_SPACE_MEMORY;
+        }
+        space = bar & PCI_BASE_ADDRESS_SPACE;
+
+        if (space == PCI_BASE_ADDRESS_SPACE_MEMORY && !(len & 0xfff)) {
+            int off = VFIO_PCI_BAR0_RESOURCE + i;
+            int flags = PROT_READ;
+            char name[32];
+
+            res->mem = 1;
+            res->size = len;
+
+            /* The ROM is mapped read-only. */
+            if (i != PCI_ROM_SLOT) {
+                flags |= PROT_WRITE;
+            }
+
+            if (vdev->pdev.qdev.info->vmsd) {
+                snprintf(name, sizeof(name), "%s.bar%d",
+                         vdev->pdev.qdev.info->vmsd->name, i);
+            } else {
+                snprintf(name, sizeof(name), "%s.bar%d",
+                         vdev->pdev.qdev.info->name, i);
+            }
+
+            if (res->msix) {
+                /* Map the BAR in two pieces, leaving out the table page. */
+                if (res->msix_offset) {
+                    char *c = &name[strlen(name)];
+
+                    res->r_virtbase[0] = mmap(NULL, res->msix_offset, flags,
+                                              MAP_SHARED, vdev->vfiofd,
+                                              vfio_pci_space_to_offset(off));
+
+                    if (res->r_virtbase[0] == MAP_FAILED) {
+                        fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i);
+                        return -1;
+                    }
+                    /* strncat's bound is the room left, not the buffer size. */
+                    strncat(name, ".0", sizeof(name) - strlen(name) - 1);
+                    res->memory_index[0] = qemu_ram_map(&vdev->pdev.qdev,
+                                                        name, res->msix_offset,
+                                                        res->r_virtbase[0]);
+                    *c = 0; /* drop the suffix again */
+                }
+                if (len > res->msix_offset + 0x1000) {
+                    char *c = &name[strlen(name)];
+
+                    res->r_virtbase[1] = mmap(NULL,
+                                              len - res->msix_offset - 0x1000,
+                                              flags, MAP_SHARED, vdev->vfiofd,
+                                              vfio_pci_space_to_offset(off) +
+                                              res->msix_offset + 0x1000);
+
+                    if (res->r_virtbase[1] == MAP_FAILED) {
+                        fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i);
+                        return -1;
+                    }
+                    strncat(name, ".1", sizeof(name) - strlen(name) - 1);
+                    res->memory_index[1] = qemu_ram_map(&vdev->pdev.qdev, name,
+                                                        len - 0x1000 -
+                                                        res->msix_offset,
+                                                        res->r_virtbase[1]);
+                    *c = 0; /* drop the suffix again */
+                }
+            } else {
+                res->r_virtbase[0] = mmap(NULL, len, flags, MAP_SHARED,
+                                          vdev->vfiofd,
+                                          vfio_pci_space_to_offset(off));
+
+                if (res->r_virtbase[0] == MAP_FAILED) {
+                    fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i);
+                    return -1;
+                }
+                res->memory_index[0] = qemu_ram_map(&vdev->pdev.qdev, name,
+                                                    len, res->r_virtbase[0]);
+                if (i == PCI_ROM_SLOT) {
+                    res->memory_index[0] |= IO_MEM_ROM;
+                }
+            }
+
+            pci_register_bar(&vdev->pdev, i, res->size,
+                             bar & PCI_BASE_ADDRESS_MEM_PREFETCH ?
+                             PCI_BASE_ADDRESS_MEM_PREFETCH :
+                             PCI_BASE_ADDRESS_SPACE_MEMORY,
+                             vfio_iomem_map);
+
+            /* A 64-bit BAR also occupies the next slot; skip over it. */
+            if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+                i++;
+            }
+        } else if (space == PCI_BASE_ADDRESS_SPACE_MEMORY) {
+            res->mem = 1;
+            res->size = len;
+            res->slow = 1;
+
+            DPRINTF("%s(%04x:%02x:%02x.%x) Using slow mapping for BAR %d\n",
+                    __FUNCTION__, vdev->host.seg, vdev->host.bus,
+                    vdev->host.dev, vdev->host.func, i);
+
+            /* The ROM gets no write callbacks. */
+            if (i == PCI_ROM_SLOT) {
+                res->io_mem = cpu_register_io_memory(vfio_resource_reads,
+                                                     NULL, res);
+            } else {
+                res->io_mem = cpu_register_io_memory(vfio_resource_reads,
+                                                     vfio_resource_writes, res);
+            }
+
+            pci_register_bar(&vdev->pdev, i, res->size,
+                             bar & PCI_BASE_ADDRESS_MEM_PREFETCH ?
+                             PCI_BASE_ADDRESS_MEM_PREFETCH :
+                             PCI_BASE_ADDRESS_SPACE_MEMORY,
+                             vfio_iomem_map);
+
+        } else if (space == PCI_BASE_ADDRESS_SPACE_IO) {
+            res->size = len;
+            pci_register_bar(&vdev->pdev, i, res->size,
+                             PCI_BASE_ADDRESS_SPACE_IO, vfio_ioport_map);
+        }
+        res->valid = 1;
+    }
+    return 0;
+}
+
+/*
+ * Undo vfio_setup_resources() for every valid memory BAR: remove the BAR
+ * from the guest physical map if it is mapped, then release the mmap()ed
+ * regions or the slow-path I/O memory region.
+ */
+static void vfio_unmap_resources(VFIODevice *vdev)
+{
+    int i;
+    PCIResource *res = vdev->resources;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++, res++) {
+        if (!res->valid || !res->mem) {
+            continue;
+        }
+
+        /*
+         * Only touch the guest physical map when vfio_iomem_map() has
+         * recorded a mapping.  Until then e_phys and e_size are zero, and
+         * unassigning from them would hit guest address 0 and underflow
+         * the size of the piece above the MSI-X table.  vfio_iomem_map()
+         * only accepts whole-BAR mappings, so one call covers every piece
+         * of the BAR, including the MSI-X table page and slow BARs.
+         */
+        if (res->e_size) {
+            cpu_register_physical_memory(res->e_phys, res->e_size,
+                                         IO_MEM_UNASSIGNED);
+        }
+
+        if (res->msix) {
+            /* Piece below the MSI-X table page. */
+            if (res->msix_offset) {
+                qemu_ram_unmap(res->memory_index[0]);
+                munmap(res->r_virtbase[0], res->msix_offset);
+            }
+            /* Piece above the MSI-X table page. */
+            if (res->size > res->msix_offset + 0x1000) {
+                qemu_ram_unmap(res->memory_index[1]);
+                munmap(res->r_virtbase[1],
+                       res->size - 0x1000 - res->msix_offset);
+            }
+        } else if (!res->slow) {
+            qemu_ram_unmap(res->memory_index[0]);
+            munmap(res->r_virtbase[0], res->size);
+        } else {
+            cpu_unregister_io_memory(res->io_mem);
+        }
+    }
+}
+
+/*
+ * General setup
+ */
+/*
+ * Obtain the vfio device file descriptor and store it in vdev->vfiofd.
+ *
+ * If the "vfiofd" property is set it is either a numeric fd or the name
+ * of an fd known to the monitor.  Otherwise the vfio* entry under the
+ * host device's sysfs vfio/ directory is looked up and the matching
+ * /dev node is opened read/write.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int get_vfio_fd(VFIODevice *vdev)
+{
+    if (vdev->vfiofd_name && strlen(vdev->vfiofd_name) > 0) {
+        if (qemu_isdigit(vdev->vfiofd_name[0])) {
+            vdev->vfiofd = strtol(vdev->vfiofd_name, NULL, 0);
+            return 0;
+        } else {
+            vdev->vfiofd = monitor_get_fd(cur_mon, vdev->vfiofd_name);
+            if (vdev->vfiofd < 0) {
+                fprintf(stderr, "%s: (%s) unkown\n", __func__,
+                        vdev->vfiofd_name);
+                return -1;
+            }
+            return 0;
+        }
+    } else {
+        char vfio_dir[64], vfio_dev[64];
+        DIR *dir;
+        struct dirent *de;
+        int n;
+
+        snprintf(vfio_dir, sizeof(vfio_dir),
+                 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/vfio/",
+                 vdev->host.seg, vdev->host.bus,
+                 vdev->host.dev, vdev->host.func);
+        dir = opendir(vfio_dir);
+        if (!dir) {
+            error_report("vfio: error: Driver not attached");
+            return -1;
+        }
+
+        while ((de = readdir(dir))) {
+            if (de->d_name[0] == '.') {
+                continue;
+            }
+            if (!strncmp(de->d_name, "vfio", 4)) {
+                break;
+            }
+        }
+
+        if (!de) {
+            closedir(dir);
+            error_report("vfio: error: Cannot find vfio* in %s", vfio_dir);
+            return -1;
+        }
+
+        /*
+         * de points into the directory stream, so build the device path
+         * before closing it.  The write is bounded because d_name can be
+         * far longer than the usual "vfioN".
+         */
+        n = snprintf(vfio_dev, sizeof(vfio_dev), "/dev/%s", de->d_name);
+        closedir(dir);
+        if (n < 0 || n >= (int)sizeof(vfio_dev)) {
+            error_report("vfio: error: vfio device name too long in %s",
+                         vfio_dir);
+            return -1;
+        }
+
+        vdev->vfiofd = open(vfio_dev, O_RDWR);
+        if (vdev->vfiofd < 0) {
+            error_report("pci-assign: vfio: Failed to open %s: %s",
+                         vfio_dev, strerror(errno));
+            return -1;
+        }
+        return 0;
+    }
+}
+
+/*
+ * Obtain the uiommu file descriptor and store it in vdev->uiommufd.
+ *
+ * Without a "uiommufd" property /dev/uiommu is opened read-only;
+ * otherwise the property is either a numeric fd or the name of an fd
+ * known to the monitor.  Returns 0 on success, -1 on failure.
+ */
+static int get_uiommu_fd(VFIODevice *vdev)
+{
+    const char *name = vdev->uiommufd_name;
+
+    if (!name || strlen(name) == 0) {
+        vdev->uiommufd = open("/dev/uiommu", O_RDONLY);
+        if (vdev->uiommufd < 0) {
+            return -1;
+        }
+        /*
+         * A NULL name marks the fd as opened here; the init error path
+         * and the exit handler only close uiommufd in that case.
+         */
+        vdev->uiommufd_name = NULL;
+        return 0;
+    }
+
+    if (qemu_isdigit(name[0])) {
+        vdev->uiommufd = strtol(name, NULL, 0);
+        return 0;
+    }
+
+    vdev->uiommufd = monitor_get_fd(cur_mon, name);
+    if (vdev->uiommufd < 0) {
+        fprintf(stderr, "%s: (%s) unkown\n", __func__, name);
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Device init callback: check that the host device exists, obtain the
+ * uiommu and vfio file descriptors, attach the device to the iommu
+ * domain, snapshot its config space, then set up MSI/MSI-X, BARs, DMA
+ * mappings and INTx.  Each failure unwinds the steps completed so far
+ * through the labels at the end.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int vfio_initfn(struct PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    char sys[64];
+    struct stat st;
+    int ret;
+
+    /* Check that the host device exists */
+    snprintf(sys, sizeof(sys), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
+             vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
+    if (stat(sys, &st) < 0) {
+        error_report("vfio: error: no such host device "
+                     "%04x:%02x:%02x.%01x", vdev->host.seg, vdev->host.bus,
+                     vdev->host.dev, vdev->host.func);
+        return -1;
+    }
+
+    if (get_uiommu_fd(vdev)) {
+        return -1;
+    }
+
+    if (get_vfio_fd(vdev)) {
+        goto out_close_uiommu;
+    }
+
+    if (ioctl(vdev->vfiofd, VFIO_DOMAIN_SET, &vdev->uiommufd)) {
+        goto out_close_vfiofd;
+    }
+
+    /* Get a copy of config space */
+    ret = pread(vdev->vfiofd, vdev->pdev.config,
+                pci_config_size(&vdev->pdev), VFIO_PCI_CONFIG_OFF);
+    /*
+     * Test for failure separately: if pci_config_size() yields an
+     * unsigned value, a bare "ret < size" would convert -1 to a huge
+     * number and let a failed read pass.
+     */
+    if (ret < 0 || (uint32_t)ret < pci_config_size(&vdev->pdev)) {
+        fprintf(stderr, "vfio: Failed to read device config space\n");
+        goto out_unset_domain;
+    }
+
+    if (vfio_setup_msi(vdev)) {
+        goto out_unset_domain;
+    }
+
+    if (vfio_setup_resources(vdev)) {
+        goto out_disable_msix;
+    }
+
+    if (vfio_map_iommu(vdev)) {
+        goto out_unmap_resources;
+    }
+
+    if (vfio_enable_intx(vdev)) {
+        goto out_unmap_iommu;
+    }
+
+    return 0;
+
+out_unmap_iommu:
+    vfio_unmap_iommu(vdev);
+out_unmap_resources:
+    vfio_unmap_resources(vdev);
+out_disable_msix:
+    vfio_teardown_msi(vdev);
+out_unset_domain:
+    ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET);
+out_close_vfiofd:
+    close(vdev->vfiofd);
+out_close_uiommu:
+    /* uiommufd is only closed when get_uiommu_fd() opened it itself. */
+    if (!vdev->uiommufd_name) {
+        close(vdev->uiommufd);
+    }
+    return -1;
+}
+
+/*
+ * Device exit callback: disable all interrupt modes, drop the DMA and BAR
+ * mappings, free the MSI-X emulation state, detach from the iommu domain
+ * and close the file descriptors.  Always returns 0.
+ */
+static int vfio_exitfn(struct PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+
+    vfio_disable_intx(vdev);
+    vfio_disable_msi(vdev);
+    vfio_disable_msix(vdev);
+    vfio_unmap_iommu(vdev);
+    vfio_unmap_resources(vdev);
+    /*
+     * Release the MSI-X table page and I/O region set up by
+     * vfio_setup_msi(); nothing else on the exit path frees them.
+     */
+    vfio_teardown_msi(vdev);
+    ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET);
+    close(vdev->vfiofd);
+    /* uiommufd is only closed when get_uiommu_fd() opened it itself. */
+    if (!vdev->uiommufd_name) {
+        close(vdev->uiommufd);
+    }
+    return 0;
+}
+
+/*
+ * qdev property type "pci-hostaddr": a PCIHostDevice-sized value that is
+ * parsed and printed with parse_hostaddr/print_hostaddr.  It is the type
+ * of the "host" property in vfio_info.
+ */
+static PropertyInfo qdev_prop_hostaddr = {
+    .name  = "pci-hostaddr",
+    .type  = -1,
+    .size  = sizeof(PCIHostDevice),
+    .parse = parse_hostaddr,
+    .print = print_hostaddr,
+};
+
+/*
+ * Device description for the "vfio" PCI device: init/exit and config
+ * space hooks plus the user-settable properties.
+ *   host     - host PCI address of the device to pass through
+ *   vfiofd   - optional fd (number or monitor fd name) for the vfio device
+ *   uiommufd - optional fd (number or monitor fd name) for the uiommu
+ */
+static PCIDeviceInfo vfio_info = {
+    .qdev.name    = "vfio",
+    .qdev.desc    = "pass through host pci devices to the guest via vfio",
+    .qdev.size    = sizeof(VFIODevice),
+    .init         = vfio_initfn,
+    .exit         = vfio_exitfn,
+    .config_read  = vfio_pci_read_config,
+    .config_write = vfio_pci_write_config,
+    .qdev.props   = (Property[]) {
+        DEFINE_PROP("host", VFIODevice, host,
+                    qdev_prop_hostaddr, PCIHostDevice),
+        DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
+        DEFINE_PROP_STRING("uiommufd", VFIODevice, uiommufd_name),
+        DEFINE_PROP_END_OF_LIST(),
+    },
+};
+
+/* Register the "vfio" device description with the PCI qdev layer. */
+static void vfio_register_devices(void)
+{
+    pci_qdev_register(&vfio_info);
+}
+
+/* Hook the registration function in through device_init(). */
+device_init(vfio_register_devices)
diff --git a/hw/vfio.h b/hw/vfio.h
new file mode 100644
index 0000000..9d05ae1
--- /dev/null
+++ b/hw/vfio.h
@@ -0,0 +1,90 @@
+#ifndef __VFIO_H__
+#define __VFIO_H__
+
+#include "qemu-common.h"
+#include "qemu-queue.h"
+#include "pci.h"
+
+typedef struct PCIHostDevice {   /* host PCI address, printed seg:bus:dev.func */
+    uint16_t seg;
+    uint8_t bus;
+    uint8_t dev:5;
+    uint8_t func:3;
+} PCIHostDevice;
+
+typedef struct PCIResource {     /* per-BAR state, one per PCI region */
+    uint8_t valid:1;             /* set once the region has been set up */
+    uint8_t mem:1;               /* memory BAR (mmap or slow path) */
+    uint8_t msix:1;              /* BAR contains the MSI-X table */
+    uint8_t bar:3;               /* see vfio_resource_read/write */
+    uint8_t slow:1;              /* use read/write rather than mmap */
+    uint64_t size;               /* region length from VFIO_BAR_LEN */
+    ram_addr_t memory_index[2];  /* cpu_register_physical_memory() index */
+    void *r_virtbase[2];         /* mmapped address */
+    int io_mem;                  /* cpu_register_io_memory index */
+    pcibus_t e_phys;             /* emulated base address */
+    pcibus_t e_size;             /* emulated size of region in bytes */
+    uint32_t msix_offset;        /* offset of the MSI-X table in this BAR */
+    int vfiofd;                  /* see vfio_resource_read/write */
+} PCIResource;
+
+typedef struct INTx {            /* legacy INTx interrupt state */
+    uint8_t enabled:1;
+    uint8_t pending:1;
+    uint8_t pin:3;               /* presumably the PCI interrupt pin - confirm */
+    EventNotifier notifier;
+    ioapic_eoi_client eoi_client; /* .irq follows INTERRUPT_LINE writes */
+} INTx;
+
+typedef struct MSIEvent {        /* one MSI vector */
+    EventNotifier notifier;
+    uint8_t *addr;               /* NOTE(review): looks like MSI address low - confirm */
+    uint8_t *upper_addr;         /* NOTE(review): looks like MSI address high - confirm */
+    uint8_t *data;               /* NOTE(review): looks like MSI data - confirm */
+} MSIEvent;
+
+typedef struct MSI {             /* MSI capability state */
+    uint8_t enabled:1;           /* checked against the MSI enable bit on writes */
+    uint8_t pos;                 /* config space offset of the MSI capability */
+    int num_events;
+    MSIEvent *events;
+} MSI;
+
+typedef struct __attribute__((packed)) MSIXTableEntry { /* four packed dwords */
+    uint32_t addr;
+    uint32_t upper_addr;
+    uint32_t data;
+    uint32_t ctrl;
+} MSIXTableEntry;
+
+typedef struct MSIXEvent {       /* one MSI-X vector */
+    EventNotifier notifier;
+    MSIXTableEntry *entry;       /* presumably its entry in MSIX.table - confirm */
+} MSIXEvent;
+
+typedef struct MSIX {            /* MSI-X capability state */
+    uint8_t enabled:1;           /* checked against the MSI-X enable bit on writes */
+    uint8_t bar:3;               /* BAR holding the table (BIR field) */
+    uint8_t pos;                 /* config space offset of the MSI-X capability */
+    uint16_t table_len;          /* number of table entries */
+    uint32_t bar_offset;         /* table offset within the BAR */
+    MSIXTableEntry *table;       /* anonymous 4k page emulating the table */
+    int num_events;
+    MSIXEvent *events;
+    int index;                   /* cpu_register_io_memory index for the table */
+} MSIX;
+
+typedef struct VFIODevice {      /* state of one assigned device */
+    PCIDevice pdev;              /* emulated PCI device (DO_UPCAST base) */
+    PCIHostDevice host;          /* "host" property: host PCI address */
+    PCIResource resources[PCI_NUM_REGIONS];
+    INTx intx;
+    MSI msi;
+    MSIX msix;
+    int vfiofd;                  /* fd of the vfio device file */
+    int uiommufd;                /* fd of the uiommu domain */
+    char *vfiofd_name;           /* "vfiofd" property, may be NULL */
+    char *uiommufd_name;         /* "uiommufd" property; NULL if we opened it */
+} VFIODevice;
+
+#endif /* __VFIO_H__ */




reply via email to

[Prev in Thread] Current Thread [Next in Thread]