From 380156ade7053664bdb318af0659708357f40050 Mon Sep 17 00:00:00 2001
From: Neo Jia
Date: Sun, 24 Jan 2016 11:24:13 -0800
Subject: [PATCH] Add VGPU VFIO driver class support in QEMU

This is just a quick POC change to allow us to experiment with the VGPU
VFIO support; the next step is to merge this into the current vfio/pci.c,
which handles physically backed devices.

In the current POC implementation we have copied & pasted a lot of
functions directly from the vfio/pci.c code; we should merge them
together later.

- Basic MMIO and PCI config access is supported
- MMAP'ed GPU BARs are supported
- INTx and MSI using eventfd are supported; we don't think we should
  inject an interrupt when vector->kvm_interrupt is not enabled.

Change-Id: I99c34ac44524cd4d7d2abbcc4d43634297b96e80
Signed-off-by: Neo Jia
Signed-off-by: Kirti Wankhede
---
 hw/vfio/Makefile.objs |   1 +
 hw/vfio/vgpu.c        | 991 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/hw/pci/pci.h  |   3 +
 3 files changed, 995 insertions(+)
 create mode 100644 hw/vfio/vgpu.c

diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
index d324863..17f2ef1 100644
--- a/hw/vfio/Makefile.objs
+++ b/hw/vfio/Makefile.objs
@@ -1,6 +1,7 @@
 ifeq ($(CONFIG_LINUX), y)
 obj-$(CONFIG_SOFTMMU) += common.o
 obj-$(CONFIG_PCI) += pci.o pci-quirks.o
+obj-$(CONFIG_PCI) += vgpu.o
 obj-$(CONFIG_SOFTMMU) += platform.o
 obj-$(CONFIG_SOFTMMU) += calxeda-xgmac.o
 endif
diff --git a/hw/vfio/vgpu.c b/hw/vfio/vgpu.c
new file mode 100644
index 0000000..56ebce0
--- /dev/null
+++ b/hw/vfio/vgpu.c
@@ -0,0 +1,991 @@
+/*
+ * vGPU VFIO device
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <libgen.h>
+#include <limits.h>
+#include <linux/vfio.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "config.h"
+#include "exec/address-spaces.h"
+#include "exec/memory.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+#include "hw/pci/pci.h"
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+#include "qemu/event_notifier.h"
+#include "qemu/queue.h"
+#include "qemu/range.h"
+#include "sysemu/kvm.h"
+#include "sysemu/sysemu.h"
+#include "trace.h"
+#include "hw/vfio/vfio.h"
+#include "hw/vfio/pci.h"
+#include "hw/vfio/vfio-common.h"
+#include "qmp-commands.h"
+
+#define TYPE_VFIO_VGPU "vfio-vgpu"
+
+typedef struct VFIOvGPUDevice {
+    PCIDevice pdev;
+    VFIODevice vbasedev;
+    VFIOINTx intx;
+    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
+    uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */
+    unsigned int config_size;
+    char *vgpu_type;
+    char *vm_uuid;
+    off_t config_offset; /* Offset of config space region within device fd */
+    int msi_cap_size;
+    EventNotifier req_notifier;
+    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
+    int interrupt; /* Current interrupt type */
+    VFIOMSIVector *msi_vectors;
+} VFIOvGPUDevice;
+
+/*
+ * Local functions
+ */
+
+// function prototypes
+static void vfio_vgpu_disable_interrupts(VFIOvGPUDevice *vdev);
+static uint32_t vfio_vgpu_read_config(PCIDevice *pdev, uint32_t addr, int len);
+
+
+// INTx functions
+
+static void vfio_vgpu_intx_interrupt(void *opaque)
+{
+    VFIOvGPUDevice *vdev = opaque;
+
+    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
+        return;
+    }
+
+    vdev->intx.pending = true;
+    pci_irq_assert(&vdev->pdev);
+//  vfio_mmap_set_enabled(vdev, false);
+
+}
+
+static void vfio_vgpu_intx_eoi(VFIODevice *vbasedev)
+{
+    VFIOvGPUDevice *vdev = container_of(vbasedev, VFIOvGPUDevice, vbasedev);
+
+    if (!vdev->intx.pending) {
+        return;
+    }
+
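+    /*
+     * The guest has signalled EOI: clear our pending state, drop the
+     * virtual INTx line and have VFIO unmask the IRQ so the vGPU can
+     * trigger again.
+     */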
trace_vfio_intx_eoi(vbasedev->name); + + vdev->intx.pending = false; + pci_irq_deassert(&vdev->pdev); + vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX); +} + +static void vfio_vgpu_intx_enable_kvm(VFIOvGPUDevice *vdev) +{ +#ifdef CONFIG_KVM + struct kvm_irqfd irqfd = { + .fd = event_notifier_get_fd(&vdev->intx.interrupt), + .gsi = vdev->intx.route.irq, + .flags = KVM_IRQFD_FLAG_RESAMPLE, + }; + struct vfio_irq_set *irq_set; + int ret, argsz; + int32_t *pfd; + + if (!kvm_irqfds_enabled() || + vdev->intx.route.mode != PCI_INTX_ENABLED || + !kvm_resamplefds_enabled()) { + return; + } + + /* Get to a known interrupt state */ + qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev); + vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vdev->intx.pending = false; + pci_irq_deassert(&vdev->pdev); + + /* Get an eventfd for resample/unmask */ + if (event_notifier_init(&vdev->intx.unmask, 0)) { + error_report("vfio: Error: event_notifier_init failed eoi"); + goto fail; + } + + /* KVM triggers it, VFIO listens for it */ + irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask); + + if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) { + error_report("vfio: Error: Failed to setup resample irqfd: %m"); + goto fail_irqfd; + } + + argsz = sizeof(*irq_set) + sizeof(*pfd); + + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + irq_set->count = 1; + pfd = (int32_t *)&irq_set->data; + + *pfd = irqfd.resamplefd; + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + g_free(irq_set); + if (ret) { + error_report("vfio: Error: Failed to setup INTx unmask fd: %m"); + goto fail_vfio; + } + + /* Let'em rip */ + vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + + vdev->intx.kvm_accel = true; + + trace_vfio_intx_enable_kvm(vdev->vbasedev.name); + + return; + +fail_vfio: + irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN; + kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd); +fail_irqfd: + event_notifier_cleanup(&vdev->intx.unmask); +fail: + qemu_set_fd_handler(irqfd.fd, vfio_vgpu_intx_interrupt, NULL, vdev); + vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); +#endif +} + +static void vfio_vgpu_intx_disable_kvm(VFIOvGPUDevice *vdev) +{ +#ifdef CONFIG_KVM + struct kvm_irqfd irqfd = { + .fd = event_notifier_get_fd(&vdev->intx.interrupt), + .gsi = vdev->intx.route.irq, + .flags = KVM_IRQFD_FLAG_DEASSIGN, + }; + + if (!vdev->intx.kvm_accel) { + return; + } + + /* + * Get to a known state, hardware masked, QEMU ready to accept new + * interrupts, QEMU IRQ de-asserted. + */ + vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vdev->intx.pending = false; + pci_irq_deassert(&vdev->pdev); + + /* Tell KVM to stop listening for an INTx irqfd */ + if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) { + error_report("vfio: Error: Failed to disable INTx irqfd: %m"); + } + + /* We only need to close the eventfd for VFIO to cleanup the kernel side */ + event_notifier_cleanup(&vdev->intx.unmask); + + /* QEMU starts listening for interrupt events. 
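+     * The eventfd handler is switched back from the KVM irqfd to
+     * vfio_vgpu_intx_interrupt(), so INTx is again injected from userspace.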
*/ + qemu_set_fd_handler(irqfd.fd, vfio_vgpu_intx_interrupt, NULL, vdev); + + vdev->intx.kvm_accel = false; + + /* If we've missed an event, let it re-fire through QEMU */ + vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + + trace_vfio_intx_disable_kvm(vdev->vbasedev.name); +#endif +} + +static void vfio_vgpu_intx_update(PCIDevice *pdev) +{ + VFIOvGPUDevice *vdev = DO_UPCAST(VFIOvGPUDevice, pdev, pdev); + PCIINTxRoute route; + + if (vdev->interrupt != VFIO_INT_INTx) { + return; + } + + route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin); + + if (!pci_intx_route_changed(&vdev->intx.route, &route)) { + return; /* Nothing changed */ + } + + trace_vfio_intx_update(vdev->vbasedev.name, + vdev->intx.route.irq, route.irq); + + vfio_vgpu_intx_disable_kvm(vdev); + + vdev->intx.route = route; + + if (route.mode != PCI_INTX_ENABLED) { + return; + } + + vfio_vgpu_intx_enable_kvm(vdev); + + /* Re-enable the interrupt in cased we missed an EOI */ + vfio_vgpu_intx_eoi(&vdev->vbasedev); +} + +static int vfio_vgpu_intx_enable(VFIOvGPUDevice *vdev) +{ + uint8_t pin = vfio_vgpu_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); + int ret, argsz; + struct vfio_irq_set *irq_set; + int32_t *pfd; + + if (!pin) { + return 0; + } + + vfio_vgpu_disable_interrupts(vdev); + + vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */ + pci_config_set_interrupt_pin(vdev->pdev.config, pin); + +#ifdef CONFIG_KVM + /* + * Only conditional to avoid generating error messages on platforms + * where we won't actually use the result anyway. + */ + if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) { + vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev, + vdev->intx.pin); + } +#endif + + ret = event_notifier_init(&vdev->intx.interrupt, 0); + if (ret) { + error_report("vfio: Error: event_notifier_init failed"); + return ret; + } + + argsz = sizeof(*irq_set) + sizeof(*pfd); + + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + irq_set->count = 1; + pfd = (int32_t *)&irq_set->data; + + *pfd = event_notifier_get_fd(&vdev->intx.interrupt); + qemu_set_fd_handler(*pfd, vfio_vgpu_intx_interrupt, NULL, vdev); + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + g_free(irq_set); + if (ret) { + error_report("vfio: Error: Failed to setup INTx fd: %m"); + qemu_set_fd_handler(*pfd, NULL, NULL, vdev); + event_notifier_cleanup(&vdev->intx.interrupt); + return -errno; + } + + vfio_vgpu_intx_enable_kvm(vdev); + + vdev->interrupt = VFIO_INT_INTx; + + trace_vfio_intx_enable(vdev->vbasedev.name); + + return 0; +} + +static void vfio_vgpu_intx_disable(VFIOvGPUDevice *vdev) +{ + int fd; + + vfio_vgpu_intx_disable_kvm(vdev); + vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vdev->intx.pending = false; + pci_irq_deassert(&vdev->pdev); +// vfio_mmap_set_enabled(vdev, true); + + fd = event_notifier_get_fd(&vdev->intx.interrupt); + qemu_set_fd_handler(fd, NULL, NULL, vdev); + event_notifier_cleanup(&vdev->intx.interrupt); + + vdev->interrupt = VFIO_INT_NONE; + + trace_vfio_intx_disable(vdev->vbasedev.name); +} + +//MSI functions +static void vfio_vgpu_remove_kvm_msi_virq(VFIOMSIVector *vector) +{ + kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, + vector->virq); + kvm_irqchip_release_virq(kvm_state, vector->virq); + vector->virq = -1; + event_notifier_cleanup(&vector->kvm_interrupt); +} + +static void 
vfio_vgpu_msi_disable_common(VFIOvGPUDevice *vdev) +{ + int i; + + for (i = 0; i < vdev->nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + if (vdev->msi_vectors[i].use) { + if (vector->virq >= 0) { + vfio_vgpu_remove_kvm_msi_virq(vector); + } + qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), + NULL, NULL, NULL); + event_notifier_cleanup(&vector->interrupt); + } + } + + g_free(vdev->msi_vectors); + vdev->msi_vectors = NULL; + vdev->nr_vectors = 0; + vdev->interrupt = VFIO_INT_NONE; + + vfio_vgpu_intx_enable(vdev); +} + +static void vfio_vgpu_msi_disable(VFIOvGPUDevice *vdev) +{ + vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX); + vfio_vgpu_msi_disable_common(vdev); +} + +static void vfio_vgpu_disable_interrupts(VFIOvGPUDevice *vdev) +{ + + if (vdev->interrupt == VFIO_INT_MSI) { + vfio_vgpu_msi_disable(vdev); + } + + if (vdev->interrupt == VFIO_INT_INTx) { + vfio_vgpu_intx_disable(vdev); + } +} + + +static void vfio_vgpu_msi_interrupt(void *opaque) +{ + VFIOMSIVector *vector = opaque; + VFIOvGPUDevice *vdev = (VFIOvGPUDevice *)vector->vdev; + MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector); + void (*notify)(PCIDevice *dev, unsigned vector); + MSIMessage msg; + int nr = vector - vdev->msi_vectors; + + if (!event_notifier_test_and_clear(&vector->interrupt)) { + return; + } + + if (vdev->interrupt == VFIO_INT_MSIX) { + get_msg = msix_get_message; + notify = msix_notify; + } else if (vdev->interrupt == VFIO_INT_MSI) { + get_msg = msi_get_message; + notify = msi_notify; + } else { + abort(); + } + + msg = get_msg(&vdev->pdev, nr); + trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data); + notify(&vdev->pdev, nr); +} + +static int vfio_vgpu_enable_vectors(VFIOvGPUDevice *vdev, bool msix) +{ + struct vfio_irq_set *irq_set; + int ret = 0, i, argsz; + int32_t *fds; + + argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds)); + + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + irq_set->count = vdev->nr_vectors; + fds = (int32_t *)&irq_set->data; + + for (i = 0; i < vdev->nr_vectors; i++) { + int fd = -1; + + /* + * MSI vs MSI-X - The guest has direct access to MSI mask and pending + * bits, therefore we always use the KVM signaling path when setup. + * MSI-X mask and pending bits are emulated, so we want to use the + * KVM signaling path only when configured and unmasked. 
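+         * Vectors that have no KVM route (virq < 0), and masked MSI-X
+         * vectors, fall back to the plain eventfd (vector->interrupt),
+         * which is serviced in QEMU by vfio_vgpu_msi_interrupt().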
+ */ + if (vdev->msi_vectors[i].use) { + if (vdev->msi_vectors[i].virq < 0 || + (msix && msix_is_masked(&vdev->pdev, i))) { + fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt); + } else { + fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt); + } + } + + fds[i] = fd; + } + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + + g_free(irq_set); + + return ret; +} + +static void vfio_vgpu_add_kvm_msi_virq(VFIOvGPUDevice *vdev, VFIOMSIVector *vector, + MSIMessage *msg, bool msix) +{ + int virq; + + if (!msg) { + return; + } + + if (event_notifier_init(&vector->kvm_interrupt, 0)) { + return; + } + + virq = kvm_irqchip_add_msi_route(kvm_state, *msg, &vdev->pdev); + if (virq < 0) { + event_notifier_cleanup(&vector->kvm_interrupt); + return; + } + + if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, + NULL, virq) < 0) { + kvm_irqchip_release_virq(kvm_state, virq); + event_notifier_cleanup(&vector->kvm_interrupt); + return; + } + + vector->virq = virq; +} + +static void vfio_vgpu_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, + PCIDevice *pdev) +{ + kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev); +} + + +static void vfio_vgpu_msi_enable(VFIOvGPUDevice *vdev) +{ + int ret, i; + + vfio_vgpu_disable_interrupts(vdev); + + vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev); +retry: + vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors); + + for (i = 0; i < vdev->nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + MSIMessage msg = msi_get_message(&vdev->pdev, i); + + vector->vdev = (VFIOPCIDevice *)vdev; + vector->virq = -1; + vector->use = true; + + if (event_notifier_init(&vector->interrupt, 0)) { + error_report("vfio: Error: event_notifier_init failed"); + } + qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), + vfio_vgpu_msi_interrupt, NULL, vector); + + /* + * Attempt to enable route through KVM irqchip, + * default to userspace handling if unavailable. + */ + vfio_vgpu_add_kvm_msi_virq(vdev, vector, &msg, false); + } + + /* Set interrupt type prior to possible interrupts */ + vdev->interrupt = VFIO_INT_MSI; + + ret = vfio_vgpu_enable_vectors(vdev, false); + if (ret) { + if (ret < 0) { + error_report("vfio: Error: Failed to setup MSI fds: %m"); + } else if (ret != vdev->nr_vectors) { + error_report("vfio: Error: Failed to enable %d " + "MSI vectors, retry with %d", vdev->nr_vectors, ret); + } + + for (i = 0; i < vdev->nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + if (vector->virq >= 0) { + vfio_vgpu_remove_kvm_msi_virq(vector); + } + qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), + NULL, NULL, NULL); + event_notifier_cleanup(&vector->interrupt); + } + + g_free(vdev->msi_vectors); + + if (ret > 0 && ret != vdev->nr_vectors) { + vdev->nr_vectors = ret; + goto retry; + } + vdev->nr_vectors = 0; + + /* + * Failing to setup MSI doesn't really fall within any specification. + * Let's try leaving interrupts disabled and hope the guest figures + * out to fall back to INTx for this device. 
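+         * Note that, unlike vfio_vgpu_msi_disable_common(), this error
+         * path does not attempt to re-enable INTx itself.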
+ */ + error_report("vfio: Error: Failed to enable MSI"); + vdev->interrupt = VFIO_INT_NONE; + + return; + } +} + +static void vfio_vgpu_update_msi(VFIOvGPUDevice *vdev) +{ + int i; + + for (i = 0; i < vdev->nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + MSIMessage msg; + + if (!vector->use || vector->virq < 0) { + continue; + } + + msg = msi_get_message(&vdev->pdev, i); + vfio_vgpu_update_kvm_msi_virq(vector, msg, &vdev->pdev); + } +} + +static int vfio_vgpu_msi_setup(VFIOvGPUDevice *vdev, int pos) +{ + uint16_t ctrl; + bool msi_64bit, msi_maskbit; + int ret, entries; + + if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl), + vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { + return -errno; + } + ctrl = le16_to_cpu(ctrl); + + msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT); + msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT); + entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1); + + ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit); + if (ret < 0) { + if (ret == -ENOTSUP) { + return 0; + } + error_report("vfio: msi_init failed"); + return ret; + } + vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0); + + return 0; +} + + +static int vfio_vgpu_msi_init(VFIOvGPUDevice *vdev) +{ + uint8_t pos; + int ret; + + pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSI); + if (!pos) { + return 0; + } + + ret = vfio_vgpu_msi_setup(vdev, pos); + if (ret < 0) { + error_report("vgpu: Error setting address@hidden: %d", pos, ret); + return ret; + } + + return 0; +} + +/* + * VGPU device class functions + */ + +static void vfio_vgpu_reset(DeviceState *dev) +{ + + +} + +static void vfio_vgpu_eoi(VFIODevice *vbasedev) +{ + return; +} + +static int vfio_vgpu_hot_reset_multi(VFIODevice *vbasedev) +{ + // Nothing to be reset + return 0; +} + +static void vfio_vgpu_compute_needs_reset(VFIODevice *vbasedev) +{ + vbasedev->needs_reset = false; +} + +static VFIODeviceOps vfio_vgpu_ops = { + .vfio_compute_needs_reset = vfio_vgpu_compute_needs_reset, + .vfio_hot_reset_multi = vfio_vgpu_hot_reset_multi, + .vfio_eoi = vfio_vgpu_eoi, +}; + +static int vfio_vgpu_populate_device(VFIOvGPUDevice *vdev) +{ + VFIODevice *vbasedev = &vdev->vbasedev; + struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) }; + int i, ret = -1; + + for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) { + reg_info.index = i; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); + if (ret) { + error_report("vfio: Error getting region %d info: %m", i); + return ret; + } + + trace_vfio_populate_device_region(vbasedev->name, i, + (unsigned long)reg_info.size, + (unsigned long)reg_info.offset, + (unsigned long)reg_info.flags); + + vdev->bars[i].region.vbasedev = vbasedev; + vdev->bars[i].region.flags = reg_info.flags; + vdev->bars[i].region.size = reg_info.size; + vdev->bars[i].region.fd_offset = reg_info.offset; + vdev->bars[i].region.nr = i; + QLIST_INIT(&vdev->bars[i].quirks); + } + + reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX; + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); + if (ret) { + error_report("vfio: Error getting config info: %m"); + return ret; + } + + vdev->config_size = reg_info.size; + if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) { + vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS; + } + vdev->config_offset = reg_info.offset; + + return 0; +} + +static void vfio_vgpu_create_virtual_bar(VFIOvGPUDevice *vdev, int nr) +{ + VFIOBAR *bar = &vdev->bars[nr]; + uint64_t size = bar->region.size; + char 
name[64]; + uint32_t pci_bar; + uint8_t type; + int ret; + + /* Skip both unimplemented BARs and the upper half of 64bit BARS. */ + if (!size) + return; + + /* Determine what type of BAR this is for registration */ + ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar), + vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); + if (ret != sizeof(pci_bar)) { + error_report("vfio: Failed to read BAR %d (%m)", nr); + return; + } + + pci_bar = le32_to_cpu(pci_bar); + bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO); + bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64); + type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK : + ~PCI_BASE_ADDRESS_MEM_MASK); + + /* A "slow" read/write mapping underlies all BARs */ + memory_region_init_io(&bar->region.mem, OBJECT(vdev), &vfio_region_ops, + bar, name, size); + pci_register_bar(&vdev->pdev, nr, type, &bar->region.mem); + + // Create an invalid BAR1 mapping + if (bar->region.flags & VFIO_REGION_INFO_FLAG_MMAP) { + strncat(name, " mmap", sizeof(name) - strlen(name) - 1); + vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem, + &bar->region.mmap_mem, &bar->region.mmap, + size, 0, name); + } +} + +static void vfio_vgpu_create_virtual_bars(VFIOvGPUDevice *vdev) +{ + + int i = 0; + + for (i = 0; i < PCI_ROM_SLOT; i++) { + vfio_vgpu_create_virtual_bar(vdev, i); + } +} + +static int vfio_vgpu_initfn(PCIDevice *pdev) +{ + VFIOvGPUDevice *vdev = DO_UPCAST(VFIOvGPUDevice, pdev, pdev); + VFIOGroup *group; + ssize_t len; + int groupid; + struct stat st; + char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name; + int ret; + UuidInfo *uuid_info; + + uuid_info = qmp_query_uuid(NULL); + if (strcmp(uuid_info->UUID, UUID_NONE) == 0) { + return -EINVAL; + } else { + vdev->vm_uuid = uuid_info->UUID; + } + + + snprintf(path, sizeof(path), + "/sys/devices/virtual/vgpu/%s-0/", vdev->vm_uuid); + + if (stat(path, &st) < 0) { + error_report("vfio-vgpu: error: no such vgpu device: %s", path); + return -errno; + } + + vdev->vbasedev.ops = &vfio_vgpu_ops; + + vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI; + vdev->vbasedev.name = g_strdup_printf("%s-0", vdev->vm_uuid); + + strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1); + + len = readlink(path, iommu_group_path, sizeof(path)); + if (len <= 0 || len >= sizeof(path)) { + error_report("vfio-vgpu: error no iommu_group for device"); + return len < 0 ? -errno : -ENAMETOOLONG; + } + + iommu_group_path[len] = 0; + group_name = basename(iommu_group_path); + + if (sscanf(group_name, "%d", &groupid) != 1) { + error_report("vfio-vgpu: error reading %s: %m", path); + return -errno; + } + + // TODO: This will only work if we *only* have VFIO_VGPU_IOMMU enabled + + group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev)); + if (!group) { + error_report("vfio: failed to get group %d", groupid); + return -ENOENT; + } + + snprintf(path, sizeof(path), "%s-0", vdev->vm_uuid); + + ret = vfio_get_device(group, path, &vdev->vbasedev); + if (ret) { + error_report("vfio-vgpu; failed to get device %s", vdev->vgpu_type); + vfio_put_group(group); + return ret; + } + + ret = vfio_vgpu_populate_device(vdev); + if (ret) { + return ret; + } + + /* Get a copy of config space */ + ret = pread(vdev->vbasedev.fd, vdev->pdev.config, + MIN(pci_config_size(&vdev->pdev), vdev->config_size), + vdev->config_offset); + if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) { + ret = ret < 0 ? 
-errno : -EFAULT; + error_report("vfio: Failed to read device config space"); + return ret; + } + + vfio_vgpu_create_virtual_bars(vdev); + + ret = vfio_vgpu_msi_init(vdev); + if (ret < 0) { + error_report("%s: Error setting MSI %d", __FUNCTION__, ret); + return ret; + } + + if (vfio_vgpu_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { + pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_vgpu_intx_update); + ret = vfio_vgpu_intx_enable(vdev); + if (ret) { + return ret; + } + } + + return 0; +} + + +static void vfio_vgpu_exitfn(PCIDevice *pdev) +{ + + +} + +static uint32_t vfio_vgpu_read_config(PCIDevice *pdev, uint32_t addr, int len) +{ + VFIOvGPUDevice *vdev = DO_UPCAST(VFIOvGPUDevice, pdev, pdev); + ssize_t ret; + uint32_t val = 0; + + ret = pread(vdev->vbasedev.fd, &val, len, vdev->config_offset + addr); + + if (ret != len) { + error_report("%s: failed at offset:0x%0x %m", __func__, addr); + return 0xFFFFFFFF; + } + + // memcpy(&vdev->emulated_config_bits + addr, &val, len); + return val; +} + +static void vfio_vgpu_write_config(PCIDevice *pdev, uint32_t addr, + uint32_t val, int len) +{ + VFIOvGPUDevice *vdev = DO_UPCAST(VFIOvGPUDevice, pdev, pdev); + ssize_t ret; + + ret = pwrite(vdev->vbasedev.fd, &val, len, vdev->config_offset + addr); + + if (ret != len) { + error_report("%s: failed at offset:0x%0x, val:0x%0x %m", + __func__, addr, val); + return; + } + + if (pdev->cap_present & QEMU_PCI_CAP_MSI && + ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) { + int is_enabled, was_enabled = msi_enabled(pdev); + + pci_default_write_config(pdev, addr, val, len); + + is_enabled = msi_enabled(pdev); + + if (!was_enabled) { + if (is_enabled) { + vfio_vgpu_msi_enable(vdev); + } + } else { + if (!is_enabled) { + vfio_vgpu_msi_disable(vdev); + } else { + vfio_vgpu_update_msi(vdev); + } + } + } + else { + /* Write everything to QEMU to keep emulated bits correct */ + pci_default_write_config(pdev, addr, val, len); + } + + pci_default_write_config(pdev, addr, val, len); + + return; +} + +static const VMStateDescription vfio_vgpu_vmstate = { + .name = TYPE_VFIO_VGPU, + .unmigratable = 1, +}; + +// +// We don't actually need the vfio_vgpu_properties +// as we can just simply rely on VM UUID to find +// the IOMMU group for this VM +// + + +static Property vfio_vgpu_properties[] = { + + DEFINE_PROP_STRING("vgpu", VFIOvGPUDevice, vgpu_type), + DEFINE_PROP_END_OF_LIST() +}; + +#if 0 + +static void vfio_vgpu_instance_init(Object *obj) +{ + +} + +static void vfio_vgpu_instance_finalize(Object *obj) +{ + + +} + +#endif + +static void vfio_vgpu_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); + // vgpudc->parent_realize = dc->realize; + // dc->realize = calxeda_xgmac_realize; + dc->desc = "VFIO-based vGPU"; + dc->vmsd = &vfio_vgpu_vmstate; + dc->reset = vfio_vgpu_reset; + // dc->cannot_instantiate_with_device_add_yet = true; + dc->props = vfio_vgpu_properties; + set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories); + pdc->init = vfio_vgpu_initfn; + pdc->exit = vfio_vgpu_exitfn; + pdc->config_read = vfio_vgpu_read_config; + pdc->config_write = vfio_vgpu_write_config; + pdc->is_express = 0; /* For now, we are not */ + + pdc->vendor_id = PCI_DEVICE_ID_NVIDIA; + // pdc->device_id = 0x11B0; + pdc->class_id = PCI_CLASS_DISPLAY_VGA; +} + +static const TypeInfo vfio_vgpu_dev_info = { + .name = TYPE_VFIO_VGPU, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(VFIOvGPUDevice), + .class_init = vfio_vgpu_class_init, 
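+    /*
+     * Presumably instantiated as "-device vfio-vgpu"; the backing vGPU is
+     * looked up via the VM UUID in vfio_vgpu_initfn() rather than through a
+     * vfio-pci-style "host" property.
+     */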
+}; + +static void register_vgpu_dev_type(void) +{ + type_register_static(&vfio_vgpu_dev_info); +} + +type_init(register_vgpu_dev_type) diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 379b6e1..9af5e17 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -64,6 +64,9 @@ #define PCI_DEVICE_ID_VMWARE_IDE 0x1729 #define PCI_DEVICE_ID_VMWARE_VMXNET3 0x07B0 +/* NVIDIA (0x10de) */ +#define PCI_DEVICE_ID_NVIDIA 0x10de + /* Intel (0x8086) */ #define PCI_DEVICE_ID_INTEL_82551IT 0x1209 #define PCI_DEVICE_ID_INTEL_82557 0x1229 -- 1.8.3.1