[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver
From: |
Alex Williamson |
Subject: |
Re: [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver |
Date: |
Mon, 13 Aug 2012 23:25:29 -0600 |
On Mon, 2012-08-13 at 17:18 -0500, Anthony Liguori wrote:
> Alex Williamson <address@hidden> writes:
> > +static int vfio_load_rom(VFIODevice *vdev)
> > +{
> > + uint64_t size = vdev->rom_size;
> > + const VMStateDescription *vmsd;
> > + char name[32];
> > + off_t off = 0, voff = vdev->rom_offset;
> > + ssize_t bytes;
> > + void *ptr;
> > +
> > + /* If loading ROM from file, pci handles it */
> > + if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) {
> > + return 0;
> > + }
> > +
> > + DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
> > + vdev->host.bus, vdev->host.slot, vdev->host.function);
> > +
> > + vmsd = qdev_get_vmsd(DEVICE(&vdev->pdev));
> > +
> > + if (vmsd) {
> > + snprintf(name, sizeof(name), "%s.rom", vmsd->name);
> > + } else {
> > + snprintf(name, sizeof(name), "%s.rom",
> > + object_get_typename(OBJECT(&vdev->pdev)));
> > + }
>
> Not sure where this came from. You can just hard code this to vfio.rom
> or better yet, use the pci-host address and do vfio[%s].rom.
Ok, assume you mean vfio[%04x:%02x:%02x.%x].rom or should I be calling
object_property_print() to get a %s?
> > + memory_region_init_ram(&vdev->pdev.rom, name, size);
> > + ptr = memory_region_get_ram_ptr(&vdev->pdev.rom);
> > + memset(ptr, 0xff, size);
> > +
> > + while (size) {
> > + bytes = pread(vdev->fd, ptr + off, size, voff + off);
> > + if (bytes == 0) {
> > + break; /* expect that we could get back less than the ROM BAR
> > */
> > + } else if (bytes > 0) {
> > + off += bytes;
> > + size -= bytes;
> > + } else {
> > + if (errno == EINTR || errno == EAGAIN) {
> > + continue;
> > + }
> > + error_report("vfio: Error reading device ROM: %s\n",
> > + strerror(errno));
> > + memory_region_destroy(&vdev->pdev.rom);
> > + return -1;
> > + }
> > + }
> > +
> > + pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom);
> > + vdev->pdev.has_rom = true;
> > + return 0;
> > +}
> > +
> > +static int vfio_connect_container(VFIOGroup *group)
> > +{
> > + VFIOContainer *container;
> > + int ret, fd;
> > +
> > + if (group->container) {
> > + return 0;
> > + }
> > +
> > + QLIST_FOREACH(container, &container_list, next) {
> > + if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
> > + group->container = container;
> > + QLIST_INSERT_HEAD(&container->group_list, group,
> > container_next);
> > + return 0;
> > + }
> > + }
> > +
> > + fd = qemu_open("/dev/vfio/vfio", O_RDWR);
> > + if (fd < 0) {
> > + error_report("vfio: failed to open /dev/vfio/vfio: %s\n",
> > + strerror(errno));
> > + return -1;
> > + }
> > +
> > + ret = ioctl(fd, VFIO_GET_API_VERSION);
> > + if (ret != VFIO_API_VERSION) {
> > + error_report("vfio: supported vfio version: %d, "
> > + "reported version: %d\n", VFIO_API_VERSION, ret);
> > + close(fd);
> > + return -1;
> > + }
> > +
> > + container = g_malloc0(sizeof(*container));
> > + container->fd = fd;
> > +
> > + if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
> > + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> > + if (ret) {
> > + error_report("vfio: failed to set group container: %s\n",
> > + strerror(errno));
> > + g_free(container);
> > + close(fd);
> > + return -1;
> > + }
> > +
> > + ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
> > + if (ret) {
> > + error_report("vfio: failed to set iommu for container: %s\n",
> > + strerror(errno));
> > + g_free(container);
> > + close(fd);
> > + return -1;
> > + }
> > +
> > + container->iommu_data.listener = (MemoryListener) {
> > + .begin = vfio_listener_dummy1,
> > + .commit = vfio_listener_dummy1,
> > + .region_add = vfio_listener_region_add,
> > + .region_del = vfio_listener_region_del,
> > + .region_nop = vfio_listener_dummy2,
> > + .log_start = vfio_listener_dummy2,
> > + .log_stop = vfio_listener_dummy2,
> > + .log_sync = vfio_listener_dummy2,
> > + .log_global_start = vfio_listener_dummy1,
> > + .log_global_stop = vfio_listener_dummy1,
> > + .eventfd_add = vfio_listener_dummy3,
> > + .eventfd_del = vfio_listener_dummy3,
> > + };
>
> It would be nicer to move this out to a static structure.
Ok
> > + container->iommu_data.release = vfio_listener_release;
> > +
> > + memory_listener_register(&container->iommu_data.listener,
> > + get_system_memory());
> > + } else {
> > + error_report("vfio: No available IOMMU models\n");
> > + g_free(container);
> > + close(fd);
> > + return -1;
> > + }
> > +
> > + QLIST_INIT(&container->group_list);
> > + QLIST_INSERT_HEAD(&container_list, container, next);
> > +
> > + group->container = container;
> > + QLIST_INSERT_HEAD(&container->group_list, group, container_next);
> > +
> > + return 0;
> > +}
> > +
> > +static void vfio_disconnect_container(VFIOGroup *group)
> > +{
> > + VFIOContainer *container = group->container;
> > +
> > + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
> > + error_report("vfio: error disconnecting group %d from container\n",
> > + group->groupid);
> > + }
>
> error_report isn't terminal.
Not meant to be here, the kernel side will prevent the group from being
re-used. I suppose there's the question of whether we want to continue
to get further out of sync with the kernel by continuing the below
disconnects though. Either way seems to be a funky state and I'm not
sure I have a strong preference. It seems unnecessary to kill the guest
though.
> > +
> > + QLIST_REMOVE(group, container_next);
> > + group->container = NULL;
> > +
> > + if (QLIST_EMPTY(&container->group_list)) {
> > + if (container->iommu_data.release) {
> > + container->iommu_data.release(container);
> > + }
> > + QLIST_REMOVE(container, next);
> > + DPRINTF("vfio_disconnect_container: close container->fd\n");
> > + close(container->fd);
> > + g_free(container);
> > + }
> > +}
> > +
> > +static VFIOGroup *vfio_get_group(int groupid)
> > +{
> > + VFIOGroup *group;
> > + char path[32];
> > + struct vfio_group_status status = { .argsz = sizeof(status) };
> > +
> > + QLIST_FOREACH(group, &group_list, next) {
> > + if (group->groupid == groupid) {
> > + return group;
> > + }
> > + }
> > +
> > + group = g_malloc0(sizeof(*group));
> > +
> > + snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
> > + group->fd = qemu_open(path, O_RDWR);
> > + if (group->fd < 0) {
> > + error_report("vfio: error opening %s: %s", path, strerror(errno));
> > + g_free(group);
> > + return NULL;
> > + }
> > +
> > + if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
> > + error_report("vfio: error getting group status: %s\n",
> > + strerror(errno));
> > + close(group->fd);
> > + g_free(group);
> > + return NULL;
> > + }
> > +
> > + if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
> > + error_report("vfio: error, group %d is not viable, please ensure "
> > + "all devices within the iommu_group are bound to
> > their "
> > + "vfio bus driver.\n", groupid);
> > + close(group->fd);
> > + g_free(group);
> > + return NULL;
> > + }
> > +
> > + group->groupid = groupid;
> > + QLIST_INIT(&group->device_list);
> > +
> > + if (vfio_connect_container(group)) {
> > + error_report("vfio: failed to setup container for group %d\n",
> > groupid);
> > + close(group->fd);
> > + g_free(group);
> > + return NULL;
> > + }
> > +
> > + QLIST_INSERT_HEAD(&group_list, group, next);
> > +
> > + return group;
> > +}
> > +
> > +static void vfio_put_group(VFIOGroup *group)
> > +{
> > + if (!QLIST_EMPTY(&group->device_list)) {
> > + return;
> > + }
> > +
> > + vfio_disconnect_container(group);
> > + QLIST_REMOVE(group, next);
> > + DPRINTF("vfio_put_group: close group->fd\n");
> > + close(group->fd);
> > + g_free(group);
> > +}
> > +
> > +static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice
> > *vdev)
> > +{
> > + struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
> > + struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
> > + int ret, i;
> > +
> > + ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
> > + if (ret < 0) {
> > + error_report("vfio: error getting device %s from group %d: %s",
> > + name, group->groupid, strerror(errno));
> > + error_report("Verify all devices in group %d "
> > + "are bound to vfio-pci or pci-stub and not already in
> > use",
> > + group->groupid);
> > + return ret;
> > + }
> > +
> > + vdev->fd = ret;
> > + vdev->group = group;
> > + QLIST_INSERT_HEAD(&group->device_list, vdev, next);
> > +
> > + /* Sanity check device */
> > + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
> > + if (ret) {
> > + error_report("vfio: error getting device info: %s",
> > strerror(errno));
> > + goto error;
> > + }
> > +
> > + DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
> > + dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
> > +
> > + if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
> > + error_report("vfio: Um, this isn't a PCI device");
> > + goto error;
> > + }
> > +
> > + vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
> > + if (!vdev->reset_works) {
> > + error_report("Warning, device %s does not support reset\n", name);
> > + }
> > +
> > + if (dev_info.num_regions != VFIO_PCI_NUM_REGIONS) {
> > + error_report("vfio: unexpected number of io regions %u",
> > + dev_info.num_regions);
> > + goto error;
> > + }
> > +
> > + if (dev_info.num_irqs != VFIO_PCI_NUM_IRQS) {
> > + error_report("vfio: unexpected number of irqs %u",
> > dev_info.num_irqs);
> > + goto error;
> > + }
> > +
> > + for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX;
> > i++) {
> > + reg_info.index = i;
> > +
> > + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info);
> > + if (ret) {
> > + error_report("vfio: Error getting region %d info: %s", i,
> > + strerror(errno));
> > + goto error;
> > + }
> > +
> > + DPRINTF("Device %s region %d:\n", name, i);
> > + DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
> > + (unsigned long)reg_info.size, (unsigned
> > long)reg_info.offset,
> > + (unsigned long)reg_info.flags);
> > +
> > + vdev->bars[i].flags = reg_info.flags;
> > + vdev->bars[i].size = reg_info.size;
> > + vdev->bars[i].fd_offset = reg_info.offset;
> > + vdev->bars[i].fd = vdev->fd;
> > + vdev->bars[i].nr = i;
> > + }
> > +
> > + reg_info.index = VFIO_PCI_ROM_REGION_INDEX;
> > +
> > + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info);
> > + if (ret) {
> > + error_report("vfio: Error getting ROM info: %s", strerror(errno));
> > + goto error;
> > + }
> > +
> > + DPRINTF("Device %s ROM:\n", name);
> > + DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
> > + (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
> > + (unsigned long)reg_info.flags);
> > +
> > + vdev->rom_size = reg_info.size;
> > + vdev->rom_offset = reg_info.offset;
> > +
> > + reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
> > +
> > + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info);
> > + if (ret) {
> > + error_report("vfio: Error getting config info: %s",
> > strerror(errno));
> > + goto error;
> > + }
> > +
> > + DPRINTF("Device %s config:\n", name);
> > + DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
> > + (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
> > + (unsigned long)reg_info.flags);
> > +
> > + vdev->config_size = reg_info.size;
> > + vdev->config_offset = reg_info.offset;
> > +
> > +error:
> > + if (ret) {
> > + QLIST_REMOVE(vdev, next);
> > + vdev->group = NULL;
> > + close(vdev->fd);
> > + }
> > + return ret;
> > +}
> > +
> > +static void vfio_put_device(VFIODevice *vdev)
> > +{
> > + QLIST_REMOVE(vdev, next);
> > + vdev->group = NULL;
> > + DPRINTF("vfio_put_device: close vdev->fd\n");
> > + close(vdev->fd);
> > + if (vdev->msix) {
> > + g_free(vdev->msix);
> > + vdev->msix = NULL;
> > + }
> > +}
> > +
> > +static int vfio_initfn(struct PCIDevice *pdev)
>
> Why struct PCIDevice?
Must have pasted it from somewhere, fixed.
> > +{
> > + VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
>
> PCI_DEVICE()
I'm not sure what you're after here, are you perhaps mistaking pdev for
qdev?
> > + VFIOGroup *group;
> > + char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
> > + ssize_t len;
> > + struct stat st;
> > + int groupid;
> > + int ret;
> > +
> > + /* Check that the host device exists */
> > + snprintf(path, sizeof(path),
> > + "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
> > + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > + vdev->host.function);
> > + if (stat(path, &st) < 0) {
> > + error_report("vfio: error: no such host device: %s", path);
> > + return -1;
> > + }
> > +
> > + strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
> > +
> > + len = readlink(path, iommu_group_path, PATH_MAX);
> > + if (len <= 0) {
> > + error_report("vfio: error no iommu_group for device\n");
> > + return -1;
> > + }
> > +
> > + iommu_group_path[len] = 0;
> > + group_name = basename(iommu_group_path);
> > +
> > + if (sscanf(group_name, "%d", &groupid) != 1) {
> > + error_report("vfio: error reading %s: %s", path, strerror(errno));
> > + return -1;
> > + }
> > +
> > + DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__,
> > vdev->host.domain,
> > + vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
> > +
> > + group = vfio_get_group(groupid);
> > + if (!group) {
> > + error_report("vfio: failed to get group %d", groupid);
> > + return -1;
> > + }
> > +
> > + snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
> > + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > + vdev->host.function);
> > +
> > + QLIST_FOREACH(pvdev, &group->device_list, next) {
> > + if (pvdev->host.domain == vdev->host.domain &&
> > + pvdev->host.bus == vdev->host.bus &&
> > + pvdev->host.slot == vdev->host.slot &&
> > + pvdev->host.function == vdev->host.function) {
> > +
> > + error_report("vfio: error: device %s is already attached\n",
> > path);
> > + vfio_put_group(group);
> > + return -1;
> > + }
> > + }
> > +
> > + ret = vfio_get_device(group, path, vdev);
> > + if (ret) {
> > + error_report("vfio: failed to get device %s", path);
> > + vfio_put_group(group);
> > + return -1;
> > + }
> > +
> > + /* Get a copy of config space */
> > + assert(pci_config_size(&vdev->pdev) <= vdev->config_size);
> > + ret = pread(vdev->fd, vdev->pdev.config,
> > + pci_config_size(&vdev->pdev), vdev->config_offset);
> > + if (ret < (int)pci_config_size(&vdev->pdev)) {
> > + error_report("vfio: Failed to read device config space\n");
> > + goto out_put;
> > + }
> > +
> > + /*
> > + * Clear host resource mapping info. If we choose not to register a
> > + * BAR, such as might be the case with the option ROM, we can get
> > + * confusing, unwritable, residual addresses from the host here.
> > + */
> > + memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
> > + memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
> > +
> > + vfio_load_rom(vdev);
> > +
> > + if (vfio_early_setup_msix(vdev)) {
> > + goto out_put;
> > + }
> > +
> > + vfio_map_bars(vdev);
> > +
> > + if (vfio_add_capabilities(vdev)) {
> > + goto out_teardown;
> > + }
> > +
> > + if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
> > + pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
> > + }
> > +
> > + if (vfio_enable_intx(vdev)) {
> > + goto out_teardown;
> > + }
> > +
> > + return 0;
> > +
> > +out_teardown:
> > + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> > + vfio_teardown_msi(vdev);
> > + vfio_unmap_bars(vdev);
> > +out_put:
> > + vfio_put_device(vdev);
> > + vfio_put_group(group);
> > + return -1;
> > +}
> > +
> > +static void vfio_exitfn(struct PCIDevice *pdev)
> > +{
> > + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> > + VFIOGroup *group = vdev->group;
> > +
> > + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> > + vfio_disable_interrupts(vdev);
> > + vfio_teardown_msi(vdev);
> > + vfio_unmap_bars(vdev);
> > + vfio_put_device(vdev);
> > + vfio_put_group(group);
>
> You should move this all to the destructor (instance_finalize).
Hmm, it looks like if I do that then pci will free my interrupt and
config space out from under me before I get any notice we're killing the
device. Being a pci device, I think I'm tied to PCIDeviceClass.exit
function, right?
> > +}
> > +
> > +static void vfio_reset(DeviceState *dev)
> > +{
> > + PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
> > + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> > +
> > + if (!vdev->reset_works) {
> > + return;
> > + }
> > +
> > + if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
> > + error_report("vfio: Error unable to reset physical device "
> > + "(%04x:%02x:%02x.%x): %s\n", vdev->host.domain,
> > + vdev->host.bus, vdev->host.slot, vdev->host.function,
> > + strerror(errno));
>
> %m is thread safe, strerror isn't.
Neat. Fixed throughout.
> > + }
> > +}
> > +
> > +static Property vfio_pci_dev_properties[] = {
> > + DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
> > + /*
> > + * TODO - support passed fds... is this necessary?
> > + * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
> > + * DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name),
> > + */
> > + DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> > +
> > +static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> > +{
> > + PCIDeviceClass *dc = PCI_DEVICE_CLASS(klass);
> > +
> > + dc->parent_class.reset = vfio_reset;
>
> This is definitely not right. You want to do:
>
> PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
> DeviceClass *dc = DEVICE_CLASS(klass);
>
> dc->reset = vfio_reset;
>
> > + dc->init = vfio_initfn;
> > + dc->exit = vfio_exitfn;
> > + dc->config_read = vfio_pci_read_config;
> > + dc->config_write = vfio_pci_write_config;
> > + dc->parent_class.props = vfio_pci_dev_properties;
>
> dc->props = vfio_pci_dev_properties;
Ok, fixed.
> > +}
> > +
> > +static TypeInfo vfio_pci_dev_info = {
> > + .name = "vfio-pci",
> > + .parent = TYPE_PCI_DEVICE,
> > + .instance_size = sizeof(VFIODevice),
> > + .class_init = vfio_pci_dev_class_init,
> > +};
> > +
> > +static void register_vfio_pci_dev_type(void)
> > +{
> > + type_register_static(&vfio_pci_dev_info);
> > +}
> > +
> > +type_init(register_vfio_pci_dev_type)
> > diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
> > new file mode 100644
> > index 0000000..0a71bce
> > --- /dev/null
> > +++ b/hw/vfio_pci.h
> > @@ -0,0 +1,101 @@
>
> copyright/license.
>
> > +#ifndef HW_VFIO_PCI_H
> > +#define HW_VFIO_PCI_H
> > +
> > +#include "qemu-common.h"
> > +#include "qemu-queue.h"
> > +#include "pci.h"
> > +#include "event_notifier.h"
>
> This is all private to vfio.c, right? Perhaps call it vfio_pci_int.h
Yes, it is private. Ok, renamed. Thanks for the review,
Alex
- [Qemu-devel] [PATCH 3/3] vfio: Enable vfio-pci and mark supported, (continued)
[Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver, Alex Williamson, 2012/08/01
Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2, Anthony Liguori, 2012/08/13