[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH 2/4] pcie: Add support for Single Root I/O Virtualiz
From: |
Knut Omang |
Subject: |
[Qemu-devel] [PATCH 2/4] pcie: Add support for Single Root I/O Virtualization (SR/IOV) |
Date: |
Fri, 29 Aug 2014 09:17:07 +0200 |
This patch provides the building blocks for creating an SR/IOV
PCIe Extended Capability header and creating and removing
SR/IOV Virtual Functions.
Signed-off-by: Knut Omang <address@hidden>
---
hw/pci/pci.c | 107 +++++++++++++++++++-------
hw/pci/pcie.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++++-
include/hw/pci/pci.h | 6 +-
include/hw/pci/pcie.h | 26 +++++++
4 files changed, 311 insertions(+), 33 deletions(-)
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index daeaeac..071ab81 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -35,7 +35,6 @@
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "exec/address-spaces.h"
-#include "hw/hotplug.h"
//#define DEBUG_PCI
#ifdef DEBUG_PCI
@@ -126,6 +125,9 @@ static int pci_bar(PCIDevice *d, int reg)
{
uint8_t type;
+ /* PCIe virtual functions do not have their own BARs */
+ assert(!d->exp.is_vf);
+
if (reg != PCI_ROM_SLOT)
return PCI_BASE_ADDRESS_0 + reg * 4;
@@ -184,22 +186,13 @@ void pci_device_deassert_intx(PCIDevice *dev)
}
}
-static void pci_do_device_reset(PCIDevice *dev)
+static void pci_reset_regions(PCIDevice *dev)
{
int r;
+ if (dev->exp.is_vf) {
+ return;
+ }
- pci_device_deassert_intx(dev);
- assert(dev->irq_state == 0);
-
- /* Clear all writable bits */
- pci_word_test_and_clear_mask(dev->config + PCI_COMMAND,
- pci_get_word(dev->wmask + PCI_COMMAND) |
- pci_get_word(dev->w1cmask + PCI_COMMAND));
- pci_word_test_and_clear_mask(dev->config + PCI_STATUS,
- pci_get_word(dev->wmask + PCI_STATUS) |
- pci_get_word(dev->w1cmask + PCI_STATUS));
- dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
- dev->config[PCI_INTERRUPT_LINE] = 0x0;
for (r = 0; r < PCI_NUM_REGIONS; ++r) {
PCIIORegion *region = &dev->io_regions[r];
if (!region->size) {
@@ -213,6 +206,27 @@ static void pci_do_device_reset(PCIDevice *dev)
pci_set_long(dev->config + pci_bar(dev, r), region->type);
}
}
+}
+
+static void pci_do_device_reset(PCIDevice *dev)
+{
+ qdev_reset_all(&dev->qdev);
+
+ dev->irq_state = 0;
+ pci_update_irq_status(dev);
+ pci_device_deassert_intx(dev);
+ assert(dev->irq_state == 0);
+
+ /* Clear all writable bits */
+ pci_word_test_and_clear_mask(dev->config + PCI_COMMAND,
+ pci_get_word(dev->wmask + PCI_COMMAND) |
+ pci_get_word(dev->w1cmask + PCI_COMMAND));
+ pci_word_test_and_clear_mask(dev->config + PCI_STATUS,
+ pci_get_word(dev->wmask + PCI_STATUS) |
+ pci_get_word(dev->w1cmask + PCI_STATUS));
+ dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
+ dev->config[PCI_INTERRUPT_LINE] = 0x0;
+ pci_reset_regions(dev);
pci_update_mappings(dev);
msi_reset(dev);
@@ -734,6 +748,14 @@ static int pci_init_multifunction(PCIBus *bus, PCIDevice
*dev)
dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
}
+ /* With SR/IOV and ARI, subsequent function 0's are only
+ * another VF which the physical function is placed in the initial
+ * function (0.0)
+ */
+ if (dev->exp.pf && dev->exp.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+ return 0;
+ }
+
/*
* multifunction bit is interpreted in two ways as follows.
* - all functions must set the bit to 1.
@@ -920,6 +942,7 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
uint64_t wmask;
pcibus_t size = memory_region_size(memory);
+ assert(!pci_dev->exp.is_vf); /* VFs must use pcie_register_vf_bar */
assert(region_num >= 0);
assert(region_num < PCI_NUM_REGIONS);
if (size & (size-1)) {
@@ -1018,18 +1041,51 @@ pcibus_t pci_get_bar_addr(PCIDevice *pci_dev, int
region_num)
return pci_dev->io_regions[region_num].addr;
}
-static pcibus_t pci_bar_address(PCIDevice *d,
- int reg, uint8_t type, pcibus_t size)
+
+static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg,
+ uint8_t type, pcibus_t size)
+{
+ pcibus_t new_addr;
+ if (!d->exp.is_vf) {
+ int bar = pci_bar(d, reg);
+ if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+ new_addr = pci_get_quad(d->config + bar);
+ } else {
+ new_addr = pci_get_long(d->config + bar);
+ }
+ } else {
+ int bar = d->exp.pf->exp.sriov_cap + PCI_SRIOV_BAR + reg * 4;
+ uint32_t vf_num = d->devfn - (d->exp.pf->devfn + 1);
+ if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+ new_addr = pci_get_quad(d->exp.pf->config + bar);
+ } else {
+ new_addr = pci_get_long(d->exp.pf->config + bar);
+ }
+ new_addr += vf_num * size;
+
+ PCI_DPRINTF("%02d:%02d.%d: (vf %d) found config reg %d, "
+ "size 0x%lx addr 0x%lx\n",
+ pci_bus_num(d->bus), d->devfn >> 3, d->devfn & 7,
+ vf_num, reg, size, new_addr);
+ }
+ if (reg != PCI_ROM_SLOT) {
+ /* Preserve the rom enable bit */
+ new_addr &= ~(size - 1);
+ }
+ return new_addr;
+}
+
+pcibus_t pci_bar_address(PCIDevice *d,
+ int reg, uint8_t type, pcibus_t size)
{
pcibus_t new_addr, last_addr;
- int bar = pci_bar(d, reg);
uint16_t cmd = pci_get_word(d->config + PCI_COMMAND);
if (type & PCI_BASE_ADDRESS_SPACE_IO) {
if (!(cmd & PCI_COMMAND_IO)) {
return PCI_BAR_UNMAPPED;
}
- new_addr = pci_get_long(d->config + bar) & ~(size - 1);
+ new_addr = pci_config_get_bar_addr(d, reg, type, size);
last_addr = new_addr + size - 1;
/* Check if 32 bit BAR wraps around explicitly.
* TODO: make priorities correct and remove this work around.
@@ -1043,11 +1099,7 @@ static pcibus_t pci_bar_address(PCIDevice *d,
if (!(cmd & PCI_COMMAND_MEMORY)) {
return PCI_BAR_UNMAPPED;
}
- if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
- new_addr = pci_get_quad(d->config + bar);
- } else {
- new_addr = pci_get_long(d->config + bar);
- }
+ new_addr = pci_config_get_bar_addr(d, reg, type, size);
/* the ROM slot has a specific enable bit */
if (reg == PCI_ROM_SLOT && !(new_addr & PCI_ROM_ADDRESS_ENABLE)) {
return PCI_BAR_UNMAPPED;
@@ -1146,9 +1198,10 @@ uint32_t pci_default_read_config(PCIDevice *d,
return le32_to_cpu(val);
}
-void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
+void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in,
int l)
{
int i, was_irq_disabled = pci_irq_disabled(d);
+ uint32_t val = val_in;
for (i = 0; i < l; val >>= 8, ++i) {
uint8_t wmask = d->wmask[addr + i];
@@ -1170,8 +1223,9 @@ void pci_default_write_config(PCIDevice *d, uint32_t
addr, uint32_t val, int l)
& PCI_COMMAND_MASTER);
}
- msi_write_config(d, addr, val, l);
- msix_write_config(d, addr, val, l);
+ msi_write_config(d, addr, val_in, l);
+ msix_write_config(d, addr, val_in, l);
+ pcie_sriov_config_write(d, addr, val_in, l);
}
/***********************************************************/
@@ -1776,7 +1830,6 @@ static int pci_qdev_init(DeviceState *qdev)
is_default_rom = true;
}
pci_add_option_rom(pci_dev, is_default_rom);
-
return 0;
}
diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index 6cb6e0c..b7af693 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -254,7 +254,7 @@ void pcie_cap_slot_hotplug_cb(HotplugHandler *hotplug_dev,
DeviceState *dev,
* Right now, only a device of function = 0 is allowed to be
* hot plugged/unplugged.
*/
- assert(PCI_FUNC(pci_dev->devfn) == 0);
+ assert(PCI_FUNC(pci_dev->devfn) == 0 || pci_dev->exp.is_vf);
pci_word_test_and_set_mask(exp_cap + PCI_EXP_SLTSTA,
PCI_EXP_SLTSTA_PDS);
@@ -266,10 +266,11 @@ void pcie_cap_slot_hot_unplug_cb(HotplugHandler
*hotplug_dev, DeviceState *dev,
Error **errp)
{
uint8_t *exp_cap;
+ PCIDevice *pdev = PCI_DEVICE(hotplug_dev);
- pcie_cap_slot_hotplug_common(PCI_DEVICE(hotplug_dev), dev, &exp_cap, errp);
+ pcie_cap_slot_hotplug_common(pdev, dev, &exp_cap, errp);
- pcie_cap_slot_push_attention_button(PCI_DEVICE(hotplug_dev));
+ pcie_cap_slot_push_attention_button(pdev);
}
/* pci express slot for pci express root/downstream port
@@ -409,7 +410,7 @@ void pcie_cap_slot_write_config(PCIDevice *dev,
}
/*
- * If the slot is polulated, power indicator is off and power
+ * If the slot is populated, power indicator is off and power
* controller is off, it is safe to detach the devices.
*/
if ((sltsta & PCI_EXP_SLTSTA_PDS) && (val & PCI_EXP_SLTCTL_PCC) &&
@@ -633,3 +634,199 @@ void pcie_ari_init(PCIDevice *dev, uint16_t offset,
uint16_t nextfn)
offset, PCI_ARI_SIZEOF);
pci_set_long(dev->config + offset + PCI_ARI_CAP, (nextfn & 0xff) << 8);
}
+
+
+
+/* SR/IOV */
+void pcie_sriov_init(PCIDevice *dev, uint16_t offset,
+ const char *vfname, uint16_t vf_dev_id,
+ uint16_t init_vfs, uint16_t total_vfs)
+{
+ uint8_t *cfg = dev->config + offset;
+ uint8_t *wmask;
+ pcie_add_capability(dev, PCI_EXT_CAP_ID_SRIOV, 1,
+ offset, PCI_EXT_CAP_SRIOV_SIZEOF);
+ dev->exp.sriov_cap = offset;
+ dev->exp.num_vfs = 0;
+ dev->exp.vfname = g_strdup(vfname);
+ dev->exp.vf = NULL;
+
+ /* set some sensible defaults - devices can override later */
+ pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, 0x1);
+ pci_set_word(cfg + PCI_SRIOV_VF_STRIDE, 0x1);
+ pci_set_word(cfg + PCI_SRIOV_SUP_PGSIZE, 0x553);
+ pci_set_word(cfg + PCI_SRIOV_SYS_PGSIZE, 0x1);
+
+ /* Set up device ID and initial/total number of VFs available */
+ pci_set_word(cfg + PCI_SRIOV_VF_DID, vf_dev_id);
+ pci_set_word(cfg + PCI_SRIOV_INITIAL_VF, init_vfs);
+ pci_set_word(cfg + PCI_SRIOV_TOTAL_VF, total_vfs);
+
+ /* Write enable control bits */
+ wmask = dev->wmask + offset;
+ pci_set_word(wmask + PCI_SRIOV_CTRL,
+ PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE | PCI_SRIOV_CTRL_ARI);
+ pci_set_word(wmask + PCI_SRIOV_NUM_VF, 0xffff);
+
+ qdev_prop_set_bit(&dev->qdev, "multifunction", true);
+}
+
+
+void pcie_sriov_exit(PCIDevice *dev)
+{
+ PCIE_DPRINTF("\n");
+ pcie_sriov_reset_vfs(dev);
+}
+
+void pcie_sriov_init_bar(PCIDevice *dev, int region_num,
+ uint8_t type, dma_addr_t size)
+{
+ uint32_t addr;
+ uint64_t wmask;
+ uint16_t sriov_cap = dev->exp.sriov_cap;
+
+ assert(sriov_cap > 0);
+ assert(region_num >= 0);
+ assert(region_num < PCI_NUM_REGIONS);
+ assert(region_num != PCI_ROM_SLOT);
+
+ wmask = ~(size - 1);
+ addr = sriov_cap + PCI_SRIOV_BAR + region_num * 4;
+
+ pci_set_long(dev->config + addr, type);
+ if (!(type & PCI_BASE_ADDRESS_SPACE_IO) &&
+ type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+ pci_set_quad(dev->wmask + addr, wmask);
+ pci_set_quad(dev->cmask + addr, ~0ULL);
+ } else {
+ pci_set_long(dev->wmask + addr, wmask & 0xffffffff);
+ pci_set_long(dev->cmask + addr, 0xffffffff);
+ }
+ dev->exp.vf_bar_type[region_num] = type;
+}
+
+void pcie_register_vf_bar(PCIDevice *dev, int region_num,
+ MemoryRegion *memory)
+{
+ PCIIORegion *r;
+ uint8_t type;
+ pcibus_t size = memory_region_size(memory);
+
+ assert(dev->exp.is_vf); /* PFs must use pci_register_bar */
+ assert(region_num >= 0);
+ assert(region_num < PCI_NUM_REGIONS);
+ type = dev->exp.pf->exp.vf_bar_type[region_num];
+
+ if (size & (size-1)) {
+ fprintf(stderr, "ERROR: PCI region size must be pow2 "
+ "type=0x%x, size=0x%"FMT_PCIBUS"\n", type, size);
+ exit(1);
+ }
+
+ r = &dev->io_regions[region_num];
+ r->memory = memory;
+ r->address_space =
+ type & PCI_BASE_ADDRESS_SPACE_IO
+ ? dev->bus->address_space_io
+ : dev->bus->address_space_mem;
+ r->size = size;
+ r->type = type;
+
+ r->addr = pci_bar_address(dev, region_num, r->type, r->size);
+ if (r->addr != PCI_BAR_UNMAPPED) {
+ memory_region_add_subregion_overlap(r->address_space,
+ r->addr, r->memory, 1);
+ }
+}
+
+
+static PCIDevice *pcie_create_vf(PCIDevice *pf, int devfn, const char *name)
+{
+ int ret;
+ PCIDevice *dev = pci_create(pf->bus, devfn, name);
+ dev->exp.is_vf = true;
+ dev->exp.pf = pf;
+
+ ret = qdev_init(&dev->qdev);
+ if (ret)
+ return NULL;
+
+ /* set vid/did according to sr/iov spec - they are not used */
+ pci_config_set_vendor_id(dev->config, 0xffff);
+ pci_config_set_device_id(dev->config, 0xffff);
+ return dev;
+}
+
+
+void pcie_sriov_create_vfs(PCIDevice *dev)
+{
+ uint16_t num_vfs;
+ uint16_t i;
+ int32_t devfn = dev->devfn + 1;
+ uint16_t sriov_cap = dev->exp.sriov_cap;
+
+ assert(sriov_cap > 0);
+ num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
+
+ dev->exp.vf = g_malloc(sizeof(PCIDevice*) * num_vfs);
+ assert(dev->exp.vf);
+
+ PCIE_DEV_PRINTF(dev, "creating %d vf devs\n", num_vfs);
+ for (i = 0; i < num_vfs; i++) {
+ dev->exp.vf[i] = pcie_create_vf(dev, devfn++, dev->exp.vfname);
+ if (!dev->exp.vf[i]) {
+ PCIE_DEV_PRINTF(dev, "Failed to create VF %d\n", i);
+ num_vfs = i;
+ break;
+ }
+ }
+ dev->exp.num_vfs = num_vfs;
+}
+
+
+void pcie_sriov_reset_vfs(PCIDevice *dev)
+{
+ Error *local_err = NULL;
+ uint16_t num_vfs = dev->exp.num_vfs;
+ uint16_t i;
+ PCIE_DEV_PRINTF(dev, "Resetting %d vf devs\n", num_vfs);
+ for (i = 0; i < num_vfs; i++) {
+ qdev_unplug(&dev->exp.vf[i]->qdev, &local_err);
+ if (local_err) {
+ fprintf(stderr, "Failed to unplug: %s\n",
+ error_get_pretty(local_err));
+ error_free(local_err);
+ }
+ }
+ dev->exp.num_vfs = 0;
+}
+
+
+void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, uint32_t val,
int len)
+{
+ uint32_t off;
+ uint16_t sriov_cap = dev->exp.sriov_cap;
+
+ if (!sriov_cap || address < sriov_cap) {
+ return;
+ }
+ off = address - sriov_cap;
+ if (off >= PCI_EXT_CAP_SRIOV_SIZEOF) {
+ return;
+ }
+
+ PCIE_DEV_PRINTF(dev, "cap at 0x%x sriov offset 0x%x val 0x%x len %d\n",
+ sriov_cap, off, val, len);
+
+ if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
+ if (dev->exp.num_vfs) {
+ if (!(val & PCI_SRIOV_CTRL_VFE)) {
+ pcie_sriov_reset_vfs(dev);
+ }
+ } else {
+ if (val & PCI_SRIOV_CTRL_VFE) {
+ pcie_sriov_create_vfs(dev);
+ }
+ }
+ }
+}
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index c352c7b..d36c68d 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -11,8 +11,6 @@
/* PCI includes legacy ISA access. */
#include "hw/isa/isa.h"
-#include "hw/pci/pcie.h"
-
/* PCI bus */
#define PCI_DEVFN(slot, func) ((((slot) & 0x1f) << 3) | ((func) & 0x07))
@@ -127,6 +125,7 @@ enum {
#define QEMU_PCI_VGA_IO_HI_SIZE 0x20
#include "hw/pci/pci_regs.h"
+#include "hw/pci/pcie.h"
/* PCI HEADER_TYPE */
#define PCI_HEADER_TYPE_MULTI_FUNCTION 0x80
@@ -413,6 +412,9 @@ typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *,
int);
AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque);
+pcibus_t pci_bar_address(PCIDevice *d,
+ int reg, uint8_t type, pcibus_t size);
+
static inline void
pci_set_byte(uint8_t *config, uint8_t val)
{
diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
index d139d58..b34d831 100644
--- a/include/hw/pci/pcie.h
+++ b/include/hw/pci/pcie.h
@@ -74,6 +74,15 @@ struct PCIExpressDevice {
/* AER */
uint16_t aer_cap;
PCIEAERLog aer_log;
+
+ /* SR/IOV */
+ uint16_t sriov_cap;
+ uint16_t num_vfs; /* Number of virtual functions created */
+ bool is_vf; /* Set if this device is a virtual function */
+ const char *vfname; /* Reference to the device type used for the
VFs */
+ PCIDevice **vf; /* Pointer to an array of num_vfs VF devices */
+ PCIDevice *pf; /* Pointer back to owner physical function */
+ uint8_t vf_bar_type[PCI_NUM_REGIONS]; /* Store type for each VF bar */
};
#define COMPAT_PROP_PCP "power_controller_present"
@@ -115,6 +124,23 @@ void pcie_add_capability(PCIDevice *dev,
uint16_t offset, uint16_t size);
void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn);
+void pcie_sriov_init(PCIDevice *dev, uint16_t offset,
+ const char *vfname, uint16_t vf_dev_id,
+ uint16_t init_vfs, uint16_t total_vfs);
+void pcie_sriov_exit(PCIDevice *dev);
+
+/* Set up a VF bar in the SR/IOV bar area */
+void pcie_sriov_init_bar(PCIDevice *s, int region_num,
+ uint8_t type, dma_addr_t size);
+
+/* Instantiate a bar for a VF */
+void pcie_register_vf_bar(PCIDevice *pci_dev, int region_num,
+ MemoryRegion *memory);
+
+void pcie_sriov_create_vfs(PCIDevice *dev);
+void pcie_sriov_reset_vfs(PCIDevice *dev);
+void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
+ uint32_t val, int len);
extern const VMStateDescription vmstate_pcie_device;
--
1.9.0