qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v5 15/16] spapr_pci: enable basic hotplug operat


From: David Gibson
Subject: Re: [Qemu-devel] [PATCH v5 15/16] spapr_pci: enable basic hotplug operations
Date: Wed, 25 Feb 2015 14:11:29 +1100
User-agent: Mutt/1.5.23 (2014-03-12)

On Mon, Feb 16, 2015 at 08:27:51AM -0600, Michael Roth wrote:
> This enables hotplug for PHB bridges. Upon hotplug we generate the

"PCI Host Bridge bridges" :-p

> OF-nodes required by PAPR specification and IEEE 1275-1994
> "PCI Bus Binding to Open Firmware" for the device.
> 
> We associate the corresponding FDT for these nodes with the DrcEntry
> corresponding to the slot, which will be fetched via
> ibm,configure-connector RTAS calls by the guest as described by PAPR
> specification. The FDT is cleaned up in the case of unplug.
> 
> Signed-off-by: Michael Roth <address@hidden>
> ---
>  hw/ppc/spapr_pci.c | 391 
> ++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 371 insertions(+), 20 deletions(-)
> 
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 6a3d917..b9af1cd 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -33,8 +33,11 @@
>  #include <libfdt.h>
>  #include "trace.h"
>  #include "qemu/error-report.h"
> +#include "qapi/qmp/qerror.h"
>  
>  #include "hw/pci/pci_bus.h"
> +#include "hw/ppc/spapr_drc.h"
> +#include "sysemu/device_tree.h"
>  
>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
>  #define RTAS_QUERY_FN           0
> @@ -47,7 +50,13 @@
>  #define RTAS_TYPE_MSI           1
>  #define RTAS_TYPE_MSIX          2
>  
> -#include "hw/ppc/spapr_drc.h"
> +#define _FDT(exp) \
> +    do { \
> +        int ret = (exp);                                           \
> +        if (ret < 0) {                                             \
> +            return ret;                                            \
> +        }                                                          \
> +    } while (0)
>  
>  static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
>  {
> @@ -481,6 +490,359 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, 
> void *opaque, int devfn)
>      return &phb->iommu_as;
>  }
>  
> +/* Macros to operate with address in OF binding to PCI */
> +#define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
> +#define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
> +#define b_p(x)          b_x((x), 30, 1) /* 1 if prefetchable */
> +#define b_t(x)          b_x((x), 29, 1) /* 1 if the address is aliased */
> +#define b_ss(x)         b_x((x), 24, 2) /* the space code */
> +#define b_bbbbbbbb(x)   b_x((x), 16, 8) /* bus number */
> +#define b_ddddd(x)      b_x((x), 11, 5) /* device number */
> +#define b_fff(x)        b_x((x), 8, 3)  /* function number */
> +#define b_rrrrrrrr(x)   b_x((x), 0, 8)  /* register number */
> +/* for 'reg'/'assigned-addresses' OF properties */
> +#define RESOURCE_CELLS_SIZE 2
> +#define RESOURCE_CELLS_ADDRESS 3
> +
> +typedef struct ResourceFields {
> +    uint32_t phys_hi;
> +    uint32_t phys_mid;
> +    uint32_t phys_lo;
> +    uint32_t size_hi;
> +    uint32_t size_lo;
> +} ResourceFields;

Hrm, 5*32-bit ints, that probably needs a ((packed)) on at least some
platforms to be safe.

> +typedef struct ResourceProps {
> +    union {
> +        ResourceFields fields[8];
> +        uint8_t data[sizeof(ResourceFields) * 8];
> +    } reg;
> +    union {
> +        ResourceFields fields[7];
> +        uint8_t data[sizeof(ResourceFields) * 7];
> +    } assigned;
> +    uint32_t reg_len;
> +    uint32_t assigned_len;
> +} ResourceProps;

I'm a bit dubious about this union, rather than just casting when you
need the bytestring version.

> +
> +/* fill in the 'reg'/'assigned-resources' OF properties for
> + * a PCI device. 'reg' describes resource requirements for a
> + * device's IO/MEM regions, 'assigned-addresses' describes the
> + * actual resource assignments.
> + *
> + * the properties are arrays of ('phys-addr', 'size') pairs describing
> + * the addressable regions of the PCI device, where 'phys-addr' is a
> + * RESOURCE_CELLS_ADDRESS-tuple of 32-bit integers corresponding to
> + * (phys.hi, phys.mid, phys.lo), and 'size' is a
> + * RESOURCE_CELLS_SIZE-tuple corresponding to (size.hi, size.lo).
> + * phys.hi = 0xYYXXXXZZ, where:
> + *   0xYY = npt000ss
> + *          |||   |
> + *          |||   +-- space code: 1 if IO region, 2 if MEM region
> + *          ||+------ for non-relocatable IO: 1 if aliased
> + *          ||        for relocatable IO: 1 if below 64KB
> + *          ||        for MEM: 1 if below 1MB
> + *          |+------- 1 if region is prefetchable
> + *          +-------- 1 if region is non-relocatable
> + *   0xXXXX = bbbbbbbb dddddfff, encoding bus, slot, and function
> + *            bits respectively
> + *   0xZZ = rrrrrrrr, the register number of the BAR corresponding
> + *          to the region
> + *
> + * phys.mid and phys.lo correspond respectively to the hi/lo portions
> + * of the actual address of the region.
> + *
> + * how the phys-addr/size values are used differ slightly between
> + * 'reg' and 'assigned-addresses' properties. namely, 'reg' has
> + * an additional description for the config space region of the
> + * device, and in the case of QEMU has n=0 and phys.mid=phys.lo=0
> + * to describe the region as relocatable, with an address-mapping
> + * that corresponds directly to the PHB's address space for the
> + * resource. 'assigned-addresses' always has n=1 set with an absolute
> + * address assigned for the resource. in general, 'assigned-addresses'
> + * won't be populated, since addresses for PCI devices are generally
> + * unmapped initially and left to the guest to assign.
> + *
> + * in accordance with PCI Bus Binding to Open Firmware,
> + * IEEE Std 1275-1994, section 4.1.1, as implemented by PAPR+ v2.7,
> + * Appendix C.
> + */
> +static void populate_resource_props(PCIDevice *d, ResourceProps *rp)
> +{
> +    int bus_num = pci_bus_num(PCI_BUS(qdev_get_parent_bus(DEVICE(d))));
> +    uint32_t dev_id = (b_bbbbbbbb(bus_num) |
> +                       b_ddddd(PCI_SLOT(d->devfn)) |
> +                       b_fff(PCI_FUNC(d->devfn)));
> +    ResourceFields *reg, *assigned;
> +    int i, reg_idx = 0, assigned_idx = 0;
> +
> +    /* config space region */
> +    reg = &rp->reg.fields[reg_idx++];
> +    reg->phys_hi = cpu_to_be32(dev_id);
> +    reg->phys_mid = 0;
> +    reg->phys_lo = 0;
> +    reg->size_hi = 0;
> +    reg->size_lo = 0;
> +
> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> +        if (!d->io_regions[i].size) {
> +            continue;
> +        }
> +
> +        reg = &rp->reg.fields[reg_idx++];
> +
> +        reg->phys_hi = cpu_to_be32(dev_id | b_rrrrrrrr(pci_bar(d, i)));
> +        if (d->io_regions[i].type & PCI_BASE_ADDRESS_SPACE_IO) {
> +            reg->phys_hi |= cpu_to_be32(b_ss(1));
> +        } else {
> +            reg->phys_hi |= cpu_to_be32(b_ss(2));
> +        }
> +        reg->phys_mid = 0;
> +        reg->phys_lo = 0;
> +        reg->size_hi = cpu_to_be32(d->io_regions[i].size >> 32);
> +        reg->size_lo = cpu_to_be32(d->io_regions[i].size);
> +
> +        if (d->io_regions[i].addr != PCI_BAR_UNMAPPED) {
> +            continue;

This has inverted sense, doesn't it?  If the BAR *is* unmapped you
want to continue, skipping the assigned-addresses stuff.

> +        }
> +
> +        assigned = &rp->assigned.fields[assigned_idx++];
> +        assigned->phys_hi = cpu_to_be32(reg->phys_hi | b_n(1));
> +        assigned->phys_mid = cpu_to_be32(d->io_regions[i].addr >> 32);
> +        assigned->phys_lo = cpu_to_be32(d->io_regions[i].addr);

Not sure if I understood the comment above properly; should these
addresses actually be translated through an mappings above the PHB?
Not that there are any such translations on sPAPR, but...

> +        assigned->size_hi = reg->size_hi;
> +        assigned->size_lo = reg->size_lo;
> +    }
> +
> +    rp->reg_len = reg_idx * sizeof(ResourceFields);
> +    rp->assigned_len = assigned_idx * sizeof(ResourceFields);
> +}
> +
> +static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
> +                                       int phb_index, int drc_index,
> +                                       const char *drc_name)
> +{
> +    ResourceProps rp;
> +    bool is_bridge = false;
> +    int pci_status;
> +
> +    if (pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==
> +        PCI_HEADER_TYPE_BRIDGE) {
> +        is_bridge = true;
> +    }
> +
> +    /* in accordance with PAPR+ v2.7 13.6.3, Table 181 */
> +    _FDT(fdt_setprop_cell(fdt, offset, "vendor-id",
> +                          pci_default_read_config(dev, PCI_VENDOR_ID, 2)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "device-id",
> +                          pci_default_read_config(dev, PCI_DEVICE_ID, 2)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "revision-id",
> +                          pci_default_read_config(dev, PCI_REVISION_ID, 1)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "class-code",
> +                          pci_default_read_config(dev, PCI_CLASS_DEVICE, 2)
> +                            << 8));
> +    if (pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)) {
> +        _FDT(fdt_setprop_cell(fdt, offset, "interrupts",
> +                 pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)));
> +    }
> +
> +    if (!is_bridge) {
> +        _FDT(fdt_setprop_cell(fdt, offset, "min-grant",
> +            pci_default_read_config(dev, PCI_MIN_GNT, 1)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "max-latency",
> +            pci_default_read_config(dev, PCI_MAX_LAT, 1)));
> +    }
> +
> +    if (pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)) {
> +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-id",
> +                 pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)));
> +    }
> +
> +    if (pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)) {
> +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-vendor-id",
> +                 pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)));
> +    }
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "cache-line-size",
> +        pci_default_read_config(dev, PCI_CACHE_LINE_SIZE, 1)));
> +
> +    /* the following fdt cells are masked off the pci status register */
> +    pci_status = pci_default_read_config(dev, PCI_STATUS, 2);
> +    _FDT(fdt_setprop_cell(fdt, offset, "devsel-speed",
> +                          PCI_STATUS_DEVSEL_MASK & pci_status));
> +
> +    if (pci_status & PCI_STATUS_FAST_BACK) {
> +        _FDT(fdt_setprop(fdt, offset, "fast-back-to-back", NULL, 0));
> +    }
> +    if (pci_status & PCI_STATUS_66MHZ) {
> +        _FDT(fdt_setprop(fdt, offset, "66mhz-capable", NULL, 0));
> +    }
> +    if (pci_status & PCI_STATUS_UDF) {
> +        _FDT(fdt_setprop(fdt, offset, "udf-supported", NULL, 0));
> +    }
> +
> +    _FDT(fdt_setprop_string(fdt, offset, "name", "pci"));

Ah.. this is a bit weird.  Explicit "name" properties are usually
omitted from fdts.  But I guess you need it, because this will get
streamed into the guest via the RTAS interfaces, rather than into the
guest's fdt parsing code (which knowws to derive name properties from
the unit name).

That probably deserves a comment.

> +    _FDT(fdt_setprop(fdt, offset, "ibm,loc-code", drc_name, 
> strlen(drc_name)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index));
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "#address-cells",
> +                          RESOURCE_CELLS_ADDRESS));
> +    _FDT(fdt_setprop_cell(fdt, offset, "#size-cells",
> +                          RESOURCE_CELLS_SIZE));
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi-x",
> +                          RESOURCE_CELLS_SIZE));
> +
> +    populate_resource_props(dev, &rp);
> +    _FDT(fdt_setprop(fdt, offset, "reg", rp.reg.data, rp.reg_len));
> +    _FDT(fdt_setprop(fdt, offset, "assigned-addresses",
> +                     rp.assigned.data, rp.assigned_len));
> +
> +    return 0;
> +}
> +
> +/* create OF node for pci device and required OF DT properties */
> +static void *spapr_create_pci_child_dt(sPAPRPHBState *phb, PCIDevice *dev,
> +                                       int drc_index, const char *drc_name,
> +                                       int *dt_offset)
> +{
> +    void *fdt;
> +    int offset, ret, fdt_size;
> +    int slot = PCI_SLOT(dev->devfn);
> +    char nodename[512];
> +
> +    fdt = create_device_tree(&fdt_size);
> +    sprintf(nodename, "address@hidden", slot);
> +    offset = fdt_add_subnode(fdt, 0, nodename);
> +    ret = spapr_populate_pci_child_dt(dev, fdt, offset, phb->index, 
> drc_index,
> +                                      drc_name);
> +    g_assert(!ret);
> +
> +    *dt_offset = offset;
> +    return fdt;
> +}
> +
> +static void spapr_phb_add_pci_device(sPAPRDRConnector *drc,
> +                                     sPAPRPHBState *phb,
> +                                     PCIDevice *pdev,
> +                                     Error **errp)
> +{
> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +    DeviceState *dev = DEVICE(pdev);
> +    int drc_index = drck->get_index(drc);
> +    const char *drc_name = drck->get_name(drc);
> +    void *fdt = NULL;
> +    int fdt_start_offset = 0;
> +
> +    /* boot-time devices get their device tree node created by SLOF, but for
> +     * hotplugged devices we need QEMU to generate it so the guest can fetch
> +     * it via RTAS
> +     */
> +    if (dev->hotplugged) {
> +        fdt = spapr_create_pci_child_dt(phb, pdev, drc_index, drc_name,
> +                                        &fdt_start_offset);
> +    }
> +
> +    drck->attach(drc, DEVICE(pdev),
> +                 fdt, fdt_start_offset, !dev->hotplugged, errp);
> +    if (errp) {
> +        g_free(fdt);
> +    }
> +}
> +
> +static void spapr_phb_remove_pci_device_cb(DeviceState *dev, void *opaque)
> +{
> +    /* some version guests do not wait for completion of a device
> +     * cleanup (generally done asynchronously by the kernel) before
> +     * signaling to QEMU that the device is safe, but instead sleep
> +     * for some 'safe' period of time. unfortunately on a busy host
> +     * this sleep isn't guaranteed to be long enough, resulting in
> +     * bad things like IRQ lines being left asserted during final
> +     * device removal. to deal with this we call reset just prior
> +     * to finalizing the device, which will put the device back into
> +     * an 'idle' state, as the device cleanup code expects.
> +     */
> +    pci_device_reset(PCI_DEVICE(dev));
> +    object_unparent(OBJECT(dev));
> +}
> +
> +static void spapr_phb_remove_pci_device(sPAPRDRConnector *drc,
> +                                        sPAPRPHBState *phb,
> +                                        PCIDevice *pdev,
> +                                        Error **errp)
> +{
> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +
> +    drck->detach(drc, DEVICE(pdev), spapr_phb_remove_pci_device_cb, phb, 
> errp);
> +}
> +
> +static void spapr_phb_hot_plug_child(HotplugHandler *plug_handler,
> +                                     DeviceState *plugged_dev, Error **errp)
> +{
> +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(DEVICE(plug_handler));
> +    PCIDevice *pdev = PCI_DEVICE(plugged_dev);
> +    sPAPRDRConnector *drc = spapr_dr_connector_by_id(
> +        SPAPR_DR_CONNECTOR_TYPE_PCI,
> +        (phb->index << 16) |
> +        (pci_bus_num(PCI_BUS(qdev_get_parent_bus(plugged_dev))) << 8) |
> +        pdev->devfn);

Helper function for that calculation might be clearer.

> +    Error *local_err = NULL;
> +
> +    /* if DR is disabled we don't need to do anything in the case of
> +     * hotplug or coldplug callbacks
> +     */
> +    if (!phb->dr_enabled) {
> +        /* if this is a hotplug operation initiated by the user
> +         * we need to let them know it's not enabled
> +         */
> +        if (plugged_dev->hotplugged) {
> +            error_set(errp, QERR_BUS_NO_HOTPLUG,
> +                      object_get_typename(OBJECT(phb)));
> +        }
> +        return;
> +    }
> +
> +    g_assert(drc);
> +
> +    spapr_phb_add_pci_device(drc, phb, pdev, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +}
> +
> +static void spapr_phb_hot_unplug_child(HotplugHandler *plug_handler,
> +                                       DeviceState *plugged_dev, Error 
> **errp)
> +{
> +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(DEVICE(plug_handler));
> +    PCIDevice *pdev = PCI_DEVICE(plugged_dev);
> +    sPAPRDRConnectorClass *drck;
> +    sPAPRDRConnector *drc = spapr_dr_connector_by_id(
> +        SPAPR_DR_CONNECTOR_TYPE_PCI,
> +        (phb->index << 16) |
> +        (pci_bus_num(PCI_BUS(qdev_get_parent_bus(plugged_dev))) << 8) |
> +        pdev->devfn);
> +    Error *local_err = NULL;
> +
> +    if (!phb->dr_enabled) {
> +        error_set(errp, QERR_BUS_NO_HOTPLUG,
> +                  object_get_typename(OBJECT(phb)));
> +        return;
> +    }
> +
> +    g_assert(drc);
> +
> +    drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +    if (!drck->release_pending(drc)) {
> +        spapr_phb_remove_pci_device(drc, phb, pdev, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +}
> +
>  static void spapr_phb_realize(DeviceState *dev, Error **errp)
>  {
>      SysBusDevice *s = SYS_BUS_DEVICE(dev);
> @@ -574,6 +936,7 @@ static void spapr_phb_realize(DeviceState *dev, Error 
> **errp)
>                             &sphb->memspace, &sphb->iospace,
>                             PCI_DEVFN(0, 0), PCI_NUM_PINS, TYPE_PCI_BUS);
>      phb->bus = bus;
> +    qbus_set_hotplug_handler(BUS(phb->bus), DEVICE(sphb), NULL);
>  
>      /*
>       * Initialize PHB address space.
> @@ -810,6 +1173,7 @@ static void spapr_phb_class_init(ObjectClass *klass, 
> void *data)
>      PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass);
>      DeviceClass *dc = DEVICE_CLASS(klass);
>      sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass);
> +    HotplugHandlerClass *hp = HOTPLUG_HANDLER_CLASS(klass);
>  
>      hc->root_bus_path = spapr_phb_root_bus_path;
>      dc->realize = spapr_phb_realize;
> @@ -819,6 +1183,8 @@ static void spapr_phb_class_init(ObjectClass *klass, 
> void *data)
>      set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
>      dc->cannot_instantiate_with_device_add_yet = false;
>      spc->finish_realize = spapr_phb_finish_realize;
> +    hp->plug = spapr_phb_hot_plug_child;
> +    hp->unplug = spapr_phb_hot_unplug_child;
>  }
>  
>  static const TypeInfo spapr_phb_info = {
> @@ -827,6 +1193,10 @@ static const TypeInfo spapr_phb_info = {
>      .instance_size = sizeof(sPAPRPHBState),
>      .class_init    = spapr_phb_class_init,
>      .class_size    = sizeof(sPAPRPHBClass),
> +    .interfaces    = (InterfaceInfo[]) {
> +        { TYPE_HOTPLUG_HANDLER },
> +        { }
> +    }
>  };
>  
>  PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, int index)
> @@ -840,17 +1210,6 @@ PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, 
> int index)
>      return PCI_HOST_BRIDGE(dev);
>  }
>  
> -/* Macros to operate with address in OF binding to PCI */
> -#define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
> -#define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
> -#define b_p(x)          b_x((x), 30, 1) /* 1 if prefetchable */
> -#define b_t(x)          b_x((x), 29, 1) /* 1 if the address is aliased */
> -#define b_ss(x)         b_x((x), 24, 2) /* the space code */
> -#define b_bbbbbbbb(x)   b_x((x), 16, 8) /* bus number */
> -#define b_ddddd(x)      b_x((x), 11, 5) /* device number */
> -#define b_fff(x)        b_x((x), 8, 3)  /* function number */
> -#define b_rrrrrrrr(x)   b_x((x), 0, 8)  /* register number */
> -
>  typedef struct sPAPRTCEDT {
>      void *fdt;
>      int node_off;
> @@ -920,14 +1279,6 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>          return bus_off;
>      }
>  
> -#define _FDT(exp) \
> -    do { \
> -        int ret = (exp);                                           \
> -        if (ret < 0) {                                             \
> -            return ret;                                            \
> -        }                                                          \
> -    } while (0)
> -
>      /* Write PHB properties */
>      _FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci"));
>      _FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB"));

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson

Attachment: pgpfYgWAK4NyT.pgp
Description: PGP signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]