qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v3 6/9] rocker: add new rocker switch device


From: Jason Wang
Subject: Re: [Qemu-devel] [PATCH v3 6/9] rocker: add new rocker switch device
Date: Fri, 16 Jan 2015 17:15:52 +0800
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Thunderbird/31.3.0

On 01/11/2015 11:57 AM, address@hidden wrote:
> From: Scott Feldman <address@hidden>
>
> Rocker is a simulated ethernet switch device.  The device supports up to 62
> front-panel ports and supports L2 switching and L3 routing functions, as well
> as L2/L3/L4 ACLs.  The device presents a single PCI device for each switch,
> with a memory-mapped register space for device driver access.
>
> Rocker device is invoked with -device, for example a 4-port switch:
>
>   -device rocker,name=sw1,len-ports=4,ports[0]=dev0,ports[1]=dev1, \
>          ports[2]=dev2,ports[3]=dev3
>
> Each port is a netdev and can be paired with using -netdev id=<port name>.
>
> Signed-off-by: Scott Feldman <address@hidden>
> Signed-off-by: Jiri Pirko <address@hidden>
> ---

Looks like the device does not support state saving. How about tagging
it as unmigratable first?
>  default-configs/pci.mak       |    1 +
>  hw/net/Makefile.objs          |    3 +
>  hw/net/rocker/rocker.c        | 1395 +++++++++++++++++++++++++
>  hw/net/rocker/rocker.h        |   76 ++
>  hw/net/rocker/rocker_desc.c   |  379 +++++++
>  hw/net/rocker/rocker_desc.h   |   57 +
>  hw/net/rocker/rocker_fp.c     |  232 +++++
>  hw/net/rocker/rocker_fp.h     |   53 +
>  hw/net/rocker/rocker_hw.h     |  475 +++++++++
>  hw/net/rocker/rocker_of_dpa.c | 2314 
> +++++++++++++++++++++++++++++++++++++++++
>  hw/net/rocker/rocker_of_dpa.h |   25 +
>  hw/net/rocker/rocker_tlv.h    |  244 +++++
>  hw/net/rocker/rocker_world.c  |  108 ++
>  hw/net/rocker/rocker_world.h  |   63 ++
>  14 files changed, 5425 insertions(+)
>  create mode 100644 hw/net/rocker/rocker.c
>  create mode 100644 hw/net/rocker/rocker.h
>  create mode 100644 hw/net/rocker/rocker_desc.c
>  create mode 100644 hw/net/rocker/rocker_desc.h
>  create mode 100644 hw/net/rocker/rocker_fp.c
>  create mode 100644 hw/net/rocker/rocker_fp.h
>  create mode 100644 hw/net/rocker/rocker_hw.h
>  create mode 100644 hw/net/rocker/rocker_of_dpa.c
>  create mode 100644 hw/net/rocker/rocker_of_dpa.h
>  create mode 100644 hw/net/rocker/rocker_tlv.h
>  create mode 100644 hw/net/rocker/rocker_world.c
>  create mode 100644 hw/net/rocker/rocker_world.h
[...]
> +
> +typedef struct rocker {
> +    /* private */
> +    PCIDevice parent_obj;
> +    /* public */
> +
> +    MemoryRegion mmio;
> +    MemoryRegion msix_bar;
> +
> +    /* switch configuration */
> +    char *name;                  /* switch name */
> +    uint32_t fp_ports;           /* front-panel port count */
> +    NICPeers *fp_ports_peers;
> +    MACAddr fp_start_macaddr;    /* front-panel port 0 mac addr */
> +    uint64_t switch_id;          /* switch id */

Looks like the functions of name and switch_id are duplicated?
> +
> +    /* front-panel ports */
> +    FpPort *fp_port[ROCKER_FP_PORTS_MAX];
> +
> +    /* register backings */
> +    uint32_t test_reg;
> +    uint64_t test_reg64;
> +    dma_addr_t test_dma_addr;
> +    uint32_t test_dma_size;
> +
> +    /* desc rings */
> +    DescRing **rings;
> +
> +    /* switch worlds */
> +    World *worlds[ROCKER_WORLD_TYPE_MAX];
> +    World *world_dflt;
> +
> +    QLIST_ENTRY(rocker) next;
> +} Rocker;
> +
[...]
> +
> +static int tx_consume(Rocker *r, DescInfo *info)
> +{
> +    PCIDevice *dev = PCI_DEVICE(r);
> +    char *buf = desc_get_buf(info, true);
> +    RockerTlv *tlv_frag;
> +    RockerTlv *tlvs[ROCKER_TLV_TX_MAX + 1];
> +    struct iovec iov[ROCKER_TX_FRAGS_MAX] = { { 0, }, };
> +    uint32_t pport;
> +    uint32_t port;
> +    uint16_t tx_offload = ROCKER_TX_OFFLOAD_NONE;
> +    uint16_t tx_l3_csum_off = 0;
> +    uint16_t tx_tso_mss = 0;
> +    uint16_t tx_tso_hdr_len = 0;
> +    int iovcnt = 0;
> +    int err = 0;
> +    int rem;
> +    int i;
> +
> +    if (!buf) {
> +        return -ENXIO;
> +    }
> +
> +    rocker_tlv_parse(tlvs, ROCKER_TLV_TX_MAX, buf, desc_tlv_size(info));
> +
> +    if (!tlvs[ROCKER_TLV_TX_FRAGS]) {
> +        return -EINVAL;
> +    }
> +
> +    pport = rocker_get_pport_by_tx_ring(r, desc_get_ring(info));
> +    if (!fp_port_from_pport(pport, &port)) {
> +        return -EINVAL;
> +    }
> +
> +    if (tlvs[ROCKER_TLV_TX_OFFLOAD]) {
> +        tx_offload = rocker_tlv_get_u8(tlvs[ROCKER_TLV_TX_OFFLOAD]);
> +    }
> +
> +    switch (tx_offload) {
> +    case ROCKER_TX_OFFLOAD_L3_CSUM:
> +        if (!tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]) {
> +            return -EINVAL;
> +        }
> +    case ROCKER_TX_OFFLOAD_TSO:
> +        if (!tlvs[ROCKER_TLV_TX_TSO_MSS] ||
> +            !tlvs[ROCKER_TLV_TX_TSO_HDR_LEN]) {
> +            return -EINVAL;
> +        }
> +    }
> +
> +    if (tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]) {
> +        tx_l3_csum_off = 
> rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]);
> +    }
> +
> +    if (tlvs[ROCKER_TLV_TX_TSO_MSS]) {
> +        tx_tso_mss = rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_TSO_MSS]);
> +    }
> +
> +    if (tlvs[ROCKER_TLV_TX_TSO_HDR_LEN]) {
> +        tx_tso_hdr_len = 
> rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_TSO_HDR_LEN]);
> +    }
> +
> +    rocker_tlv_for_each_nested(tlv_frag, tlvs[ROCKER_TLV_TX_FRAGS], rem) {
> +        hwaddr frag_addr;
> +        uint16_t frag_len;
> +
> +        if (rocker_tlv_type(tlv_frag) != ROCKER_TLV_TX_FRAG) {
> +            return -EINVAL;
> +        }
> +
> +        rocker_tlv_parse_nested(tlvs, ROCKER_TLV_TX_FRAG_ATTR_MAX, tlv_frag);
> +
> +        if (!tlvs[ROCKER_TLV_TX_FRAG_ATTR_ADDR] ||
> +            !tlvs[ROCKER_TLV_TX_FRAG_ATTR_LEN]) {
> +            return -EINVAL;
> +        }
> +
> +        frag_addr = rocker_tlv_get_le64(tlvs[ROCKER_TLV_TX_FRAG_ATTR_ADDR]);
> +        frag_len = rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_FRAG_ATTR_LEN]);
> +
> +        iov[iovcnt].iov_len = frag_len;
> +        iov[iovcnt].iov_base = g_malloc(frag_len);
> +        if (!iov[iovcnt].iov_base) {
> +            err = -ENOMEM;
> +            goto err_no_mem;
> +        }
> +
> +        if (pci_dma_read(dev, frag_addr, iov[iovcnt].iov_base,
> +                     iov[iovcnt].iov_len)) {
> +            err = -ENXIO;
> +            goto err_bad_io;
> +        }
> +
> +        if (++iovcnt > ROCKER_TX_FRAGS_MAX) {
> +            goto err_too_many_frags;
> +        }
> +    }
> +
> +    if (iovcnt) {
> +        /* XXX perform Tx offloads */
> +        /* XXX   silence compiler for now */

Need to add TSO here (e1000 is a good reference) or disable TSO in the driver.
> +        tx_l3_csum_off += tx_tso_mss = tx_tso_hdr_len = 0;
> +    }
> +
> +    err = fp_port_eg(r->fp_port[port], iov, iovcnt);
> +
> +err_no_mem:
> +err_bad_io:
> +err_too_many_frags:
> +    for (i = 0; i < iovcnt; i++) {
> +        if (iov[i].iov_base) {
> +            g_free(iov[i].iov_base);
> +        }
> +    }
> +
> +    return err;
> +}
> +

[...]
> +
> +int rocker_event_link_changed(Rocker *r, uint32_t pport, bool link_up)
> +{
> +    DescRing *ring = r->rings[ROCKER_RING_EVENT];
> +    DescInfo *info = desc_ring_fetch_desc(ring);
> +    RockerTlv *nest;
> +    char *buf;
> +    size_t tlv_size;
> +    int pos;
> +    int err;
> +
> +    if (!info) {
> +        return -ENOBUFS;
> +    }
> +
> +    tlv_size = rocker_tlv_total_size(sizeof(uint16_t)) +  /* event type */
> +               rocker_tlv_total_size(0) +                 /* nest */
> +               rocker_tlv_total_size(sizeof(uint32_t)) +  /*   pport */
> +               rocker_tlv_total_size(sizeof(uint8_t));    /*   link up */
> +
> +    if (tlv_size > desc_buf_size(info)) {
> +        err = -EMSGSIZE;
> +        goto err_too_big;
> +    }
> +
> +    buf = desc_get_buf(info, false);
> +    if (!buf) {
> +        err = -ENOMEM;
> +        goto err_no_mem;
> +    }
> +
> +    pos = 0;
> +    rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_TYPE,
> +                        ROCKER_TLV_EVENT_TYPE_LINK_CHANGED);
> +    nest = rocker_tlv_nest_start(buf, &pos, ROCKER_TLV_EVENT_INFO);
> +    rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_LINK_CHANGED_PPORT, 
> pport);
> +    rocker_tlv_put_u8(buf, &pos, ROCKER_TLV_EVENT_LINK_CHANGED_LINKUP,
> +                      link_up ? 1 : 0);

Looks like those types are not documented.
> +    rocker_tlv_nest_end(buf, &pos, nest);
> +
> +    err = desc_set_buf(info, tlv_size);
> +
> +err_too_big:
> +err_no_mem:
> +    if (desc_ring_post_desc(ring, err)) {
> +        rocker_msix_irq(r, ROCKER_MSIX_VEC_EVENT);
> +    }
> +
> +    return err;
> +}
> +
> +int rocker_event_mac_vlan_seen(Rocker *r, uint32_t pport, uint8_t *addr,
> +                               uint16_t vlan_id)
> +{
> +    DescRing *ring = r->rings[ROCKER_RING_EVENT];
> +    DescInfo *info;
> +    FpPort *fp_port;
> +    uint32_t port;
> +    RockerTlv *nest;
> +    char *buf;
> +    size_t tlv_size;
> +    int pos;
> +    int err;
> +
> +    if (!fp_port_from_pport(pport, &port)) {
> +        return -EINVAL;
> +    }
> +    fp_port = r->fp_port[port];
> +    if (!fp_port_get_learning(fp_port)) {
> +        return 0;
> +    }
> +
> +    info = desc_ring_fetch_desc(ring);
> +    if (!info) {
> +        return -ENOBUFS;
> +    }
> +
> +    tlv_size = rocker_tlv_total_size(sizeof(uint16_t)) +  /* event type */
> +               rocker_tlv_total_size(0) +                 /* nest */
> +               rocker_tlv_total_size(sizeof(uint32_t)) +  /*   pport */
> +               rocker_tlv_total_size(ETH_ALEN) +          /*   mac addr */
> +               rocker_tlv_total_size(sizeof(uint16_t));   /*   vlan_id */
> +
> +    if (tlv_size > desc_buf_size(info)) {
> +        err = -EMSGSIZE;
> +        goto err_too_big;
> +    }
> +
> +    buf = desc_get_buf(info, false);
> +    if (!buf) {
> +        err = -ENOMEM;
> +        goto err_no_mem;
> +    }
> +
> +    pos = 0;
> +    rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_TYPE,
> +                        ROCKER_TLV_EVENT_TYPE_MAC_VLAN_SEEN);
> +    nest = rocker_tlv_nest_start(buf, &pos, ROCKER_TLV_EVENT_INFO);
> +    rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_MAC_VLAN_PPORT, pport);
> +    rocker_tlv_put(buf, &pos, ROCKER_TLV_EVENT_MAC_VLAN_MAC, ETH_ALEN, addr);
> +    rocker_tlv_put_u16(buf, &pos, ROCKER_TLV_EVENT_MAC_VLAN_VLAN_ID, 
> vlan_id);

Undocumented types.
> +    rocker_tlv_nest_end(buf, &pos, nest);
> +
> +    err = desc_set_buf(info, tlv_size);
> +
> +err_too_big:
> +err_no_mem:
> +    if (desc_ring_post_desc(ring, err)) {
> +        rocker_msix_irq(r, ROCKER_MSIX_VEC_EVENT);
> +    }
> +
> +    return err;
> +}
> +
> +static DescRing *rocker_get_rx_ring_by_pport(Rocker *r,
> +                                                     uint32_t pport)
> +{
> +    return r->rings[(pport - 1) * 2 + 3];
> +}
> +
> +int rx_produce(World *world, uint32_t pport,
> +               const struct iovec *iov, int iovcnt)
> +{
> +    Rocker *r = world_rocker(world);
> +    PCIDevice *dev = (PCIDevice *)r;
> +    DescRing *ring = rocker_get_rx_ring_by_pport(r, pport);
> +    DescInfo *info = desc_ring_fetch_desc(ring);
> +    char *data;
> +    size_t data_size = iov_size(iov, iovcnt);
> +    char *buf;
> +    uint16_t rx_flags = 0;
> +    uint16_t rx_csum = 0;
> +    size_t tlv_size;
> +    RockerTlv *tlvs[ROCKER_TLV_RX_MAX + 1];
> +    hwaddr frag_addr;
> +    uint16_t frag_max_len;
> +    int pos;
> +    int err;
> +
> +    if (!info) {
> +        return -ENOBUFS;
> +    }
> +
> +    buf = desc_get_buf(info, false);
> +    if (!buf) {
> +        err = -ENXIO;
> +        goto out;
> +    }
> +    rocker_tlv_parse(tlvs, ROCKER_TLV_RX_MAX, buf, desc_tlv_size(info));
> +
> +    if (!tlvs[ROCKER_TLV_RX_FRAG_ADDR] ||
> +        !tlvs[ROCKER_TLV_RX_FRAG_MAX_LEN]) {

This conflicts with the spec which only describes RX_PACKET.
> +        err = -EINVAL;
> +        goto out;
> +    }
> +
> +    frag_addr = rocker_tlv_get_le64(tlvs[ROCKER_TLV_RX_FRAG_ADDR]);
> +    frag_max_len = rocker_tlv_get_le16(tlvs[ROCKER_TLV_RX_FRAG_MAX_LEN]);
> +
> +    if (data_size > frag_max_len) {
> +        err = -EMSGSIZE;
> +        goto out;
> +    }
> +
> +    /* XXX calc rx flags/csum */
> +
> +    tlv_size = rocker_tlv_total_size(sizeof(uint16_t)) + /* flags */
> +               rocker_tlv_total_size(sizeof(uint16_t)) + /* scum */
> +               rocker_tlv_total_size(sizeof(uint64_t)) + /* frag addr */
> +               rocker_tlv_total_size(sizeof(uint16_t)) + /* frag max len */
> +               rocker_tlv_total_size(sizeof(uint16_t));  /* frag len */
> +
> +    if (tlv_size > desc_buf_size(info)) {
> +        err = -EMSGSIZE;
> +        goto out;
> +    }
> +
> +    /* TODO:
> +     * iov dma write can be optimized in similar way e1000 does it in
> +     * e1000_receive_iov. But maybe if would make sense to introduce
> +     * generic helper iov_dma_write.
> +     */
> +
> +    data = g_malloc(data_size);
> +    if (!data) {
> +        err = -ENOMEM;
> +        goto out;
> +    }
> +    iov_to_buf(iov, iovcnt, 0, data, data_size);
> +    pci_dma_write(dev, frag_addr, data, data_size);
> +    g_free(data);
> +
> +    pos = 0;
> +    rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_FLAGS, rx_flags);
> +    rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_CSUM, rx_csum);

Note: Some backends (e.g. tap) can do offloading; consider adding
support for it in the future.
> +    rocker_tlv_put_le64(buf, &pos, ROCKER_TLV_RX_FRAG_ADDR, frag_addr);
> +    rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_FRAG_MAX_LEN, frag_max_len);

The above 2 types are not documented. Please make sure all types are
documented.
> +    rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_FRAG_LEN, data_size);
> +
> +    err = desc_set_buf(info, tlv_size);
> +
> +out:
> +    if (desc_ring_post_desc(ring, err)) {
> +        rocker_msix_irq(r, ROCKER_MSIX_VEC_RX(pport - 1));
> +    }
> +
> +    return err;
> +}
> +
[...]
> +
> +bool fp_port_from_pport(uint32_t pport, uint32_t *port)
> +{
> +    if (pport < 1 || pport > ROCKER_FP_PORTS_MAX) {
> +        return false;
> +    }
> +    *port = pport - 1;
> +    return true;
> +}
> +
> +int fp_port_eg(FpPort *port, const struct iovec *iov, int iovcnt)
> +{
> +    NetClientState *nc = qemu_get_queue(port->nic);

Using only 1 queue is OK for a multiqueue backend. In the future, consider
using all of them.
> +
> +    if (port->enabled) {
> +        qemu_sendv_packet(nc, iov, iovcnt);
> +    }
> +
> +    return 0;
> +}
> +
[...]

Thanks



reply via email to

[Prev in Thread] Current Thread [Next in Thread]