[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v3 6/9] rocker: add new rocker switch device
From: |
Jason Wang |
Subject: |
Re: [Qemu-devel] [PATCH v3 6/9] rocker: add new rocker switch device |
Date: |
Fri, 16 Jan 2015 17:15:52 +0800 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Thunderbird/31.3.0 |
On 01/11/2015 11:57 AM, address@hidden wrote:
> From: Scott Feldman <address@hidden>
>
> Rocker is a simulated ethernet switch device. The device supports up to 62
> front-panel ports and supports L2 switching and L3 routing functions, as well
> as L2/L3/L4 ACLs. The device presents a single PCI device for each switch,
> with a memory-mapped register space for device driver access.
>
> Rocker device is invoked with -device, for example a 4-port switch:
>
> -device rocker,name=sw1,len-ports=4,ports[0]=dev0,ports[1]=dev1, \
> ports[2]=dev2,ports[3]=dev3
>
> Each port is a netdev and can be paired with using -netdev id=<port name>.
>
> Signed-off-by: Scott Feldman <address@hidden>
> Signed-off-by: Jiri Pirko <address@hidden>
> ---
Looks like the devices does not support state saving. How about tagging
it with unmigratable first?
> default-configs/pci.mak | 1 +
> hw/net/Makefile.objs | 3 +
> hw/net/rocker/rocker.c | 1395 +++++++++++++++++++++++++
> hw/net/rocker/rocker.h | 76 ++
> hw/net/rocker/rocker_desc.c | 379 +++++++
> hw/net/rocker/rocker_desc.h | 57 +
> hw/net/rocker/rocker_fp.c | 232 +++++
> hw/net/rocker/rocker_fp.h | 53 +
> hw/net/rocker/rocker_hw.h | 475 +++++++++
> hw/net/rocker/rocker_of_dpa.c | 2314
> +++++++++++++++++++++++++++++++++++++++++
> hw/net/rocker/rocker_of_dpa.h | 25 +
> hw/net/rocker/rocker_tlv.h | 244 +++++
> hw/net/rocker/rocker_world.c | 108 ++
> hw/net/rocker/rocker_world.h | 63 ++
> 14 files changed, 5425 insertions(+)
> create mode 100644 hw/net/rocker/rocker.c
> create mode 100644 hw/net/rocker/rocker.h
> create mode 100644 hw/net/rocker/rocker_desc.c
> create mode 100644 hw/net/rocker/rocker_desc.h
> create mode 100644 hw/net/rocker/rocker_fp.c
> create mode 100644 hw/net/rocker/rocker_fp.h
> create mode 100644 hw/net/rocker/rocker_hw.h
> create mode 100644 hw/net/rocker/rocker_of_dpa.c
> create mode 100644 hw/net/rocker/rocker_of_dpa.h
> create mode 100644 hw/net/rocker/rocker_tlv.h
> create mode 100644 hw/net/rocker/rocker_world.c
> create mode 100644 hw/net/rocker/rocker_world.h
[...]
> +
> +typedef struct rocker {
> + /* private */
> + PCIDevice parent_obj;
> + /* public */
> +
> + MemoryRegion mmio;
> + MemoryRegion msix_bar;
> +
> + /* switch configuration */
> + char *name; /* switch name */
> + uint32_t fp_ports; /* front-panel port count */
> + NICPeers *fp_ports_peers;
> + MACAddr fp_start_macaddr; /* front-panel port 0 mac addr */
> + uint64_t switch_id; /* switch id */
Looks like the function of name ad switch_id are duplicated?
> +
> + /* front-panel ports */
> + FpPort *fp_port[ROCKER_FP_PORTS_MAX];
> +
> + /* register backings */
> + uint32_t test_reg;
> + uint64_t test_reg64;
> + dma_addr_t test_dma_addr;
> + uint32_t test_dma_size;
> +
> + /* desc rings */
> + DescRing **rings;
> +
> + /* switch worlds */
> + World *worlds[ROCKER_WORLD_TYPE_MAX];
> + World *world_dflt;
> +
> + QLIST_ENTRY(rocker) next;
> +} Rocker;
> +
[...]
> +
> +static int tx_consume(Rocker *r, DescInfo *info)
> +{
> + PCIDevice *dev = PCI_DEVICE(r);
> + char *buf = desc_get_buf(info, true);
> + RockerTlv *tlv_frag;
> + RockerTlv *tlvs[ROCKER_TLV_TX_MAX + 1];
> + struct iovec iov[ROCKER_TX_FRAGS_MAX] = { { 0, }, };
> + uint32_t pport;
> + uint32_t port;
> + uint16_t tx_offload = ROCKER_TX_OFFLOAD_NONE;
> + uint16_t tx_l3_csum_off = 0;
> + uint16_t tx_tso_mss = 0;
> + uint16_t tx_tso_hdr_len = 0;
> + int iovcnt = 0;
> + int err = 0;
> + int rem;
> + int i;
> +
> + if (!buf) {
> + return -ENXIO;
> + }
> +
> + rocker_tlv_parse(tlvs, ROCKER_TLV_TX_MAX, buf, desc_tlv_size(info));
> +
> + if (!tlvs[ROCKER_TLV_TX_FRAGS]) {
> + return -EINVAL;
> + }
> +
> + pport = rocker_get_pport_by_tx_ring(r, desc_get_ring(info));
> + if (!fp_port_from_pport(pport, &port)) {
> + return -EINVAL;
> + }
> +
> + if (tlvs[ROCKER_TLV_TX_OFFLOAD]) {
> + tx_offload = rocker_tlv_get_u8(tlvs[ROCKER_TLV_TX_OFFLOAD]);
> + }
> +
> + switch (tx_offload) {
> + case ROCKER_TX_OFFLOAD_L3_CSUM:
> + if (!tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]) {
> + return -EINVAL;
> + }
> + case ROCKER_TX_OFFLOAD_TSO:
> + if (!tlvs[ROCKER_TLV_TX_TSO_MSS] ||
> + !tlvs[ROCKER_TLV_TX_TSO_HDR_LEN]) {
> + return -EINVAL;
> + }
> + }
> +
> + if (tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]) {
> + tx_l3_csum_off =
> rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]);
> + }
> +
> + if (tlvs[ROCKER_TLV_TX_TSO_MSS]) {
> + tx_tso_mss = rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_TSO_MSS]);
> + }
> +
> + if (tlvs[ROCKER_TLV_TX_TSO_HDR_LEN]) {
> + tx_tso_hdr_len =
> rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_TSO_HDR_LEN]);
> + }
> +
> + rocker_tlv_for_each_nested(tlv_frag, tlvs[ROCKER_TLV_TX_FRAGS], rem) {
> + hwaddr frag_addr;
> + uint16_t frag_len;
> +
> + if (rocker_tlv_type(tlv_frag) != ROCKER_TLV_TX_FRAG) {
> + return -EINVAL;
> + }
> +
> + rocker_tlv_parse_nested(tlvs, ROCKER_TLV_TX_FRAG_ATTR_MAX, tlv_frag);
> +
> + if (!tlvs[ROCKER_TLV_TX_FRAG_ATTR_ADDR] ||
> + !tlvs[ROCKER_TLV_TX_FRAG_ATTR_LEN]) {
> + return -EINVAL;
> + }
> +
> + frag_addr = rocker_tlv_get_le64(tlvs[ROCKER_TLV_TX_FRAG_ATTR_ADDR]);
> + frag_len = rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_FRAG_ATTR_LEN]);
> +
> + iov[iovcnt].iov_len = frag_len;
> + iov[iovcnt].iov_base = g_malloc(frag_len);
> + if (!iov[iovcnt].iov_base) {
> + err = -ENOMEM;
> + goto err_no_mem;
> + }
> +
> + if (pci_dma_read(dev, frag_addr, iov[iovcnt].iov_base,
> + iov[iovcnt].iov_len)) {
> + err = -ENXIO;
> + goto err_bad_io;
> + }
> +
> + if (++iovcnt > ROCKER_TX_FRAGS_MAX) {
> + goto err_too_many_frags;
> + }
> + }
> +
> + if (iovcnt) {
> + /* XXX perform Tx offloads */
> + /* XXX silence compiler for now */
Need add TSO here (e1000 is a good reference) or disable TSO in driver.
> + tx_l3_csum_off += tx_tso_mss = tx_tso_hdr_len = 0;
> + }
> +
> + err = fp_port_eg(r->fp_port[port], iov, iovcnt);
> +
> +err_no_mem:
> +err_bad_io:
> +err_too_many_frags:
> + for (i = 0; i < iovcnt; i++) {
> + if (iov[i].iov_base) {
> + g_free(iov[i].iov_base);
> + }
> + }
> +
> + return err;
> +}
> +
[...]
> +
> +int rocker_event_link_changed(Rocker *r, uint32_t pport, bool link_up)
> +{
> + DescRing *ring = r->rings[ROCKER_RING_EVENT];
> + DescInfo *info = desc_ring_fetch_desc(ring);
> + RockerTlv *nest;
> + char *buf;
> + size_t tlv_size;
> + int pos;
> + int err;
> +
> + if (!info) {
> + return -ENOBUFS;
> + }
> +
> + tlv_size = rocker_tlv_total_size(sizeof(uint16_t)) + /* event type */
> + rocker_tlv_total_size(0) + /* nest */
> + rocker_tlv_total_size(sizeof(uint32_t)) + /* pport */
> + rocker_tlv_total_size(sizeof(uint8_t)); /* link up */
> +
> + if (tlv_size > desc_buf_size(info)) {
> + err = -EMSGSIZE;
> + goto err_too_big;
> + }
> +
> + buf = desc_get_buf(info, false);
> + if (!buf) {
> + err = -ENOMEM;
> + goto err_no_mem;
> + }
> +
> + pos = 0;
> + rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_TYPE,
> + ROCKER_TLV_EVENT_TYPE_LINK_CHANGED);
> + nest = rocker_tlv_nest_start(buf, &pos, ROCKER_TLV_EVENT_INFO);
> + rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_LINK_CHANGED_PPORT,
> pport);
> + rocker_tlv_put_u8(buf, &pos, ROCKER_TLV_EVENT_LINK_CHANGED_LINKUP,
> + link_up ? 1 : 0);
Looks like those types are not documented.
> + rocker_tlv_nest_end(buf, &pos, nest);
> +
> + err = desc_set_buf(info, tlv_size);
> +
> +err_too_big:
> +err_no_mem:
> + if (desc_ring_post_desc(ring, err)) {
> + rocker_msix_irq(r, ROCKER_MSIX_VEC_EVENT);
> + }
> +
> + return err;
> +}
> +
> +int rocker_event_mac_vlan_seen(Rocker *r, uint32_t pport, uint8_t *addr,
> + uint16_t vlan_id)
> +{
> + DescRing *ring = r->rings[ROCKER_RING_EVENT];
> + DescInfo *info;
> + FpPort *fp_port;
> + uint32_t port;
> + RockerTlv *nest;
> + char *buf;
> + size_t tlv_size;
> + int pos;
> + int err;
> +
> + if (!fp_port_from_pport(pport, &port)) {
> + return -EINVAL;
> + }
> + fp_port = r->fp_port[port];
> + if (!fp_port_get_learning(fp_port)) {
> + return 0;
> + }
> +
> + info = desc_ring_fetch_desc(ring);
> + if (!info) {
> + return -ENOBUFS;
> + }
> +
> + tlv_size = rocker_tlv_total_size(sizeof(uint16_t)) + /* event type */
> + rocker_tlv_total_size(0) + /* nest */
> + rocker_tlv_total_size(sizeof(uint32_t)) + /* pport */
> + rocker_tlv_total_size(ETH_ALEN) + /* mac addr */
> + rocker_tlv_total_size(sizeof(uint16_t)); /* vlan_id */
> +
> + if (tlv_size > desc_buf_size(info)) {
> + err = -EMSGSIZE;
> + goto err_too_big;
> + }
> +
> + buf = desc_get_buf(info, false);
> + if (!buf) {
> + err = -ENOMEM;
> + goto err_no_mem;
> + }
> +
> + pos = 0;
> + rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_TYPE,
> + ROCKER_TLV_EVENT_TYPE_MAC_VLAN_SEEN);
> + nest = rocker_tlv_nest_start(buf, &pos, ROCKER_TLV_EVENT_INFO);
> + rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_MAC_VLAN_PPORT, pport);
> + rocker_tlv_put(buf, &pos, ROCKER_TLV_EVENT_MAC_VLAN_MAC, ETH_ALEN, addr);
> + rocker_tlv_put_u16(buf, &pos, ROCKER_TLV_EVENT_MAC_VLAN_VLAN_ID,
> vlan_id);
Undocumented types.
> + rocker_tlv_nest_end(buf, &pos, nest);
> +
> + err = desc_set_buf(info, tlv_size);
> +
> +err_too_big:
> +err_no_mem:
> + if (desc_ring_post_desc(ring, err)) {
> + rocker_msix_irq(r, ROCKER_MSIX_VEC_EVENT);
> + }
> +
> + return err;
> +}
> +
> +static DescRing *rocker_get_rx_ring_by_pport(Rocker *r,
> + uint32_t pport)
> +{
> + return r->rings[(pport - 1) * 2 + 3];
> +}
> +
> +int rx_produce(World *world, uint32_t pport,
> + const struct iovec *iov, int iovcnt)
> +{
> + Rocker *r = world_rocker(world);
> + PCIDevice *dev = (PCIDevice *)r;
> + DescRing *ring = rocker_get_rx_ring_by_pport(r, pport);
> + DescInfo *info = desc_ring_fetch_desc(ring);
> + char *data;
> + size_t data_size = iov_size(iov, iovcnt);
> + char *buf;
> + uint16_t rx_flags = 0;
> + uint16_t rx_csum = 0;
> + size_t tlv_size;
> + RockerTlv *tlvs[ROCKER_TLV_RX_MAX + 1];
> + hwaddr frag_addr;
> + uint16_t frag_max_len;
> + int pos;
> + int err;
> +
> + if (!info) {
> + return -ENOBUFS;
> + }
> +
> + buf = desc_get_buf(info, false);
> + if (!buf) {
> + err = -ENXIO;
> + goto out;
> + }
> + rocker_tlv_parse(tlvs, ROCKER_TLV_RX_MAX, buf, desc_tlv_size(info));
> +
> + if (!tlvs[ROCKER_TLV_RX_FRAG_ADDR] ||
> + !tlvs[ROCKER_TLV_RX_FRAG_MAX_LEN]) {
This conflicts with the spec which only describes RX_PACKET.
> + err = -EINVAL;
> + goto out;
> + }
> +
> + frag_addr = rocker_tlv_get_le64(tlvs[ROCKER_TLV_RX_FRAG_ADDR]);
> + frag_max_len = rocker_tlv_get_le16(tlvs[ROCKER_TLV_RX_FRAG_MAX_LEN]);
> +
> + if (data_size > frag_max_len) {
> + err = -EMSGSIZE;
> + goto out;
> + }
> +
> + /* XXX calc rx flags/csum */
> +
> + tlv_size = rocker_tlv_total_size(sizeof(uint16_t)) + /* flags */
> + rocker_tlv_total_size(sizeof(uint16_t)) + /* scum */
> + rocker_tlv_total_size(sizeof(uint64_t)) + /* frag addr */
> + rocker_tlv_total_size(sizeof(uint16_t)) + /* frag max len */
> + rocker_tlv_total_size(sizeof(uint16_t)); /* frag len */
> +
> + if (tlv_size > desc_buf_size(info)) {
> + err = -EMSGSIZE;
> + goto out;
> + }
> +
> + /* TODO:
> + * iov dma write can be optimized in similar way e1000 does it in
> + * e1000_receive_iov. But maybe if would make sense to introduce
> + * generic helper iov_dma_write.
> + */
> +
> + data = g_malloc(data_size);
> + if (!data) {
> + err = -ENOMEM;
> + goto out;
> + }
> + iov_to_buf(iov, iovcnt, 0, data, data_size);
> + pci_dma_write(dev, frag_addr, data, data_size);
> + g_free(data);
> +
> + pos = 0;
> + rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_FLAGS, rx_flags);
> + rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_CSUM, rx_csum);
Note: Some backend (e.g tap) can do offloading, may consider to add the
support in the future.
> + rocker_tlv_put_le64(buf, &pos, ROCKER_TLV_RX_FRAG_ADDR, frag_addr);
> + rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_FRAG_MAX_LEN, frag_max_len);
The above 2 types were not documented. Please make sure all types were
documented.
> + rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_FRAG_LEN, data_size);
> +
> + err = desc_set_buf(info, tlv_size);
> +
> +out:
> + if (desc_ring_post_desc(ring, err)) {
> + rocker_msix_irq(r, ROCKER_MSIX_VEC_RX(pport - 1));
> + }
> +
> + return err;
> +}
> +
[...]
> +
> +bool fp_port_from_pport(uint32_t pport, uint32_t *port)
> +{
> + if (pport < 1 || pport > ROCKER_FP_PORTS_MAX) {
> + return false;
> + }
> + *port = pport - 1;
> + return true;
> +}
> +
> +int fp_port_eg(FpPort *port, const struct iovec *iov, int iovcnt)
> +{
> + NetClientState *nc = qemu_get_queue(port->nic);
Using only 1 queues is ok for multiqueue backend. In the future may
consider to use all.
> +
> + if (port->enabled) {
> + qemu_sendv_packet(nc, iov, iovcnt);
> + }
> +
> + return 0;
> +}
> +
[...]
Thanks
- [Qemu-devel] [PATCH v3 4/9] pci: add rocker device ID, (continued)
- [Qemu-devel] [PATCH v3 4/9] pci: add rocker device ID, sfeldma, 2015/01/10
- [Qemu-devel] [PATCH v3 8/9] rocker: add tests, sfeldma, 2015/01/10
- [Qemu-devel] [PATCH v3 9/9] MAINTAINERS: add rocker, sfeldma, 2015/01/10
- [Qemu-devel] [PATCH v3 7/9] qmp: add rocker device support, sfeldma, 2015/01/10
- [Qemu-devel] [PATCH v3 6/9] rocker: add new rocker switch device, sfeldma, 2015/01/10
- Re: [Qemu-devel] [PATCH v3 6/9] rocker: add new rocker switch device,
Jason Wang <=