[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
From: Sridhar Samudrala
Subject: [Qemu-devel] Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
Date: Fri, 29 Jan 2010 12:52:56 -0800
On Wed, 2010-01-27 at 14:56 -0800, Sridhar Samudrala wrote:
> On Wed, 2010-01-27 at 22:39 +0100, Arnd Bergmann wrote:
> > On Wednesday 27 January 2010, Anthony Liguori wrote:
> > > >> I think -net socket,fd should just be (trivially) extended to work
> > > >> with raw sockets out of the box, with no support for opening it.
> > > >> Then you can have libvirt or some wrapper open a raw socket and a
> > > >> private namespace and just pass it down.
> > > >>
> > > > That'd work. Anthony?
> > >
> > > The fundamental problem that I have with all of this is that we should
> > > not be introducing new network backends that are based around something
> > > only a developer is going to understand. If I'm a user and I want to
> > > use an external switch in VEPA mode, how in the world am I going to know
> > > that I'm supposed to use the -net raw backend or the -net socket
> > > backend? It might as well be the -net butterflies backend as far as a
> > > user is concerned.
> >
> > My point is that we already have -net socket,fd and any user that passes
> > an fd into that already knows what he wants to do with it. Making it
> > work with raw sockets is just a natural extension to this, which works
> > on all kernels and (with separate namespaces) is reasonably secure.
>
> Didn't realize that -net socket is already there and supports TCP and
> UDP sockets. I will look into extending -net socket to support AF_PACKET
> SOCK_RAW type sockets.
OK. Here is a patch that adds AF_PACKET-SOCK_RAW support to the -netdev socket
backend. It allows specifying either an already-opened raw fd or an ifname to
which a raw socket can be bound:
-netdev socket,fd=X,id=str
-netdev socket,ifname=<ethX/macvlanX>,id=str
However, I found that struct NetSocketState doesn't include all the state info
that is required to support AF_PACKET raw sockets, so I had to add
NetSocketRawState and also couldn't reuse much of the existing code.
I think the -net socket backend is more geared towards AF_INET sockets. Adding
support for a new socket family doesn't fit nicely with the existing code.
But if this approach is more acceptable than a new -net raw,fd backend, I am
fine with it.
Thanks
Sridhar
diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index eba578a..7d62dd9 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -15,6 +15,7 @@
#include "net.h"
#include "net/checksum.h"
#include "net/tap.h"
+#include "net/socket.h"
#include "qemu-timer.h"
#include "virtio-net.h"
@@ -133,6 +134,9 @@ static int peer_has_vnet_hdr(VirtIONet *n)
case NET_CLIENT_TYPE_TAP:
n->has_vnet_hdr = tap_has_vnet_hdr(n->nic->nc.peer);
break;
+ case NET_CLIENT_TYPE_SOCKET_RAW:
+ n->has_vnet_hdr = sock_raw_has_vnet_hdr(n->nic->nc.peer);
+ break;
default:
return 0;
}
@@ -149,6 +153,9 @@ static int peer_has_ufo(VirtIONet *n)
case NET_CLIENT_TYPE_TAP:
n->has_ufo = tap_has_ufo(n->nic->nc.peer);
break;
+ case NET_CLIENT_TYPE_SOCKET_RAW:
+ n->has_ufo = sock_raw_has_ufo(n->nic->nc.peer);
+ break;
default:
return 0;
}
@@ -165,6 +172,9 @@ static void peer_using_vnet_hdr(VirtIONet *n, int
using_vnet_hdr)
case NET_CLIENT_TYPE_TAP:
tap_using_vnet_hdr(n->nic->nc.peer, using_vnet_hdr);
break;
+ case NET_CLIENT_TYPE_SOCKET_RAW:
+ sock_raw_using_vnet_hdr(n->nic->nc.peer, using_vnet_hdr);
+ break;
default:
break;
}
@@ -180,6 +190,9 @@ static void peer_set_offload(VirtIONet *n, int csum, int
tso4, int tso6,
case NET_CLIENT_TYPE_TAP:
tap_set_offload(n->nic->nc.peer, csum, tso4, tso6, ecn, ufo);
break;
+ case NET_CLIENT_TYPE_SOCKET_RAW:
+ sock_raw_set_offload(n->nic->nc.peer, csum, tso4, tso6, ecn, ufo);
+ break;
default:
break;
}
diff --git a/net.c b/net.c
index 6ef93e6..3d25d64 100644
--- a/net.c
+++ b/net.c
@@ -1002,6 +1002,11 @@ static struct {
.type = QEMU_OPT_STRING,
.help = "UDP multicast address and port number",
},
+ {
+ .name = "ifname",
+ .type = QEMU_OPT_STRING,
+ .help = "interface name",
+ },
{ /* end of list */ }
},
#ifdef CONFIG_VDE
diff --git a/net.h b/net.h
index 116bb80..74b3e69 100644
--- a/net.h
+++ b/net.h
@@ -34,7 +34,8 @@ typedef enum {
NET_CLIENT_TYPE_TAP,
NET_CLIENT_TYPE_SOCKET,
NET_CLIENT_TYPE_VDE,
- NET_CLIENT_TYPE_DUMP
+ NET_CLIENT_TYPE_DUMP,
+ NET_CLIENT_TYPE_SOCKET_RAW,
} net_client_type;
typedef void (NetPoll)(VLANClientState *, bool enable);
diff --git a/net/socket.c b/net/socket.c
index 5533737..56f5bad 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -32,6 +32,327 @@
#include "qemu_socket.h"
#include "sysemu.h"
+#include <netpacket/packet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+
+/* Maximum GSO packet size (64k) plus plenty of room for
+ * the ethernet and virtio_net headers
+ */
+#define RAW_BUFSIZE (4096 + 65536)
+
+typedef struct NetSocketRawState {
+ VLANClientState nc;
+ int fd;
+ uint8_t buf[RAW_BUFSIZE];
+ int promisc;
+ unsigned int read_poll:1;
+ unsigned int write_poll:1;
+ unsigned int has_vnet_hdr:1;
+ unsigned int using_vnet_hdr:1;
+ unsigned int has_ufo:1;
+} NetSocketRawState;
+
+struct virtio_net_hdr
+{
+ uint8_t flags;
+ uint8_t gso_type;
+ uint16_t hdr_len;
+ uint16_t gso_size;
+ uint16_t csum_start;
+ uint16_t csum_offset;
+};
+
+static int sock_raw_can_send(void *opaque);
+static void sock_raw_send(void *opaque);
+static void sock_raw_writable(void *opaque);
+
+static void sock_raw_update_fd_handler(NetSocketRawState *s)
+{
+ qemu_set_fd_handler2(s->fd,
+ s->read_poll ? sock_raw_can_send : NULL,
+ s->read_poll ? sock_raw_send : NULL,
+ s->write_poll ? sock_raw_writable : NULL,
+ s);
+}
+
+static void sock_raw_read_poll(NetSocketRawState *s, int enable)
+{
+ s->read_poll = !!enable;
+ sock_raw_update_fd_handler(s);
+}
+
+static void sock_raw_write_poll(NetSocketRawState *s, int enable)
+{
+ s->write_poll = !!enable;
+ sock_raw_update_fd_handler(s);
+}
+
+static void sock_raw_writable(void *opaque)
+{
+ NetSocketRawState *s = opaque;
+
+ sock_raw_write_poll(s, 0);
+ qemu_flush_queued_packets(&s->nc);
+}
+
+static ssize_t sock_raw_write_packet(NetSocketRawState *s,
+ const struct iovec *iov,
+ int iovcnt)
+{
+ ssize_t len;
+
+ do {
+ len = writev(s->fd, iov, iovcnt);
+ } while (len == -1 && errno == EINTR);
+
+ if (len == -1 && errno == EAGAIN) {
+ sock_raw_write_poll(s, 1);
+ return 0;
+ }
+
+ if (len == -1)
+ printf("raw_write_packet: errno:%d\n", errno);
+
+ return len;
+}
+
+static ssize_t sock_raw_receive_iov(VLANClientState *nc,
+ const struct iovec *iov,
+ int iovcnt)
+{
+ NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+ const struct iovec *iovp = iov;
+ struct iovec iov_copy[iovcnt + 1];
+ struct virtio_net_hdr hdr = { 0, };
+
+ if (s->has_vnet_hdr && !s->using_vnet_hdr) {
+ iov_copy[0].iov_base = &hdr;
+ iov_copy[0].iov_len = sizeof(hdr);
+ memcpy(&iov_copy[1], iov, iovcnt * sizeof(*iov));
+ iovp = iov_copy;
+ iovcnt++;
+ }
+
+ return sock_raw_write_packet(s, iovp, iovcnt);
+}
+
+static ssize_t sock_raw_receive_raw(VLANClientState *nc, const uint8_t *buf,
+ size_t size)
+{
+ NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+ struct iovec iov[2];
+ int iovcnt = 0;
+ struct virtio_net_hdr hdr = { 0, };
+
+ if (s->has_vnet_hdr) {
+ iov[iovcnt].iov_base = &hdr;
+ iov[iovcnt].iov_len = sizeof(hdr);
+ iovcnt++;
+ }
+
+ iov[iovcnt].iov_base = (char *)buf;
+ iov[iovcnt].iov_len = size;
+ iovcnt++;
+
+ return sock_raw_write_packet(s, iov, iovcnt);
+}
+
+static ssize_t sock_raw_receive(VLANClientState *nc, const uint8_t *buf,
+ size_t size)
+{
+ NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+ struct iovec iov[1];
+
+ if (s->has_vnet_hdr && !s->using_vnet_hdr)
+ return sock_raw_receive_raw(nc, buf, size);
+
+ iov[0].iov_base = (char *)buf;
+ iov[0].iov_len = size;
+
+ return sock_raw_write_packet(s, iov, 1);
+}
+
+static int sock_raw_can_send(void *opaque)
+{
+ NetSocketRawState *s = opaque;
+
+ return qemu_can_send_packet(&s->nc);
+}
+
+ssize_t sock_raw_read_packet(int fd, uint8_t *buf, int maxlen, int flags)
+{
+ int ret;
+
+ ret = recv(fd, buf, maxlen, flags);
+ return ret;
+}
+
+static void sock_raw_send_completed(VLANClientState *nc, ssize_t len)
+{
+ NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+ sock_raw_read_poll(s, 1);
+}
+
+static void sock_raw_send(void *opaque)
+{
+ NetSocketRawState *s = opaque;
+ int size;
+
+ do {
+ uint8_t *buf = s->buf;
+
+ size = sock_raw_read_packet(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC);
+ if (size <= 0)
+ break;
+
+ if (s->has_vnet_hdr && !s->using_vnet_hdr) {
+ buf += sizeof(struct virtio_net_hdr);
+ size -= sizeof(struct virtio_net_hdr);
+ }
+
+ size = qemu_send_packet_async(&s->nc, buf, size,
+ sock_raw_send_completed);
+ if (size == 0)
+ sock_raw_read_poll(s, 0);
+
+ } while (size > 0 && qemu_can_send_packet(&s->nc));
+}
+
+int sock_raw_has_ufo(VLANClientState *nc)
+{
+ NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+ assert(nc->info->type == NET_CLIENT_TYPE_SOCKET_RAW);
+
+ return s->has_ufo;
+}
+
+int sock_raw_has_vnet_hdr(VLANClientState *nc)
+{
+ NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+ assert(nc->info->type == NET_CLIENT_TYPE_SOCKET_RAW);
+
+ return s->has_vnet_hdr;
+}
+
+void sock_raw_using_vnet_hdr(VLANClientState *nc, int using_vnet_hdr)
+{
+ NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+ using_vnet_hdr = using_vnet_hdr != 0;
+
+ assert(nc->info->type == NET_CLIENT_TYPE_SOCKET_RAW);
+ assert(s->has_vnet_hdr == using_vnet_hdr);
+
+ s->using_vnet_hdr = using_vnet_hdr;
+}
+
+void sock_raw_set_offload(VLANClientState *nc, int csum, int tso4,
+ int tso6, int ecn, int ufo)
+{
+ return;
+}
+
+static void sock_raw_cleanup(VLANClientState *nc)
+{
+ NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+ qemu_purge_queued_packets(nc);
+
+ sock_raw_read_poll(s, 0);
+ sock_raw_write_poll(s, 0);
+ close(s->fd);
+}
+
+int sock_raw_probe_vnet_hdr(int fd)
+{
+ int val, len;
+
+ len = sizeof(val);
+ if (getsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &val, (socklen_t *)&len)
+ == 0)
+ return 1;
+
+ return 0;
+}
+
+static NetClientInfo net_raw_info = {
+ .type = NET_CLIENT_TYPE_SOCKET_RAW,
+ .size = sizeof(NetSocketRawState),
+ .receive = sock_raw_receive,
+ .receive_raw = NULL,
+ .receive_iov = sock_raw_receive_iov,
+ .cleanup = sock_raw_cleanup,
+};
+
+
+static NetSocketRawState *net_socket_fd_init_raw(VLANState *vlan,
+ const char *model,
+ const char *name, int fd)
+{
+ VLANClientState *nc;
+ NetSocketRawState *s;
+
+ nc = qemu_new_net_client(&net_raw_info, vlan, NULL, model, name);
+
+ s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+ s->fd = fd;
+ s->has_vnet_hdr = sock_raw_probe_vnet_hdr(fd);
+ s->using_vnet_hdr = 0;
+ s->has_ufo = 1;
+ sock_raw_read_poll(s, 1);
+
+ return s;
+}
+
+static int net_socket_raw_ifname_init(VLANState *vlan, const char *model,
+ const char *name, const char *ifname)
+{
+ struct ifreq req;
+ int fd, ret;
+ struct sockaddr_ll lladdr;
+ int val;
+
+ fd = qemu_socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+ if (fd < 0)
+ fprintf(stderr, "packet socket failed\n");
+
+ memset(&req, 0, sizeof(req));
+ strncpy(req.ifr_name, ifname, IFNAMSIZ-1);
+ ret = ioctl(fd, SIOCGIFINDEX, &req);
+ if (ret < 0)
+ fprintf(stderr, "SIOCGIFINDEX failed\n");
+
+ memset(&lladdr, 0, sizeof(lladdr));
+ lladdr.sll_family = AF_PACKET;
+ lladdr.sll_protocol = htons(ETH_P_ALL);
+ lladdr.sll_ifindex = req.ifr_ifindex;
+ ret = bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr));
+ if (ret < 0)
+ fprintf(stderr, "bind failed\n");
+
+ val = 1;
+ ret=setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, (const char *)&val,
+ sizeof(val));
+ if (ret < 0) {
+ fprintf(stderr, "setsockopt(SOL_PACKET, PACKET_VNET_HDR) failed\n");
+ }
+
+ ret = fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK);
+ if (ret < 0)
+ fprintf(stderr, "fcntl(O_NONBLOCK) set failed\n");
+
+ net_socket_fd_init_raw(vlan, model, name, fd);
+
+ return 0;
+}
+
+
typedef struct NetSocketState {
VLANClientState nc;
int fd;
@@ -337,6 +658,8 @@ static NetSocketState *net_socket_fd_init(VLANState *vlan,
return net_socket_fd_init_dgram(vlan, model, name, fd, is_connected);
case SOCK_STREAM:
return net_socket_fd_init_stream(vlan, model, name, fd, is_connected);
+ case SOCK_RAW:
+ return (struct NetSocketState *)net_socket_fd_init_raw(vlan, model,
name, fd);
default:
/* who knows ... this could be a eg. a pty, do warn and continue as
stream */
fprintf(stderr, "qemu: warning: socket type=%d for fd=%d is not
SOCK_DGRAM or SOCK_STREAM\n", so_type, fd);
@@ -519,6 +842,22 @@ int net_init_socket(QemuOpts *opts,
close(fd);
return -1;
}
+ } else if (qemu_opt_get(opts, "ifname")) {
+ const char *ifname;
+
+ if (qemu_opt_get(opts, "fd") ||
+ qemu_opt_get(opts, "connect") ||
+ qemu_opt_get(opts, "listen") ||
+ qemu_opt_get(opts, "mcast")) {
+ qemu_error("fd=, connect= and mcast= and listen= is invalid with
ifname=\n");
+ return -1;
+ }
+
+ ifname = qemu_opt_get(opts, "ifname");
+
+ if (net_socket_raw_ifname_init(vlan, "socket", name, ifname) == -1) {
+ return -1;
+ }
} else if (qemu_opt_get(opts, "listen")) {
const char *listen;
diff --git a/net/socket.h b/net/socket.h
index ea46f02..cc09866 100644
--- a/net/socket.h
+++ b/net/socket.h
@@ -30,4 +30,13 @@
int net_init_socket(QemuOpts *opts, Monitor *mon,
const char *name, VLANState *vlan);
+#define PACKET_VNET_HDR 15
+
+ssize_t sock_raw_read_packet(int fd, uint8_t *buf, int maxlen, int flags);
+int sock_raw_has_ufo(VLANClientState *vc);
+int sock_raw_has_vnet_hdr(VLANClientState *vc);
+void sock_raw_using_vnet_hdr(VLANClientState *vc, int using_vnet_hdr);
+int sock_raw_probe_vnet_hdr(int fd);
+void sock_raw_set_offload(VLANClientState *vc, int csum, int tso4, int tso6,
int ecn, int ufo);
+
#endif /* QEMU_NET_SOCKET_H */