From: Fam Zheng
Subject: [Qemu-block] [PATCH v3 2/2] aio: Introduce aio-epoll.c
Date: Mon, 26 Oct 2015 14:39:42 +0800

To minimize code duplication, epoll is hooked into aio-posix's
aio_poll() instead of rolling its own event loop. This approach also
keeps the mechanism switchable at both compile time and run time:

1) When QEMU starts with a small number of fds in the event loop, ppoll
is used.

2) When QEMU starts with a large number of fds, or when more devices are
hot-plugged, epoll kicks in once the number of fds reaches the threshold
(a simplified sketch of this dispatch follows the list).

3) Some fds may not support epoll, such as tty-based stdio. In that
case, the event loop falls back to ppoll.
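
For illustration, here is a self-contained C sketch of that dispatch
policy. The Ctx struct and the try_enable()/use_epoll() helpers are
hypothetical stand-ins for the AioContext fields and for the
aio_epoll_try_enable()/aio_epoll_check_poll() functions in the patch
below, not the actual code:

    #include <stdbool.h>
    #include <stdio.h>

    #define EPOLL_ENABLE_THRESHOLD 64

    typedef struct {
        bool epoll_available;  /* epoll instance could be created */
        bool epoll_enabled;    /* all fds registered successfully */
    } Ctx;

    /* Stand-in for aio_epoll_try_enable(): registration fails if any
     * fd does not support epoll (e.g. tty-based stdio). */
    static bool try_enable(Ctx *ctx, bool all_fds_support_epoll)
    {
        ctx->epoll_enabled = all_fds_support_epoll;
        return ctx->epoll_enabled;
    }

    /* Stand-in for aio_epoll_check_poll(): decide which syscall this
     * event loop iteration should use. */
    static bool use_epoll(Ctx *ctx, unsigned nfds, bool all_ok)
    {
        if (!ctx->epoll_available) {
            return false;                 /* setup failed: always ppoll */
        }
        if (ctx->epoll_enabled) {
            return true;                  /* already switched to epoll */
        }
        if (nfds >= EPOLL_ENABLE_THRESHOLD) {
            if (try_enable(ctx, all_ok)) {
                return true;              /* threshold reached: epoll */
            }
            ctx->epoll_available = false; /* permanent ppoll fallback */
        }
        return false;
    }

    int main(void)
    {
        Ctx ctx = { .epoll_available = true, .epoll_enabled = false };
        printf("8 fds  -> %s\n", use_epoll(&ctx, 8, true) ? "epoll" : "ppoll");
        printf("64 fds -> %s\n", use_epoll(&ctx, 64, true) ? "epoll" : "ppoll");
        return 0;
    }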

A rough benchmark of scsi-disk on virtio-scsi dataplane (epoll gets
enabled from 64 disks onward). Numbers are in MB/s.

===============================================
             |     master     |     epoll
             |                |
scsi disks # | read    randrw | read    randrw
-------------|----------------|----------------
1            | 74      36     | 75      37
8            | 74      36     | 73      37
32           | 65      32     | 63      32
64           | 52      25     | 66      32
128          | 42      21     | 54      27
256          | 26      15     | 38      19
===============================================

Signed-off-by: Fam Zheng <address@hidden>
---
 aio-posix.c         | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/block/aio.h |   6 ++
 2 files changed, 174 insertions(+), 2 deletions(-)

diff --git a/aio-posix.c b/aio-posix.c
index 597cb35..f259ef9 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -17,6 +17,9 @@
 #include "block/block.h"
 #include "qemu/queue.h"
 #include "qemu/sockets.h"
+#ifdef CONFIG_EPOLL
+#include <sys/epoll.h>
+#endif
 
 struct AioHandler
 {
@@ -28,6 +31,147 @@ struct AioHandler
     QLIST_ENTRY(AioHandler) node;
 };
 
+#ifdef CONFIG_EPOLL
+
+/* The fd count threshold at which we switch to epoll */
+#define EPOLL_ENABLE_THRESHOLD 64
+
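+/* Stop using epoll for this context and mark it permanently
+ * unavailable, so aio_poll() falls back to ppoll from now on. */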
+static void aio_epoll_disable(AioContext *ctx)
+{
+    ctx->epoll_available = false;
+    if (!ctx->epoll_enabled) {
+        return;
+    }
+    ctx->epoll_enabled = false;
+    close(ctx->epollfd);
+}
+
+static inline int epoll_events_from_pfd(int pfd_events)
+{
+    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
+           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
+           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
+           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
+}
+
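+/* Register every active handler with the epoll instance. Returns false
+ * if epoll_ctl() rejects any fd; the caller then disables epoll. */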
+static bool aio_epoll_try_enable(AioContext *ctx)
+{
+    AioHandler *node;
+    struct epoll_event event;
+
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        int r;
+        if (node->deleted || !node->pfd.events) {
+            continue;
+        }
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        event.data.ptr = node;
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+        if (r) {
+            return false;
+        }
+    }
+    ctx->epoll_enabled = true;
+    return true;
+}
+
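+/* Mirror a handler change into the epoll set: add new fds, modify
+ * existing ones, and delete fds whose event mask dropped to zero. A
+ * failed EPOLL_CTL_ADD (e.g. for a tty-based fd) disables epoll so
+ * that aio_poll() falls back to ppoll. */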
+static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+    struct epoll_event event;
+    int r;
+
+    if (!ctx->epoll_enabled) {
+        return;
+    }
+    if (!node->pfd.events) {
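+        /* event is ignored by EPOLL_CTL_DEL, but kernels before 2.6.9
+         * required a non-NULL pointer */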
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, node->pfd.fd, &event);
+        assert(!r);
+    } else {
+        event.data.ptr = node;
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        if (is_new) {
+            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+            if (r) {
+                aio_epoll_disable(ctx);
+            }
+        } else {
+            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, node->pfd.fd, &event);
+            assert(!r);
+        }
+    }
+}
+
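+/* Wait for events using epoll. The caller passes exactly one GPollFD,
+ * the epollfd itself: a positive timeout is handled by qemu_poll_ns()
+ * for nanosecond precision, after which epoll_wait() merely collects
+ * the pending events; the results are translated back into
+ * node->pfd.revents for the dispatch code in aio_poll(). */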
+static int aio_epoll(AioContext *ctx, GPollFD *pfds,
+                     unsigned npfd, int64_t timeout)
+{
+    AioHandler *node;
+    int i, ret = 0;
+    struct epoll_event events[128];
+
+    assert(npfd == 1);
+    assert(pfds[0].fd == ctx->epollfd);
+    if (timeout > 0) {
+        ret = qemu_poll_ns(pfds, npfd, timeout);
+    }
+    if (timeout <= 0 || ret > 0) {
+        ret = epoll_wait(ctx->epollfd, events,
+                         sizeof(events) / sizeof(events[0]),
+                         timeout);
+        if (ret <= 0) {
+            goto out;
+        }
+        for (i = 0; i < ret; i++) {
+            int ev = events[i].events;
+            node = events[i].data.ptr;
+            node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
+                (ev & EPOLLOUT ? G_IO_OUT : 0) |
+                (ev & EPOLLHUP ? G_IO_HUP : 0) |
+                (ev & EPOLLERR ? G_IO_ERR : 0);
+        }
+    }
+out:
+    return ret;
+}
+
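+/* Decide whether this aio_poll() iteration should use epoll. Returns
+ * true if epoll is already enabled, or once the fd count reaches
+ * EPOLL_ENABLE_THRESHOLD and all handlers register successfully; a
+ * registration failure disables epoll permanently. */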
+static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
+                                 unsigned npfd, int64_t timeout)
+{
+    if (!ctx->epoll_available) {
+        return false;
+    }
+    if (ctx->epoll_enabled) {
+        return true;
+    }
+    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
+        if (aio_epoll_try_enable(ctx)) {
+            return true;
+        } else {
+            aio_epoll_disable(ctx);
+        }
+    }
+    return false;
+}
+
+#else
+
+static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+}
+
+static int aio_epoll(AioContext *ctx, GPollFD *pfds,
+                     unsigned npfd, int64_t timeout)
+{
+    assert(false);
+    return -1; /* not reached; avoids a missing-return warning */
+}
+
+static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
+                                 unsigned npfd, int64_t timeout)
+{
+    return false;
+}
+
+#endif
+
 static AioHandler *find_aio_handler(AioContext *ctx, int fd)
 {
     AioHandler *node;
@@ -48,6 +192,7 @@ void aio_set_fd_handler(AioContext *ctx,
                         void *opaque)
 {
     AioHandler *node;
+    bool is_new = false;
 
     node = find_aio_handler(ctx, fd);
 
@@ -77,6 +222,7 @@ void aio_set_fd_handler(AioContext *ctx,
             QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
 
             g_source_add_poll(&ctx->source, &node->pfd);
+            is_new = true;
         }
         /* Update handler with latest information */
         node->io_read = io_read;
@@ -87,6 +233,7 @@ void aio_set_fd_handler(AioContext *ctx,
         node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
     }
 
+    aio_epoll_update(ctx, node, is_new);
     aio_notify(ctx);
 }
 
@@ -257,7 +404,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
     /* fill pollfds */
     QLIST_FOREACH(node, &ctx->aio_handlers, node) {
-        if (!node->deleted && node->pfd.events) {
+        if (!node->deleted && node->pfd.events && !ctx->epoll_enabled) {
             add_pollfd(node);
         }
     }
@@ -268,7 +415,17 @@ bool aio_poll(AioContext *ctx, bool blocking)
     if (timeout) {
         aio_context_release(ctx);
     }
-    ret = qemu_poll_ns((GPollFD *)pollfds, npfd, timeout);
+    if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
+        AioHandler epoll_handler;
+
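+        /* Replace the pollfd array with a single entry for the
+         * epollfd; aio_epoll() does the actual waiting. */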
+        epoll_handler.pfd.fd = ctx->epollfd;
+        epoll_handler.pfd.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR;
+        npfd = 0;
+        add_pollfd(&epoll_handler);
+        ret = aio_epoll(ctx, pollfds, npfd, timeout);
+    } else {
+        ret = qemu_poll_ns(pollfds, npfd, timeout);
+    }
     if (blocking) {
         atomic_sub(&ctx->notify_me, 2);
     }
@@ -300,4 +457,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
 void aio_context_setup(AioContext *ctx, Error **errp)
 {
+#ifdef CONFIG_EPOLL
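+    /* epoll_create1() can fail, e.g. on kernels without epoll_create1
+     * support; epoll is then unavailable and ppoll is always used. */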
+    assert(!ctx->epollfd);
+    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+    if (ctx->epollfd == -1) {
+        ctx->epoll_available = false;
+    } else {
+        ctx->epoll_available = true;
+    }
+#endif
 }
diff --git a/include/block/aio.h b/include/block/aio.h
index a6fd5ad..45137e7 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -122,6 +122,12 @@ struct AioContext {
 
     /* TimerLists for calling timers - one per clock type */
     QEMUTimerListGroup tlg;
+
+#ifdef CONFIG_EPOLL
+    int epollfd;
+    bool epoll_enabled;
+    bool epoll_available;
+#endif
 };
 
 /**
-- 
2.4.3