From: Fam Zheng
Subject: [Qemu-devel] [PATCH RFC 4/4] aio-posix: Use epoll in aio_poll
Date: Tue, 30 Jun 2015 21:19:45 +0800

This patch lets aio_poll use the epoll_wait(2) syscall instead of
qemu_poll_ns where possible. It improves the scalability of iothreads
(for example, virtio-scsi-dataplane).
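
As a rough illustration of the difference (a minimal standalone sketch,
not code from this patch; timeout_ns_to_ms and the stdin example are made
up for illustration): poll(2) is handed the whole descriptor array on
every call and the kernel scans it each time, while the epoll interest
list lives in the kernel and epoll_wait(2) only reports descriptors that
became ready. Like the patch, the sketch rounds the nanosecond timeout up
to milliseconds so a small positive timeout does not turn into a
non-blocking call:

    /*
     * Standalone sketch, not from the patch: create an epoll fd, watch
     * stdin for readability, and wait with a millisecond timeout derived
     * from a nanosecond value using the same round-up as the patch.
     */
    #include <stdio.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/epoll.h>

    #define EPOLL_BATCH 128    /* same batch size the patch uses */

    /* Round a nanosecond timeout up to milliseconds; -1 blocks forever. */
    static int timeout_ns_to_ms(int64_t ns)
    {
        if (ns < 0) {
            return -1;
        }
        return (int)((ns + 999999) / 1000000);
    }

    int main(void)
    {
        struct epoll_event ev = { .events = EPOLLIN, .data.fd = STDIN_FILENO };
        struct epoll_event events[EPOLL_BATCH];
        int epollfd = epoll_create1(EPOLL_CLOEXEC);

        if (epollfd < 0 ||
            epoll_ctl(epollfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev) < 0) {
            perror("epoll setup");
            return 1;
        }

        /* Wait at most ~250ms (250000000ns) for stdin to become readable. */
        int n = epoll_wait(epollfd, events, EPOLL_BATCH,
                           timeout_ns_to_ms(250000000));
        printf("%d fd(s) ready\n", n);
        close(epollfd);
        return 0;
    }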

The epollfd is managed together with the GSource and ctx->aio_handlers,
by creating an epoll_event for each watched aio fd and adding it to the
epollfd with epoll_ctl.
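
In isolation, that registration pattern looks roughly like the sketch
below (hypothetical names, MyHandler and update_fd_watch, standing in for
AioHandler and aio_set_fd_handler_pri): the event's data.ptr points back
at the handler node so epoll_wait can return it directly, EPOLL_CTL_ADD
is used the first time an fd is watched, EPOLL_CTL_MOD when its event
mask changes, and EPOLL_CTL_DEL when the handler is removed:

    /*
     * Illustrative sketch only, not QEMU code: per-fd registration with
     * data.ptr pointing at a handler struct, mirroring what the patch
     * does for each AioHandler.
     */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/epoll.h>

    typedef struct MyHandler {
        int fd;
        uint32_t events;    /* EPOLLIN / EPOLLOUT / ... currently watched */
        bool registered;    /* already present in the epoll set? */
    } MyHandler;

    /* Bring the epoll registration for h->fd in line with new_events. */
    static void update_fd_watch(int epollfd, MyHandler *h, uint32_t new_events)
    {
        struct epoll_event event = {
            .events = new_events,
            .data.ptr = h,      /* epoll_wait() hands this pointer back */
        };
        int op, r;

        if (new_events == 0) {
            op = EPOLL_CTL_DEL; /* handler removed: drop the fd */
        } else if (!h->registered) {
            op = EPOLL_CTL_ADD; /* first time this fd is watched */
        } else {
            op = EPOLL_CTL_MOD; /* event mask changed */
        }

        r = epoll_ctl(epollfd, op, h->fd, &event);
        assert(r == 0);

        h->events = new_events;
        h->registered = (new_events != 0);
    }

    int main(void)
    {
        int fds[2];
        MyHandler h = { 0 };
        int epollfd = epoll_create1(EPOLL_CLOEXEC);

        if (epollfd < 0 || pipe(fds) < 0) {
            return 1;
        }
        h.fd = fds[0];
        update_fd_watch(epollfd, &h, EPOLLIN);              /* ADD */
        update_fd_watch(epollfd, &h, EPOLLIN | EPOLLOUT);   /* MOD */
        update_fd_watch(epollfd, &h, 0);                    /* DEL */
        close(fds[0]);
        close(fds[1]);
        close(epollfd);
        return 0;
    }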

The following table is a fio benchmark comparison on a single guest
block device, with different numbers of disks attached to the same SCSI
bus (throughput in MB/s):

=====================================================================
  # of scsi-disks  |        master           |       epoll
                   |   rd     wr    randrw   |   rd    wr    randrw
---------------------------------------------------------------------
        1          |   103    96     49      |   105   99     49
        4          |   92     96     48      |   103   98     49
        8          |   96     94     46      |   101   97     50
        16         |   91     91     45      |   101   95     48
        32         |   84     83     40      |   95    95     48
        64         |   75     73     35      |   91    90     44
        128        |   54     53     26      |   79    80     39
        256        |   41     39     19      |   63    62     30
=====================================================================

Signed-off-by: Fam Zheng <address@hidden>
---
 aio-posix.c         | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 include/block/aio.h |   3 ++
 2 files changed, 117 insertions(+), 4 deletions(-)

diff --git a/aio-posix.c b/aio-posix.c
index 22406ce..111d7fb 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -17,6 +17,9 @@
 #include "block/block.h"
 #include "qemu/queue.h"
 #include "qemu/sockets.h"
+#ifdef CONFIG_EPOLL
+#include <sys/epoll.h>
+#endif
 
 struct AioHandler
 {
@@ -44,6 +47,12 @@ static AioHandler *find_aio_handler(AioContext *ctx, int fd)
 
 void aio_context_setup(AioContext *ctx, Error **errp)
 {
+#ifdef CONFIG_EPOLL
+    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+    if (ctx->epollfd < 0) {
+        error_setg(errp, "Failed to create epoll fd: %s", strerror(errno));
+    }
+#endif
 }
 
 void aio_set_fd_handler_pri(AioContext *ctx,
@@ -54,6 +63,11 @@ void aio_set_fd_handler_pri(AioContext *ctx,
                             void *opaque)
 {
     AioHandler *node;
+#ifdef CONFIG_EPOLL
+    struct epoll_event event;
+    int r;
+    bool add = false;
+#endif
 
     node = find_aio_handler(ctx, fd);
 
@@ -61,6 +75,10 @@ void aio_set_fd_handler_pri(AioContext *ctx,
     if (!io_read && !io_write && !io_read_pri) {
         if (node) {
             g_source_remove_poll(&ctx->source, &node->pfd);
+#ifdef CONFIG_EPOLL
+            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, fd, &event);
+            assert(!r);
+#endif
 
             /* If the lock is held, just mark the node as deleted */
             if (ctx->walking_handlers) {
@@ -83,6 +101,9 @@ void aio_set_fd_handler_pri(AioContext *ctx,
             QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
 
             g_source_add_poll(&ctx->source, &node->pfd);
+#ifdef CONFIG_EPOLL
+            add = true;
+#endif
         }
         /* Update handler with latest information */
         node->io_read = io_read;
@@ -93,6 +114,13 @@ void aio_set_fd_handler_pri(AioContext *ctx,
         node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
         node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
         node->pfd.events |= (io_read_pri ? G_IO_PRI | G_IO_HUP | G_IO_ERR : 0);
+#ifdef CONFIG_EPOLL
+        event.data.ptr = node;
+        event.events = node->pfd.events;
+        r = epoll_ctl(ctx->epollfd, add ? EPOLL_CTL_ADD : EPOLL_CTL_MOD,
+                      fd, &event);
+        assert(!r);
+#endif
     }
 
     aio_notify(ctx);
@@ -198,7 +226,80 @@ bool aio_dispatch(AioContext *ctx)
     return progress;
 }
 
-/* These thread-local variables are used only in a small part of aio_poll
+#ifdef CONFIG_EPOLL
+QEMU_BUILD_BUG_ON((int)G_IO_IN != EPOLLIN);
+QEMU_BUILD_BUG_ON((int)G_IO_OUT != EPOLLOUT);
+QEMU_BUILD_BUG_ON((int)G_IO_PRI != EPOLLPRI);
+QEMU_BUILD_BUG_ON((int)G_IO_ERR != EPOLLERR);
+QEMU_BUILD_BUG_ON((int)G_IO_HUP != EPOLLHUP);
+
+#define EPOLL_BATCH 128
+static bool aio_poll_epoll(AioContext *ctx, bool blocking)
+{
+    AioHandler *node;
+    bool was_dispatching;
+    int i, ret;
+    bool progress;
+    int64_t timeout;
+    struct epoll_event events[EPOLL_BATCH];
+
+    aio_context_acquire(ctx);
+    was_dispatching = ctx->dispatching;
+    progress = false;
+
+    /* aio_notify can avoid the expensive event_notifier_set if
+     * everything (file descriptors, bottom halves, timers) will
+     * be re-evaluated before the next blocking poll().  This is
+     * already true when aio_poll is called with blocking == false;
+     * if blocking == true, it is only true after poll() returns.
+     *
+     * If we're in a nested event loop, ctx->dispatching might be true.
+     * In that case we can restore it just before returning, but we
+     * have to clear it now.
+     */
+    aio_set_dispatching(ctx, !blocking);
+
+    ctx->walking_handlers++;
+
+    timeout = blocking ? aio_compute_timeout(ctx) : 0;
+
+    if (timeout > 0) {
+        timeout = DIV_ROUND_UP(timeout, 1000000);
+    }
+
+    /* wait until next event */
+    if (timeout) {
+        aio_context_release(ctx);
+    }
+    ret = epoll_wait(ctx->epollfd, events, EPOLL_BATCH, timeout);
+    if (timeout) {
+        aio_context_acquire(ctx);
+    }
+
+    /* if we have any readable fds, dispatch event */
+    if (ret > 0) {
+        for (i = 0; i < ret; i++) {
+            node = events[i].data.ptr;
+            node->pfd.revents = events[i].events;
+        }
+    }
+
+    ctx->walking_handlers--;
+
+    /* Run dispatch even if there were no readable fds to run timers */
+    aio_set_dispatching(ctx, true);
+    if (aio_dispatch(ctx)) {
+        progress = true;
+    }
+
+    aio_set_dispatching(ctx, was_dispatching);
+    aio_context_release(ctx);
+
+    return progress;
+}
+#else
+
+/* These thread-local variables are used only in a small part of aio_poll_posix
  * around the call to the poll() system call.  In particular they are not
  * used while aio_poll is performing callbacks, which makes it much easier
  * to think about reentrancy!
@@ -212,7 +313,6 @@ bool aio_dispatch(AioContext *ctx)
 static __thread GPollFD *pollfds;
 static __thread AioHandler **nodes;
 static __thread unsigned npfd, nalloc;
-static __thread Notifier pollfds_cleanup_notifier;
 
 static void pollfds_cleanup(Notifier *n, void *unused)
 {
@@ -221,7 +321,7 @@ static void pollfds_cleanup(Notifier *n, void *unused)
     g_free(nodes);
     nalloc = 0;
 }
-
+static __thread Notifier pollfds_cleanup_notifier;
 static void add_pollfd(AioHandler *node)
 {
     if (npfd == nalloc) {
@@ -244,7 +344,7 @@ static void add_pollfd(AioHandler *node)
     npfd++;
 }
 
-bool aio_poll(AioContext *ctx, bool blocking)
+bool aio_poll_posix(AioContext *ctx, bool blocking)
 {
     AioHandler *node;
     bool was_dispatching;
@@ -311,3 +411,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
     return progress;
 }
+#endif
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+#ifdef CONFIG_EPOLL
+    return aio_poll_epoll(ctx, blocking);
+#else
+    return aio_poll_posix(ctx, blocking);
+#endif
+}
diff --git a/include/block/aio.h b/include/block/aio.h
index 5120583..9178ff2 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -87,6 +87,9 @@ struct AioContext {
 
     /* TimerLists for calling timers - one per clock type */
     QEMUTimerListGroup tlg;
+
+    /* epoll fd */
+    int epollfd;
 };
 
 /* Used internally to synchronize aio_poll against qemu_bh_schedule.  */
-- 
2.4.3



