qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [RFC PATCH V2 4/4] virtio-blk: introduce multiread


From: Peter Lieven
Subject: [Qemu-devel] [RFC PATCH V2 4/4] virtio-blk: introduce multiread
Date: Fri, 5 Dec 2014 12:50:29 +0100

this patch finally introduce multiread support to virtio-blk while
multiwrite support was there for a long time read support was missing.

To achieve this the patch does serveral things which might need futher
explaination:

 - the whole merge and multireq logic is moved from block.c into
   virtio-blk. This is move is a preparation for directly creating a
   coroutine out of virtio-blk.

 - requests are only merged if they are strictly sequential and no
   longer sorted. This simplification decreases overhead and reduces
   latency. It will also merge some requests which were unmergable before.

   The old algorithm took up to 32 requests sorted them and tried to merge
   them. The outcome was anything between 1 and 32 requests. In case of
   32 requests there were 31 requests unnecessarily delayed.

   On the other hand lets imagine e.g. 16 unmergeable requests followed
   by 32 mergable requests. The latter 32 requests would have been split
   into two 16 byte requests.

   Last the simplified logic allows for a fast path if we have only a
   single request in the multirequest. In this case the request is sent as
   ordinary request without mulltireq callbacks.

As a first benchmark I installed Ubuntu 14.04.1 on a ramdisk. The number of
merged requests is in the same order while the latency is slightly decreased.
One should not stick too much to the numbers because the number of wr_requests
are highly fluctuant. I hope the numbers show that this patch is at least
not causing too big harm:

Cmdline:
qemu-system-x86_64 -m 1024 -smp 2 -enable-kvm -cdrom 
ubuntu-14.04.1-server-amd64.iso \
 -drive if=virtio,file=/tmp/ubuntu.raw -monitor stdio

Before:
virtio0: rd_bytes=150979072 wr_bytes=2591138816 rd_operations=18475 
wr_operations=69216
 flush_operations=15343 wr_total_time_ns=26969283701 rd_total_time_ns=1018449432
 flush_total_time_ns=562596819 rd_merged=0 wr_merged=10453

After:
virtio0: rd_bytes=148112896 wr_bytes=2845834240 rd_operations=17535 
wr_operations=72197
 flush_operations=15760 wr_total_time_ns=26104971623 rd_total_time_ns=1012926283
 flush_total_time_ns=564414752 rd_merged=517 wr_merged=9859

Some first numbers of improved read performance while booting:

The Ubuntu 14.04.1 vServer from above:
virtio0: rd_bytes=86158336 wr_bytes=688128 rd_operations=5851 wr_operations=70
 flush_operations=4 wr_total_time_ns=10886705 rd_total_time_ns=416688595
 flush_total_time_ns=288776 rd_merged=1297 wr_merged=2

Windows 2012R2:
virtio0: rd_bytes=176559104 wr_bytes=61859840 rd_operations=7200 
wr_operations=360
 flush_operations=68 wr_total_time_ns=34344992718 rd_total_time_ns=134386844669
 flush_total_time_ns=18115517 rd_merged=641 wr_merged=216

Signed-off-by: Peter Lieven <address@hidden>
---
 hw/block/dataplane/virtio-blk.c |   10 +-
 hw/block/virtio-blk.c           |  219 ++++++++++++++++++++++-----------------
 include/hw/virtio/virtio-blk.h  |   25 +++--
 trace-events                    |    1 +
 4 files changed, 146 insertions(+), 109 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 1222a37..aa4ad91 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -96,9 +96,8 @@ static void handle_notify(EventNotifier *e)
     event_notifier_test_and_clear(&s->host_notifier);
     blk_io_plug(s->conf->conf.blk);
     for (;;) {
-        MultiReqBuffer mrb = {
-            .num_writes = 0,
-        };
+        MultiReqBuffer mrb_rd = {};
+        MultiReqBuffer mrb_wr = {.is_write = 1};
         int ret;
 
         /* Disable guest->host notifies to avoid unnecessary vmexits */
@@ -117,10 +116,11 @@ static void handle_notify(EventNotifier *e)
                                                         req->elem.in_num,
                                                         req->elem.index);
 
-            virtio_blk_handle_request(req, &mrb);
+            virtio_blk_handle_request(req, &mrb_wr, &mrb_rd);
         }
 
-        virtio_submit_multiwrite(s->conf->conf.blk, &mrb);
+        virtio_submit_multireq(s->conf->conf.blk, &mrb_wr);
+        virtio_submit_multireq(s->conf->conf.blk, &mrb_rd);
 
         if (likely(ret == -EAGAIN)) { /* vring emptied */
             /* Re-enable guest->host notifies and stop processing the vring.
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 490f961..7d8a835 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -34,6 +34,8 @@ VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
     req->dev = s;
     req->qiov.size = 0;
     req->next = NULL;
+    req->mr_next = NULL;
+    req->mr_qiov.nalloc = 0;
     return req;
 }
 
@@ -84,20 +86,29 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, 
int error,
 
 static void virtio_blk_rw_complete(void *opaque, int ret)
 {
-    VirtIOBlockReq *req = opaque;
+    VirtIOBlockReq *next = opaque;
 
-    trace_virtio_blk_rw_complete(req, ret);
+    while (next) {
+        VirtIOBlockReq *req = next;
+        next = req->mr_next;
+        trace_virtio_blk_rw_complete(req, ret);
 
-    if (ret) {
-        int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
-        bool is_read = !(p & VIRTIO_BLK_T_OUT);
-        if (virtio_blk_handle_rw_error(req, -ret, is_read))
-            return;
-    }
+        if (req->mr_qiov.nalloc) {
+            qemu_iovec_destroy(&req->mr_qiov);
+        }
 
-    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
-    block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
-    virtio_blk_free_request(req);
+        if (ret) {
+            int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
+            bool is_read = !(p & VIRTIO_BLK_T_OUT);
+            if (virtio_blk_handle_rw_error(req, -ret, is_read)) {
+                continue;
+            }
+        }
+
+        virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+        block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
+        virtio_blk_free_request(req);
+    }
 }
 
 static void virtio_blk_flush_complete(void *opaque, int ret)
@@ -257,24 +268,46 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
     virtio_blk_free_request(req);
 }
 
-void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb)
+void virtio_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb)
 {
-    int i, ret;
+    QEMUIOVector *qiov = &mrb->reqs[0]->qiov;
 
-    if (!mrb->num_writes) {
+    if (!mrb->num_reqs) {
         return;
     }
 
-    ret = blk_aio_multiwrite(blk, mrb->blkreq, mrb->num_writes);
-    if (ret != 0) {
-        for (i = 0; i < mrb->num_writes; i++) {
-            if (mrb->blkreq[i].error) {
-                virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
+    if (mrb->num_reqs > 1) {
+        int i;
+        
+        trace_virtio_blk_submit_multireq(mrb, mrb->num_reqs, mrb->sector_num, 
mrb->nb_sectors, mrb->is_write);
+        
+        qiov = &mrb->reqs[0]->mr_qiov;
+        qemu_iovec_init(qiov, mrb->niov);
+        
+        for (i = 0; i < mrb->num_reqs; i++) {
+            qemu_iovec_concat(qiov, &mrb->reqs[i]->qiov, 0, 
mrb->reqs[i]->qiov.size);
+            if (i) {
+                 mrb->reqs[i - 1]->mr_next = mrb->reqs[i];
             }
         }
+        assert(mrb->nb_sectors == qiov->size / BDRV_SECTOR_SIZE);
+
+        block_acct_merge_done(blk_get_stats(blk),
+                              mrb->is_write ? BLOCK_ACCT_WRITE : 
BLOCK_ACCT_READ,
+                              mrb->num_reqs - 1);
     }
 
-    mrb->num_writes = 0;
+    if (mrb->is_write) {
+        blk_aio_writev(blk, mrb->sector_num, qiov, mrb->nb_sectors,
+                       virtio_blk_rw_complete, mrb->reqs[0]);
+    } else {
+        blk_aio_readv(blk, mrb->sector_num, qiov, mrb->nb_sectors,
+                      virtio_blk_rw_complete, mrb->reqs[0]);
+    }
+
+    mrb->num_reqs = 0;
+    mrb->nb_sectors = 0;
+    mrb->niov = 0;
 }
 
 static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
@@ -283,9 +316,9 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, 
MultiReqBuffer *mrb)
                      BLOCK_ACCT_FLUSH);
 
     /*
-     * Make sure all outstanding writes are posted to the backing device.
+     * Make sure all outstanding requests are posted to the backing device.
      */
-    virtio_submit_multiwrite(req->dev->blk, mrb);
+    virtio_submit_multireq(req->dev->blk, mrb);
     blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
 }
 
@@ -308,61 +341,8 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
     return true;
 }
 
-static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
-{
-    BlockRequest *blkreq;
-    uint64_t sector;
-
-    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
-
-    trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
-
-    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
-        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
-        virtio_blk_free_request(req);
-        return;
-    }
-
-    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
-                     BLOCK_ACCT_WRITE);
-
-    if (mrb->num_writes == VIRTIO_BLK_MAX_MERGE_REQS) {
-        virtio_submit_multiwrite(req->dev->blk, mrb);
-    }
-
-    blkreq = &mrb->blkreq[mrb->num_writes];
-    blkreq->sector = sector;
-    blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
-    blkreq->qiov = &req->qiov;
-    blkreq->cb = virtio_blk_rw_complete;
-    blkreq->opaque = req;
-    blkreq->error = 0;
-
-    mrb->num_writes++;
-}
-
-static void virtio_blk_handle_read(VirtIOBlockReq *req)
-{
-    uint64_t sector;
-
-    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
-
-    trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512);
-
-    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
-        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
-        virtio_blk_free_request(req);
-        return;
-    }
-
-    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
-                     BLOCK_ACCT_READ);
-    blk_aio_readv(req->dev->blk, sector, &req->qiov,
-                  req->qiov.size / BDRV_SECTOR_SIZE,
-                  virtio_blk_rw_complete, req);
-}
-
-void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
+void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb_wr,
+                               MultiReqBuffer *mrb_rd)
 {
     uint32_t type;
     struct iovec *in_iov = req->elem.in_sg;
@@ -397,7 +377,7 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, 
MultiReqBuffer *mrb)
     type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
 
     if (type & VIRTIO_BLK_T_FLUSH) {
-        virtio_blk_handle_flush(req, mrb);
+        virtio_blk_handle_flush(req, mrb_wr);
     } else if (type & VIRTIO_BLK_T_SCSI_CMD) {
         virtio_blk_handle_scsi(req);
     } else if (type & VIRTIO_BLK_T_GET_ID) {
@@ -414,13 +394,62 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, 
MultiReqBuffer *mrb)
         iov_from_buf(in_iov, in_num, 0, serial, size);
         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
         virtio_blk_free_request(req);
-    } else if (type & VIRTIO_BLK_T_OUT) {
-        qemu_iovec_init_external(&req->qiov, iov, out_num);
-        virtio_blk_handle_write(req, mrb);
-    } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
-        /* VIRTIO_BLK_T_IN is 0, so we can't just & it. */
-        qemu_iovec_init_external(&req->qiov, in_iov, in_num);
-        virtio_blk_handle_read(req);
+    } else if (type & VIRTIO_BLK_T_OUT || type == VIRTIO_BLK_T_IN || type == 
VIRTIO_BLK_T_BARRIER) {
+        bool is_write = type & VIRTIO_BLK_T_OUT;
+        MultiReqBuffer *mrb = is_write ? mrb_wr : mrb_rd;
+        int64_t sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev), 
&req->out.sector);
+        int max_transfer_length = blk_get_max_transfer_length(req->dev->blk);
+        int nb_sectors = 0;
+        bool merge = true;
+
+        if (!virtio_blk_sect_range_ok(req->dev, sector_num, req->qiov.size)) {
+            virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+            virtio_blk_free_request(req);
+            return;
+        }
+
+        if (is_write) {
+            qemu_iovec_init_external(&req->qiov, iov, out_num);
+            trace_virtio_blk_handle_write(req, sector_num, req->qiov.size / 
BDRV_SECTOR_SIZE);
+        } else {
+            qemu_iovec_init_external(&req->qiov, in_iov, in_num);
+            trace_virtio_blk_handle_read(req, sector_num, req->qiov.size / 
BDRV_SECTOR_SIZE);
+        }
+
+        nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
+
+        block_acct_start(blk_get_stats(req->dev->blk), &req->acct, 
req->qiov.size,
+                         is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+        /* merge would exceed maximum number of requests or IOVs */
+        if (mrb->num_reqs == MAX_MERGE_REQS ||
+            mrb->niov + req->qiov.niov + 1 > IOV_MAX) {
+            merge = false;
+        }
+
+        /* merge would exceed maximum transfer length of backend device */
+        if (max_transfer_length &&
+            mrb->nb_sectors + nb_sectors > max_transfer_length) {
+            merge = false;
+        }
+
+        /* requests are not sequential */
+        if (mrb->num_reqs && mrb->sector_num + mrb->nb_sectors != sector_num) {
+            merge = false;
+        }
+
+        if (!merge) {
+            virtio_submit_multireq(req->dev->blk, mrb);
+        }
+
+        if (mrb->num_reqs == 0) {
+            mrb->sector_num = sector_num;
+        }
+
+        mrb->nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE;
+        mrb->reqs[mrb->num_reqs] = req;
+        mrb->niov += req->qiov.niov;
+        mrb->num_reqs++;
     } else {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
         virtio_blk_free_request(req);
@@ -431,9 +460,8 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, 
VirtQueue *vq)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
     VirtIOBlockReq *req;
-    MultiReqBuffer mrb = {
-        .num_writes = 0,
-    };
+    MultiReqBuffer mrb_rd = {};
+    MultiReqBuffer mrb_wr = {.is_write = 1};
 
     /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
      * dataplane here instead of waiting for .set_status().
@@ -444,10 +472,11 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, 
VirtQueue *vq)
     }
 
     while ((req = virtio_blk_get_request(s))) {
-        virtio_blk_handle_request(req, &mrb);
+        virtio_blk_handle_request(req, &mrb_wr, &mrb_rd);
     }
 
-    virtio_submit_multiwrite(s->blk, &mrb);
+    virtio_submit_multireq(s->blk, &mrb_wr);
+    virtio_submit_multireq(s->blk, &mrb_rd);
 
     /*
      * FIXME: Want to check for completions before returning to guest mode,
@@ -460,9 +489,8 @@ static void virtio_blk_dma_restart_bh(void *opaque)
 {
     VirtIOBlock *s = opaque;
     VirtIOBlockReq *req = s->rq;
-    MultiReqBuffer mrb = {
-        .num_writes = 0,
-    };
+    MultiReqBuffer mrb_rd = {};
+    MultiReqBuffer mrb_wr = {.is_write = 1};
 
     qemu_bh_delete(s->bh);
     s->bh = NULL;
@@ -471,11 +499,12 @@ static void virtio_blk_dma_restart_bh(void *opaque)
 
     while (req) {
         VirtIOBlockReq *next = req->next;
-        virtio_blk_handle_request(req, &mrb);
+        virtio_blk_handle_request(req, &mrb_wr, &mrb_rd);
         req = next;
     }
 
-    virtio_submit_multiwrite(s->blk, &mrb);
+    virtio_submit_multireq(s->blk, &mrb_wr);
+    virtio_submit_multireq(s->blk, &mrb_rd);
 }
 
 static void virtio_blk_dma_restart_cb(void *opaque, int running,
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index 3f2652f..98d3d83 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -134,13 +134,6 @@ typedef struct VirtIOBlock {
     struct VirtIOBlockDataPlane *dataplane;
 } VirtIOBlock;
 
-#define VIRTIO_BLK_MAX_MERGE_REQS 32
-
-typedef struct MultiReqBuffer {
-    BlockRequest        blkreq[VIRTIO_BLK_MAX_MERGE_REQS];
-    unsigned int        num_writes;
-} MultiReqBuffer;
-
 typedef struct VirtIOBlockReq {
     VirtIOBlock *dev;
     VirtQueueElement elem;
@@ -149,8 +142,21 @@ typedef struct VirtIOBlockReq {
     QEMUIOVector qiov;
     struct VirtIOBlockReq *next;
     BlockAcctCookie acct;
+    QEMUIOVector mr_qiov;
+    struct VirtIOBlockReq *mr_next;
 } VirtIOBlockReq;
 
+#define MAX_MERGE_REQS 32
+
+typedef struct MultiReqBuffer {
+    VirtIOBlockReq *reqs[MAX_MERGE_REQS];
+    unsigned int num_reqs;
+    bool is_write;
+    int niov;
+    int64_t sector_num;
+    int nb_sectors;
+} MultiReqBuffer;
+
 VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s);
 
 void virtio_blk_free_request(VirtIOBlockReq *req);
@@ -158,8 +164,9 @@ void virtio_blk_free_request(VirtIOBlockReq *req);
 int virtio_blk_handle_scsi_req(VirtIOBlock *blk,
                                VirtQueueElement *elem);
 
-void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb);
+void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb_wr,
+                               MultiReqBuffer *mrb_rd);
 
-void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb);
+void virtio_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb);
 
 #endif
diff --git a/trace-events b/trace-events
index b5722ea..1b2fbf3 100644
--- a/trace-events
+++ b/trace-events
@@ -116,6 +116,7 @@ virtio_blk_req_complete(void *req, int status) "req %p 
status %d"
 virtio_blk_rw_complete(void *req, int ret) "req %p ret %d"
 virtio_blk_handle_write(void *req, uint64_t sector, size_t nsectors) "req %p 
sector %"PRIu64" nsectors %zu"
 virtio_blk_handle_read(void *req, uint64_t sector, size_t nsectors) "req %p 
sector %"PRIu64" nsectors %zu"
+virtio_blk_submit_multireq(void *mrb, int num_reqs, uint64_t sector, size_t 
nsectors, bool is_write) "mrb %p num_reqs %d sector %"PRIu64" nsectors %zu 
is_write %d"
 
 # hw/block/dataplane/virtio-blk.c
 virtio_blk_data_plane_start(void *s) "dataplane %p"
-- 
1.7.9.5




reply via email to

[Prev in Thread] Current Thread [Next in Thread]