qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH] bdrv_aio_flush


From: Andrea Arcangeli
Subject: [Qemu-devel] [PATCH] bdrv_aio_flush
Date: Fri, 29 Aug 2008 15:37:37 +0200

Hello,

while reading the aio/ide code I noticed the bdrv_flush operation is
unsafe. When a write command is submitted with bdrv_aio_write and
later bdrv_flush is called, fsync will do nothing. fsync only sees the
kernel writeback cache. But the write command is still queued in the
aio kernel thread and is still invisible to the kernel. bdrv_aio_flush
will instead see both the regular bdrv_write (that submits data to the
kernel synchronously) as well as the bdrv_aio_write as the fsync will
be queued at the end of the aio queue and it'll be issued by the aio
pthread thread itself.

IDE works by luck because it can only submit one command at once (no
tagged queueing) so supposedly the guest kernel driver will wait the
IDE emulated device to return ready before issuing a journaling
barrier with WIN_FLUSH_CACHE* but with scsi and tagged command
queueing this bug in the aio common code will become visible and it'll
break the journaling guarantees of the guest if there's a power loss
in the host. So it's not urgent for IDE I think, but it clearly should
be fixed in the qemu block model eventually.

This should fix it. Unfortunately I didn't implement all the backends
and I only tested it with KVM so far. Also I don't know for sure if
the BUSY line should get set during the flush, I assumed yes. It was
irrelevant before because the flush was happening atomically before
returning from the ioport write from guest point of view.

Signed-off-by: Andrea Arcangeli <address@hidden>

Index: block_int.h
===================================================================
--- block_int.h (revision 5089)
+++ block_int.h (working copy)
@@ -54,6 +54,8 @@
     BlockDriverAIOCB *(*bdrv_aio_write)(BlockDriverState *bs,
         int64_t sector_num, const uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque);
+    BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque);
     void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb);
     int aiocb_size;
 
Index: block-raw-posix.c
===================================================================
--- block-raw-posix.c   (revision 5089)
+++ block-raw-posix.c   (working copy)
@@ -638,6 +638,21 @@
     return &acb->common;
 }
 
+static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    RawAIOCB *acb;
+
+    acb = raw_aio_setup(bs, 0, NULL, 0, cb, opaque);
+    if (!acb)
+        return NULL;
+    if (aio_fsync(O_DSYNC, &acb->aiocb) < 0) {
+        qemu_aio_release(acb);
+        return NULL;
+    }
+    return &acb->common;
+}
+
 static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs,
         int64_t sector_num, const uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque)
@@ -857,6 +872,7 @@
 #ifdef CONFIG_AIO
     .bdrv_aio_read = raw_aio_read,
     .bdrv_aio_write = raw_aio_write,
+    .bdrv_aio_flush = raw_aio_flush,
     .bdrv_aio_cancel = raw_aio_cancel,
     .aiocb_size = sizeof(RawAIOCB),
 #endif
@@ -1211,6 +1227,7 @@
 #ifdef CONFIG_AIO
     .bdrv_aio_read = raw_aio_read,
     .bdrv_aio_write = raw_aio_write,
+    .bdrv_aio_flush = raw_aio_flush,
     .bdrv_aio_cancel = raw_aio_cancel,
     .aiocb_size = sizeof(RawAIOCB),
 #endif
Index: block-qcow2.c
===================================================================
--- block-qcow2.c       (revision 5089)
+++ block-qcow2.c       (working copy)
@@ -1369,6 +1369,42 @@
     return &acb->common;
 }
 
+static void qcow_aio_flush_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+
+    acb->hd_aiocb = NULL;
+    if (ret < 0) {
+        acb->common.cb(acb->common.opaque, ret);
+        qemu_aio_release(acb);
+        return;
+    }
+
+    /* request completed */
+    acb->common.cb(acb->common.opaque, 0);
+    qemu_aio_release(acb);
+}
+
+static BlockDriverAIOCB *qcow_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowAIOCB *acb;
+
+    acb = qcow_aio_setup(bs, 0, NULL, 0, cb, opaque);
+    if (!acb)
+        return NULL;
+
+    acb->hd_aiocb = bdrv_aio_flush(s->hd,
+                                  qcow_aio_flush_cb, acb);
+    if (acb->hd_aiocb == NULL)
+           qcow_aio_flush_cb(acb, -ENOMEM);
+
+    return &acb->common;
+}
+
 static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
 {
     QCowAIOCB *acb = (QCowAIOCB *)blockacb;
@@ -2612,6 +2648,7 @@
 
     .bdrv_aio_read = qcow_aio_read,
     .bdrv_aio_write = qcow_aio_write,
+    .bdrv_aio_flush = qcow_aio_flush,
     .bdrv_aio_cancel = qcow_aio_cancel,
     .aiocb_size = sizeof(QCowAIOCB),
     .bdrv_write_compressed = qcow_write_compressed,
Index: block.c
===================================================================
--- block.c     (revision 5089)
+++ block.c     (working copy)
@@ -1181,6 +1181,20 @@
     return ret;
 }
 
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+                                BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+
+    if (!drv)
+        return NULL;
+
+    ret = drv->bdrv_aio_flush(bs, cb, opaque);
+
+    return ret;
+}
+
 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
 {
     BlockDriver *drv = acb->bs->drv;
Index: block.h
===================================================================
--- block.h     (revision 5089)
+++ block.h     (working copy)
@@ -87,6 +87,8 @@
 BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, int64_t sector_num,
                                  const uint8_t *buf, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+                                 BlockDriverCompletionFunc *cb, void *opaque);
 void bdrv_aio_cancel(BlockDriverAIOCB *acb);
 
 void qemu_aio_init(void);
Index: hw/ide.c
===================================================================
--- hw/ide.c    (revision 5089)
+++ hw/ide.c    (working copy)
@@ -1969,6 +1969,13 @@
     ide_if[1].select &= ~(1 << 7);
 }
 
+static void ide_flush_dma_cb(void *opaque, int ret)
+{
+    IDEState *s = opaque;
+    s->status = READY_STAT | SEEK_STAT;
+    ide_set_irq(s);
+}
+
 static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
 {
     IDEState *ide_if = opaque;
@@ -2237,8 +2244,11 @@
             break;
         case WIN_FLUSH_CACHE:
         case WIN_FLUSH_CACHE_EXT:
-            if (s->bs)
-                bdrv_flush(s->bs);
+           if (s->bs) {
+               s->status = READY_STAT | SEEK_STAT | BUSY_STAT;
+               bdrv_aio_flush(s->bs, ide_flush_dma_cb, s);
+               break;
+           }
            s->status = READY_STAT | SEEK_STAT;
             ide_set_irq(s);
             break;




reply via email to

[Prev in Thread] Current Thread [Next in Thread]