qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH] Perform emulated IDE flushes asynchronously.


From: Ian Jackson
Subject: [Qemu-devel] [PATCH] Perform emulated IDE flushes asynchronously.
Date: Fri, 28 Mar 2008 17:08:50 +0000

Presently, a guest may be blocked while bdrv_flush completes if it
uses the IDE FLUSH_CACHE command.  With this patch, it may continue
executing while the cache completes.

This avoids a problem detected in a Xen installation where Windows
does a flush and blocks for long enough that it loses a substantial
amount of wall clock time - enough for it to fail.

This patch introduces a new bdrv_aio_flush method and implementations
in the relevant drivers, and uses it for the IDE flush cache
operation.

Ian.

Signed-off-by: Ian Jackson <address@hidden>
Modified-by: Ian Jackson <address@hidden>
Signed-off-by: Kouya Shimura <address@hidden>

>From 59b4a4d1c2ab84c560576861e2a817bdfdd686fd Mon Sep 17 00:00:00 2001
From: Ian Jackson <address@hidden>
Date: Fri, 28 Mar 2008 16:54:09 +0000
Subject: [PATCH] Perform emulated IDE flushes asynchronously.

We arrange for the WIN_FLUSH_CACHE and WIN_FLUSH_CACHE_EXT
commands to use a new bdrv_aio_flush facility.

If there is an error, the ATA-7 spec says that we are supposed to know
which is the first block whose flush failed and leave that in the
block offset registers.  However since we are using f(data)sync that's
not possible for us.  There is sadly no way for us to report the error
which won't encourage the guest to try to understand what went wrong
and then do the flush again expecting the remaining blocks to be
written (as specified by ATA-7).

So if the asynchronous flush fails, we kill the disk by detaching
->bs.  This makes it vanish: we don't generate any more interrupts,
leave status set to busy, and ignore future commands (and discard any
in-flight IO).  Alan Cox reports that this will probably induce the
best available behaviour in guests (retry for a while and then give
up).  Fine-grained error reporting is available if the guest turns off
the write cache.

Signed-off-by: Ian Jackson <address@hidden>
Modified-by: Ian Jackson <address@hidden>
Signed-off-by: Kouya Shimura <address@hidden>
---
 block-qcow.c      |    8 +++++++
 block-qcow2.c     |    8 +++++++
 block-raw-posix.c |   16 +++++++++++++++
 block.c           |   39 ++++++++++++++++++++++++++++++++++++++
 block.h           |    2 +
 block_int.h       |    2 +
 hw/ide.c          |   54 +++++++++++++++++++++++++++++++++++++++++++++-------
 7 files changed, 121 insertions(+), 8 deletions(-)

diff --git a/block-qcow.c b/block-qcow.c
index 59f1c66..4798ade 100644
--- a/block-qcow.c
+++ b/block-qcow.c
@@ -725,6 +725,13 @@ static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
     qemu_aio_release(acb);
 }
 
+static BlockDriverAIOCB *qcow_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVQcowState *s = bs->opaque;
+    return bdrv_aio_flush(s->hd, cb, opaque);
+}
+
 static void qcow_close(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
@@ -913,6 +920,7 @@ BlockDriver bdrv_qcow = {
     .bdrv_aio_read = qcow_aio_read,
     .bdrv_aio_write = qcow_aio_write,
     .bdrv_aio_cancel = qcow_aio_cancel,
+    .bdrv_aio_flush = qcow_aio_flush,
     .aiocb_size = sizeof(QCowAIOCB),
     .bdrv_write_compressed = qcow_write_compressed,
     .bdrv_get_info = qcow_get_info,
diff --git a/block-qcow2.c b/block-qcow2.c
index 9bad090..9c63a63 100644
--- a/block-qcow2.c
+++ b/block-qcow2.c
@@ -1007,6 +1007,13 @@ static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
     qemu_aio_release(acb);
 }
 
+static BlockDriverAIOCB *qcow_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVQcowState *s = bs->opaque;
+    return bdrv_aio_flush(s->hd, cb, opaque);
+}
+
 static void qcow_close(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
@@ -2241,6 +2248,7 @@ BlockDriver bdrv_qcow2 = {
     .bdrv_aio_read = qcow_aio_read,
     .bdrv_aio_write = qcow_aio_write,
     .bdrv_aio_cancel = qcow_aio_cancel,
+    .bdrv_aio_flush = qcow_aio_flush,
     .aiocb_size = sizeof(QCowAIOCB),
     .bdrv_write_compressed = qcow_write_compressed,
 
diff --git a/block-raw-posix.c b/block-raw-posix.c
index 45d0a93..768d52c 100644
--- a/block-raw-posix.c
+++ b/block-raw-posix.c
@@ -762,6 +762,21 @@ static int fd_open(BlockDriverState *bs)
 {
     return 0;
 }
+
+static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    RawAIOCB *acb;
+
+    acb = raw_aio_setup(bs, 0, NULL, 0, cb, opaque);
+    if (!acb)
+        return NULL;
+    if (aio_fsync(O_SYNC, &acb->aiocb) < 0) {
+        qemu_aio_release(acb);
+        return NULL;
+    }
+    return &acb->common;
+}
 #endif
 
 #if defined(__linux__)
@@ -913,6 +928,7 @@ BlockDriver bdrv_host_device = {
     .bdrv_aio_read = raw_aio_read,
     .bdrv_aio_write = raw_aio_write,
     .bdrv_aio_cancel = raw_aio_cancel,
+    .bdrv_aio_flush = raw_aio_flush,
     .aiocb_size = sizeof(RawAIOCB),
     .bdrv_pread = raw_pread,
     .bdrv_pwrite = raw_pwrite,
diff --git a/block.c b/block.c
index fc32a60..348a215 100644
--- a/block.c
+++ b/block.c
@@ -50,6 +50,8 @@ static BlockDriverAIOCB *bdrv_aio_read_em(BlockDriverState 
*bs,
 static BlockDriverAIOCB *bdrv_aio_write_em(BlockDriverState *bs,
         int64_t sector_num, const uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque);
+static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque);
 static void bdrv_aio_cancel_em(BlockDriverAIOCB *acb);
 static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
                         uint8_t *buf, int nb_sectors);
@@ -137,6 +139,8 @@ static void bdrv_register(BlockDriver *bdrv)
         bdrv->bdrv_read = bdrv_read_em;
         bdrv->bdrv_write = bdrv_write_em;
     }
+    if (!bdrv->bdrv_aio_flush)
+        bdrv->bdrv_aio_flush = bdrv_aio_flush_em;
     bdrv->next = first_drv;
     first_drv = bdrv;
 }
@@ -1156,6 +1160,17 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
     drv->bdrv_aio_cancel(acb);
 }
 
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, 
+                                 BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (!drv)
+        return NULL;
+
+    return drv->bdrv_aio_flush(bs, cb, opaque);
+}
+
 
 /**************************************************************/
 /* async block device emulation */
@@ -1181,6 +1196,15 @@ static BlockDriverAIOCB 
*bdrv_aio_write_em(BlockDriverState *bs,
     return NULL;
 }
 
+static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    int ret;
+    ret = bdrv_flush(bs);
+    cb(opaque, ret);
+    return NULL;
+}
+
 static void bdrv_aio_cancel_em(BlockDriverAIOCB *acb)
 {
 }
@@ -1224,6 +1248,21 @@ static BlockDriverAIOCB 
*bdrv_aio_write_em(BlockDriverState *bs,
     return &acb->common;
 }
 
+static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriverAIOCBSync *acb;
+    int ret;
+
+    acb = qemu_aio_get(bs, cb, opaque);
+    if (!acb->bh)
+        acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
+    ret = bdrv_flush(bs);
+    acb->ret = ret;
+    qemu_bh_schedule(acb->bh);
+    return &acb->common;
+}
+
 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
 {
     BlockDriverAIOCBSync *acb = (BlockDriverAIOCBSync *)blockacb;
diff --git a/block.h b/block.h
index 861a65b..0ecb0f8 100644
--- a/block.h
+++ b/block.h
@@ -86,6 +86,8 @@ BlockDriverAIOCB *bdrv_aio_read(BlockDriverState *bs, int64_t 
sector_num,
 BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, int64_t sector_num,
                                  const uint8_t *buf, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, 
+                                 BlockDriverCompletionFunc *cb, void *opaque);
 void bdrv_aio_cancel(BlockDriverAIOCB *acb);
 
 void qemu_aio_init(void);
diff --git a/block_int.h b/block_int.h
index 796ce29..bda0eab 100644
--- a/block_int.h
+++ b/block_int.h
@@ -55,6 +55,8 @@ struct BlockDriver {
         int64_t sector_num, const uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque);
     void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb);
+    BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque);
     int aiocb_size;
 
     const char *protocol_name;
diff --git a/hw/ide.c b/hw/ide.c
index 0fb58e7..06eacdb 100644
--- a/hw/ide.c
+++ b/hw/ide.c
@@ -724,6 +724,7 @@ static inline void ide_abort_command(IDEState *s)
 static inline void ide_set_irq(IDEState *s)
 {
     BMDMAState *bm = s->bmdma;
+    if (!s->bs) return; /* yikes */
     if (!(s->cmd & IDE_CMD_DISABLE_IRQ)) {
         if (bm) {
             bm->status |= BM_STATUS_INT;
@@ -903,6 +904,8 @@ static void ide_read_dma_cb(void *opaque, int ret)
        return;
     }
 
+    if (!s->bs) return; /* yikes */
+
     n = s->io_buffer_size >> 9;
     sector_num = ide_get_sector(s);
     if (n > 0) {
@@ -1016,6 +1019,8 @@ static void ide_write_dma_cb(void *opaque, int ret)
        return;
     }
 
+    if (!s->bs) return; /* yikes */
+
     n = s->io_buffer_size >> 9;
     sector_num = ide_get_sector(s);
     if (n > 0) {
@@ -1068,6 +1073,39 @@ static void ide_sector_write_dma(IDEState *s)
     ide_dma_start(s, ide_write_dma_cb);
 }
 
+static void ide_device_utterly_broken(IDEState *s) {
+    s->status |= BUSY_STAT;
+    s->bs = NULL;
+    /* This prevents all future commands from working.  All of the
+     * asynchronous callbacks (and ide_set_irq, as a safety measure)
+     * check to see whether this has happened and bail if so.
+     */
+}
+
+static void ide_flush_cb(void *opaque, int ret)
+{
+    IDEState *s = opaque;
+
+    if (!s->bs) return; /* yikes */
+
+    if (ret) {
+        /* We are completely doomed.  The IDE spec does not permit us
+        * to return an error from a flush except via a protocol which
+        * requires us to say where the error is and which
+        * contemplates the guest repeating the flush attempt to
+        * attempt flush the remaining data.  We can't support that
+        * because f(data)sync (which is what the block drivers use
+        * eventually) doesn't report the necessary information or
+        * give us the necessary control.  So we make the disk vanish.
+        */
+       ide_device_utterly_broken(s);
+       return;
+    }
+    else
+        s->status = READY_STAT;
+    ide_set_irq(s);
+}
+
 static void ide_atapi_cmd_ok(IDEState *s)
 {
     s->error = 0;
@@ -1294,6 +1332,8 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int 
ret)
     IDEState *s = bm->ide_if;
     int data_offset, n;
 
+    if (!s->bs) return; /* yikes */
+
     if (ret < 0) {
         ide_atapi_io_error(s, ret);
         goto eot;
@@ -1872,6 +1912,8 @@ static void cdrom_change_cb(void *opaque)
     IDEState *s = opaque;
     uint64_t nb_sectors;
 
+    if (!s->bs) return; /* yikes */
+
     /* XXX: send interrupt too */
     bdrv_get_geometry(s->bs, &nb_sectors);
     s->nb_sectors = nb_sectors;
@@ -1975,8 +2017,8 @@ static void ide_ioport_write(void *opaque, uint32_t addr, 
uint32_t val)
         printf("ide: CMD=%02x\n", val);
 #endif
         s = ide_if->cur_drive;
-        /* ignore commands to non existant slave */
-        if (s != ide_if && !s->bs)
+       /* ignore commands to non existant device */
+        if (!s->bs)
             break;
 
         switch(val) {
@@ -2169,12 +2211,8 @@ static void ide_ioport_write(void *opaque, uint32_t 
addr, uint32_t val)
             break;
         case WIN_FLUSH_CACHE:
         case WIN_FLUSH_CACHE_EXT:
-            if (s->bs) {
-                ret = bdrv_flush(s->bs);
-               if (ret) goto abort_cmd;
-           }
-           s->status = READY_STAT;
-            ide_set_irq(s);
+           s->status = BUSY_STAT;
+           bdrv_aio_flush(s->bs, ide_flush_cb, s);
             break;
         case WIN_STANDBY:
         case WIN_STANDBY2:
-- 
1.4.4.4


reply via email to

[Prev in Thread] Current Thread [Next in Thread]