[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [patch 3/3] block migration: do not submit multiple AIOs fo
From: |
Marcelo Tosatti |
Subject: |
[Qemu-devel] [patch 3/3] block migration: do not submit multiple AIOs for same sector |
Date: |
Mon, 08 Nov 2010 17:02:56 -0200 |
User-agent: |
quilt/0.47-1 |
Block migration can submit multiple AIO reads for the same sector/chunk, but
completion of such reads can happen out of order:
migration guest
- get_dirty(N)
- aio_read(N)
- clear_dirty(N)
write(N)
set_dirty(N)
- get_dirty(N)
- aio_read(N)
If the first aio_read completes after the second, stale data will be
migrated to the destination.
Fix by not allowing multiple AIOs inflight for the same sector.
Signed-off-by: Marcelo Tosatti <address@hidden>
Index: qemu-kvm/block-migration.c
===================================================================
--- qemu-kvm.orig/block-migration.c
+++ qemu-kvm/block-migration.c
@@ -49,12 +49,14 @@ typedef struct BlkMigDevState {
int64_t total_sectors;
int64_t dirty;
QSIMPLEQ_ENTRY(BlkMigDevState) entry;
+ unsigned long *aio_bitmap;
} BlkMigDevState;
typedef struct BlkMigBlock {
uint8_t *buf;
BlkMigDevState *bmds;
int64_t sector;
+ int nr_sectors;
struct iovec iov;
QEMUIOVector qiov;
BlockDriverAIOCB *aiocb;
@@ -140,6 +142,57 @@ static inline long double compute_read_b
return (block_mig_state.reads * BLOCK_SIZE)/ block_mig_state.total_time;
}
+static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
+{
+ int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+ if (bmds->aio_bitmap &&
+ (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) {
+ return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
+ (1UL << (chunk % (sizeof(unsigned long) * 8))));
+ } else {
+ return 0;
+ }
+}
+
+static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
+ int nb_sectors, int set)
+{
+ int64_t start, end;
+ unsigned long val, idx, bit;
+
+ start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
+ end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+ for (; start <= end; start++) {
+ idx = start / (sizeof(unsigned long) * 8);
+ bit = start % (sizeof(unsigned long) * 8);
+ val = bmds->aio_bitmap[idx];
+ if (set) {
+ if (!(val & (1UL << bit))) {
+ val |= 1UL << bit;
+ }
+ } else {
+ if (val & (1UL << bit)) {
+ val &= ~(1UL << bit);
+ }
+ }
+ bmds->aio_bitmap[idx] = val;
+ }
+}
+
+static void alloc_aio_bitmap(BlkMigDevState *bmds)
+{
+ BlockDriverState *bs = bmds->bs;
+ int64_t bitmap_size;
+
+ bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
+ BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
+ bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
+
+ bmds->aio_bitmap = qemu_mallocz(bitmap_size);
+}
+
static void blk_mig_read_cb(void *opaque, int ret)
{
BlkMigBlock *blk = opaque;
@@ -151,6 +204,7 @@ static void blk_mig_read_cb(void *opaque
add_avg_read_time(blk->time);
QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
+ bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);
block_mig_state.submitted--;
block_mig_state.read_done++;
@@ -194,6 +248,7 @@ static int mig_save_device_bulk(Monitor
blk->buf = qemu_malloc(BLOCK_SIZE);
blk->bmds = bmds;
blk->sector = cur_sector;
+ blk->nr_sectors = nr_sectors;
blk->iov.iov_base = blk->buf;
blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
@@ -248,6 +303,7 @@ static void init_blk_migration_it(void *
bmds->total_sectors = sectors;
bmds->completed_sectors = 0;
bmds->shared_base = block_mig_state.shared_base;
+ alloc_aio_bitmap(bmds);
block_mig_state.total_sector_sum += sectors;
@@ -329,6 +385,8 @@ static int mig_save_device_dirty(Monitor
int nr_sectors;
for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
+ if (bmds_aio_inflight(bmds, sector))
+ qemu_aio_flush();
if (bdrv_get_dirty(bmds->bs, sector)) {
if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
@@ -340,6 +398,7 @@ static int mig_save_device_dirty(Monitor
blk->buf = qemu_malloc(BLOCK_SIZE);
blk->bmds = bmds;
blk->sector = sector;
+ blk->nr_sectors = nr_sectors;
if (is_async) {
blk->iov.iov_base = blk->buf;
@@ -354,6 +413,7 @@ static int mig_save_device_dirty(Monitor
goto error;
}
block_mig_state.submitted++;
+ bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
} else {
if (bdrv_read(bmds->bs, sector, blk->buf,
nr_sectors) < 0) {
@@ -474,6 +534,7 @@ static void blk_mig_cleanup(Monitor *mon
while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
+ qemu_free(bmds->aio_bitmap);
qemu_free(bmds);
}