From: Andrea Arcangeli
Subject: [Qemu-devel] [RFC 1/1] pci-dma-api-v2
Date: Sun, 30 Nov 2008 18:41:33 +0100

On Fri, Nov 28, 2008 at 07:50:01PM +0100, Andrea Arcangeli wrote:
> better. I've also noticed some problems with windows (I didn't test
> windows before posting), those aren't related to the cache layer as I
> added a #define to disable it and replace it with malloc/free. But
> that's not the cache layer; as soon as windows runs completely
> flawlessly I'll post an update.

As promised, here is an update. I fixed all outstanding issues, and
it's now running rock solid. Before, there were a few problem spots
in ide.c, and the cache layer was very buggy with more than one DMA
in flight.

One of the issues was that sg[].len may not be aligned; it's the
total DMA length that has to be aligned. While Linux never submits
memory regions whose sg[].len isn't 512-byte aligned, other OSes
submit 512-byte I/O scattered over several memory regions, each
smaller than 512 bytes. So the alignment check is now done on the
whole DMA operation (param->curr_len), and the maximum alignment
depends on the maximum size of the bounce buffer (currently hardcoded
to 1M; it can't be too big for security reasons). The way host direct
DMA works, each memory region length must be a multiple of the
alignment, so bounce buffering is a must for those few cases where
the buffer is smaller than the blocksize. I guess real hardware
behaves similarly, and DMA operations with sg entries shorter than
512 bytes run slower there too.
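
To make the distinction concrete, here is a minimal sketch of the
check (illustrative only; the names mirror the patch below, but this
is not its exact code): direct DMA is attempted only when the total
transfer length is aligned, even if individual sg[].len entries are
smaller than a sector.

    /* sketch: per-entry lengths may be unaligned; only the sum matters */
    static int dma_total_len_aligned(const QEMUPciDmaSg *sg, int iovcnt,
                                     size_t alignment /* e.g. 512 */)
    {
        size_t total = 0;
        int idx;
        for (idx = 0; idx < iovcnt; idx++) {
            if (total + sg[idx].len <= total) /* overflow: give up */
                return 0;
            total += sg[idx].len;
        }
        return (total & (alignment - 1)) == 0;
    }

If the total is aligned but an entry's host address isn't, the code
still falls back to the bounce buffer; that is what
pci_dma_sg_map_direct/pci_dma_sg_map_bounce below implement.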

I tested all combinations on various OSes: direct DMA, bounce
buffering, and bounce buffering with a 512-byte buffer (which, as
expected, runs much slower, visibly even during boot, showing how
much overhead there is even with cache=off when reading in small
chunks and entering/exiting the host kernel so frequently). I didn't
split the _em emulation helpers into a separate patch this time, as
the whole thing is so small anyway; those helpers aren't meant to be
merged as-is. However, I think the below is close to good enough. The
reason I don't think we're ready for merging into qemu svn is that
this should go in at the same time as a real bdrv_aio_readv/writev.
So once we can agree that the below is the way to go, the next thing
to discuss is how to add readv/writev ;).
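
For reference, here is roughly how a device model ends up driving the
new API (a sketch distilled from the ide.c changes below; MyDevState
and the my_* callbacks are placeholders, not part of the patch):

    /* Illustrative only: kick off a PCI DMA transfer to guest memory. */
    static int my_submit(void *opaque, struct iovec *iov, int iovcnt,
                         size_t len, BlockDriverCompletionFunc cb,
                         void *cb_param)
    {
        MyDevState *s = opaque;
        /* start the vectored AIO; 0 on success, negative on error */
        return bdrv_aio_readv(s->bs, s->sector_num, iov, iovcnt, len,
                              cb, cb_param) ? 0 : -1;
    }

    static void my_complete(void *opaque, int ret)
    {
        /* raise the device IRQ, or report ret < 0 to the guest */
    }

    static void my_start_dma(MyDevState *s, QEMUPciDmaSg *sg, int iovcnt)
    {
        pci_dma_sg(&s->pci_dev, sg, iovcnt, my_submit, my_complete, s,
                   /* dma_to_memory = */ 1, /* alignment = */ 512);
    }

pci_dma_sg() maps the guest sg list directly into host memory when it
can, and transparently bounces otherwise; the submit callback only
ever sees a host-memory iovec plus the total length.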

Signed-off-by: Andrea Arcangeli <address@hidden>

Index: block_int.h
===================================================================
--- block_int.h (revision 5818)
+++ block_int.h (working copy)
@@ -55,6 +55,8 @@
         int64_t sector_num, const uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque);
     void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb);
+    BlockDriverAIOIOV *bdrv_aio_readv;
+    BlockDriverAIOIOV *bdrv_aio_writev;
     int aiocb_size;
 
     const char *protocol_name;
Index: Makefile.target
===================================================================
--- Makefile.target     (revision 5818)
+++ Makefile.target     (working copy)
@@ -659,7 +659,7 @@
 
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
-OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
+OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o pci_dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o
@@ -668,7 +668,7 @@
 ifeq ($(TARGET_BASE_ARCH), ppc)
 CPPFLAGS += -DHAS_AUDIO -DHAS_AUDIO_CHOICE
 # shared objects
-OBJS+= ppc.o ide.o vga.o $(SOUND_HW) dma.o openpic.o
+OBJS+= ppc.o ide.o vga.o $(SOUND_HW) dma.o openpic.o pci_dma.o
 # PREP target
 OBJS+= pckbd.o ps2.o serial.o i8259.o i8254.o fdc.o m48t59.o mc146818rtc.o
 OBJS+= prep_pci.o ppc_prep.o
@@ -686,7 +686,7 @@
 OBJS+= mips_timer.o mips_int.o dma.o vga.o serial.o i8254.o i8259.o rc4030.o
 OBJS+= g364fb.o jazz_led.o
 OBJS+= ide.o gt64xxx.o pckbd.o ps2.o fdc.o mc146818rtc.o usb-uhci.o acpi.o ds1225y.o
-OBJS+= piix_pci.o parallel.o cirrus_vga.o pcspk.o $(SOUND_HW)
+OBJS+= piix_pci.o parallel.o cirrus_vga.o pcspk.o pci_dma.o $(SOUND_HW)
 OBJS+= mipsnet.o
 OBJS+= pflash_cfi01.o
 CPPFLAGS += -DHAS_AUDIO -DHAS_AUDIO_CHOICE
@@ -710,7 +710,7 @@
 else
 OBJS+= sun4m.o tcx.o pcnet.o iommu.o m48t59.o slavio_intctl.o
 OBJS+= slavio_timer.o slavio_serial.o slavio_misc.o fdc.o sparc32_dma.o
-OBJS+= cs4231.o ptimer.o eccmemctl.o sbi.o sun4c_intctl.o
+OBJS+= cs4231.o ptimer.o eccmemctl.o sbi.o sun4c_intctl.o pci_dma.o
 endif
 endif
 ifeq ($(TARGET_BASE_ARCH), arm)
@@ -731,7 +731,7 @@
 OBJS+= nseries.o blizzard.o onenand.o vga.o cbus.o tusb6010.o usb-musb.o
 OBJS+= tsc2005.o bt-hci-csr.o
 OBJS+= mst_fpga.o mainstone.o
-OBJS+= musicpal.o pflash_cfi02.o
+OBJS+= musicpal.o pflash_cfi02.o pci_dma.o
 CPPFLAGS += -DHAS_AUDIO
 endif
 ifeq ($(TARGET_BASE_ARCH), sh4)
Index: exec.c
===================================================================
--- exec.c      (revision 5818)
+++ exec.c      (working copy)
@@ -2807,7 +2807,7 @@
 /* physical memory access (slow version, mainly for debug) */
 #if defined(CONFIG_USER_ONLY)
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write)
+                            size_t len, int is_write)
 {
     int l, flags;
     target_ulong page;
@@ -2848,7 +2848,7 @@
 
 #else
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write)
+                            size_t len, int is_write)
 {
     int l, io_index;
     uint8_t *ptr;
@@ -2938,6 +2938,111 @@
     }
 }
 
+uint8_t *cpu_physical_memory_can_dma(target_phys_addr_t addr,
+                                    size_t len, int is_write,
+                                    int alignment)
+{
+    int l, first = 1;
+    uint8_t *ptr = NULL;
+    target_phys_addr_t page;
+    unsigned long pd, pd_first = 0;
+    PhysPageDesc *p;
+
+    if (len & (alignment-1))
+       goto out;
+
+    while (len > 0) {
+       page = addr & TARGET_PAGE_MASK;
+       p = phys_page_find(page >> TARGET_PAGE_BITS);
+
+       l = (page + TARGET_PAGE_SIZE) - addr;
+        if (l > len)
+            l = len;
+
+        if (!p)
+            pd = IO_MEM_UNASSIGNED;
+        else
+            pd = p->phys_offset;
+
+        if (is_write) {
+            if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM)
+               return NULL;
+        } else {
+            if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM &&
+                !(pd & IO_MEM_ROMD))
+               return NULL;
+        }
+
+       if (first) {
+           first = 0;
+           ptr = phys_ram_base + (pd & TARGET_PAGE_MASK) +
+               (addr & ~TARGET_PAGE_MASK);
+           if ((unsigned long)ptr & (alignment-1))
+               return NULL;
+           pd_first = pd;
+       }
+
+       /* nonlinear range */
+       if (pd_first != pd)
+           return NULL;
+       pd_first += TARGET_PAGE_SIZE;
+
+        len -= l;
+        addr += l;
+    }
+
+out:
+    return ptr;
+}
+
+void cpu_physical_memory_write_post_dma(target_phys_addr_t addr,
+                                       size_t len)
+{
+    int l;
+    uint8_t *ptr;
+    target_phys_addr_t page;
+    unsigned long pd;
+    PhysPageDesc *p;
+
+    while (len > 0) {
+        page = addr & TARGET_PAGE_MASK;
+
+        l = (page + TARGET_PAGE_SIZE) - addr;
+        if (l > len)
+            l = len;
+
+        p = phys_page_find(page >> TARGET_PAGE_BITS);
+        if (!p)
+            pd = IO_MEM_UNASSIGNED;
+        else
+            pd = p->phys_offset;
+
+	if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
+	    /* fall through so len/addr still advance and the loop terminates */
+	    printf("ERROR cpu_physical_memory_write_post_dma: memory layout changed\n");
+	} else {
+           unsigned long addr1;
+           addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
+           /* RAM case */
+           ptr = phys_ram_base + addr1;
+           if (!cpu_physical_memory_is_dirty(addr1)) {
+               /* invalidate code */
+               tb_invalidate_phys_page_range(addr1, addr1 + l, 0);
+               /* set dirty bit */
+               phys_ram_dirty[addr1 >> TARGET_PAGE_BITS] |=
+                   (0xff & ~CODE_DIRTY_FLAG);
+           }
+	    /* qemu doesn't execute guest code directly, but kvm does,
+	       therefore flush the instruction caches */
+           if (kvm_enabled())
+               flush_icache_range((unsigned long)ptr,
+                                  ((unsigned long)ptr)+l);
+        }
+        len -= l;
+        addr += l;
+    }
+}
+
 /* used for ROM loading : can write in RAM and ROM */
 void cpu_physical_memory_write_rom(target_phys_addr_t addr,
                                    const uint8_t *buf, int len)
Index: block.c
===================================================================
--- block.c     (revision 5818)
+++ block.c     (working copy)
@@ -53,6 +53,20 @@
                         uint8_t *buf, int nb_sectors);
 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors);
+static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+                                          int64_t sector_num,
+                                          struct iovec *iov,
+                                          int iovcnt,
+                                          size_t len,
+                                          BlockDriverCompletionFunc *cb,
+                                          void *opaque);
+static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+                                           int64_t sector_num,
+                                           struct iovec *iov,
+                                           int iovcnt,
+                                           size_t len,
+                                           BlockDriverCompletionFunc *cb,
+                                           void *opaque);
 
 BlockDriverState *bdrv_first;
 
@@ -135,6 +149,9 @@
         /* add synchronous IO emulation layer */
         bdrv->bdrv_read = bdrv_read_em;
         bdrv->bdrv_write = bdrv_write_em;
+        bdrv->bdrv_aio_readv = bdrv_aio_readv_em; /* FIXME */
+        bdrv->bdrv_aio_writev = bdrv_aio_writev_em; /* FIXME */
+        bdrv->bdrv_aio_cancel = bdrv_aio_cancel_em; /* FIXME */
     }
     bdrv->next = first_drv;
     first_drv = bdrv;
@@ -1291,7 +1308,51 @@
     drv->bdrv_aio_cancel(acb);
 }
 
+BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+                                struct iovec *iov, int iovcnt, size_t len,
+                                BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+
+    if (!drv)
+        return NULL;
+
+    ret = drv->bdrv_aio_readv(bs, sector_num, iov, iovcnt, len, cb, opaque);
+
+    if (ret) {
+       /* Update stats even though technically transfer has not happened. */
+       bs->rd_bytes += (unsigned) len;
+       bs->rd_ops ++;
+    }
+
+    return ret;
+}
+
+BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+                                 struct iovec *iov, int iovcnt, size_t len,
+                                 BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+
+    if (!drv)
+        return NULL;
+
+    ret = drv->bdrv_aio_writev(bs, sector_num, iov, iovcnt, len, cb, opaque);
+
+    if (ret) {
+       /* Update stats even though technically transfer has not happened. */
+	bs->wr_bytes += (unsigned) len;
+	bs->wr_ops ++;
+    }
+
+    return ret;
+}
+
+
 /**************************************************************/
 /* async block device emulation */
 
@@ -1341,6 +1402,74 @@
     qemu_aio_release(acb);
 }
 
+static void bdrv_aio_iov_bh_cb(void *opaque)
+{
+    BlockDriverAIOCBSync *acb = opaque;
+    acb->common.cb(acb->common.opaque, acb->ret);
+    qemu_bh_delete(acb->bh);
+    qemu_free(acb);
+}
+
+static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+                                          int64_t sector_num,
+                                          struct iovec *iov,
+                                          int iovcnt,
+                                          size_t len,
+                                          BlockDriverCompletionFunc *cb,
+                                          void *opaque)
+{
+    BlockDriverAIOCBSync *acb;
+    int ret = -1, idx;
+
+    for (idx = 0; idx < iovcnt; idx++) {
+       size_t sectors = iov[idx].iov_len >> SECTOR_BITS;
+       ret = bdrv_read(bs, sector_num, iov[idx].iov_base, sectors);
+       if (ret)
+           break;
+       sector_num += sectors;
+    }
+    acb = qemu_mallocz(sizeof(BlockDriverAIOCBSync));
+    if (!acb)
+            return NULL;
+    acb->common.bs = bs;
+    acb->common.cb = cb;
+    acb->common.opaque = opaque;
+    acb->bh = qemu_bh_new(bdrv_aio_iov_bh_cb, acb);
+    acb->ret = ret;
+    qemu_bh_schedule(acb->bh);
+    return &acb->common;
+}
+
+static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+                                           int64_t sector_num,
+                                           struct iovec *iov,
+                                           int iovcnt,
+                                           size_t len,
+                                           BlockDriverCompletionFunc *cb,
+                                           void *opaque)
+{
+    BlockDriverAIOCBSync *acb;
+    int ret = -1, idx;
+
+    for (idx = 0; idx < iovcnt; idx++) {
+       size_t sectors = iov[idx].iov_len >> SECTOR_BITS;
+       ret = bdrv_write(bs, sector_num, iov[idx].iov_base, sectors);
+       if (ret)
+           break;
+       sector_num += sectors;
+    }
+    acb = qemu_mallocz(sizeof(BlockDriverAIOCBSync));
+    if (!acb)
+            return NULL;
+    acb->common.bs = bs;
+    acb->common.cb = cb;
+    acb->common.opaque = opaque;
+    acb->bh = qemu_bh_new(bdrv_aio_iov_bh_cb, acb);
+    acb->ret = ret;
+    qemu_bh_schedule(acb->bh);
+    return &acb->common;
+}
+
 /**************************************************************/
 /* sync block device emulation */
 
Index: block.h
===================================================================
--- block.h     (revision 5818)
+++ block.h     (working copy)
@@ -83,6 +83,13 @@
 /* async block I/O */
 typedef struct BlockDriverAIOCB BlockDriverAIOCB;
 typedef void BlockDriverCompletionFunc(void *opaque, int ret);
+typedef BlockDriverAIOCB *BlockDriverAIOIOV(BlockDriverState *bs,
+                                           int64_t sector_num,
+                                           struct iovec *iov,
+                                           int iovcnt,
+                                           size_t len,
+                                           BlockDriverCompletionFunc *cb,
+                                           void *opaque);
 
 BlockDriverAIOCB *bdrv_aio_read(BlockDriverState *bs, int64_t sector_num,
                                 uint8_t *buf, int nb_sectors,
@@ -91,6 +98,12 @@
                                  const uint8_t *buf, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque);
 void bdrv_aio_cancel(BlockDriverAIOCB *acb);
+BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+                                struct iovec *iov, int iovcnt, size_t len,
+                                BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+                                 struct iovec *iov, int iovcnt, size_t len,
+                                 BlockDriverCompletionFunc *cb, void *opaque);
 
 int qemu_key_check(BlockDriverState *bs, const char *name);
 
Index: hw/ide.c
===================================================================
--- hw/ide.c    (revision 5818)
+++ hw/ide.c    (working copy)
@@ -31,6 +31,7 @@
 #include "qemu-timer.h"
 #include "sysemu.h"
 #include "ppc_mac.h"
+#include "pci_dma.h"
 
 /* debug IDE devices */
 //#define DEBUG_IDE
@@ -480,12 +481,15 @@
     struct PCIIDEState *pci_dev;
     /* current transfer state */
     uint32_t cur_addr;
-    uint32_t cur_prd_last;
-    uint32_t cur_prd_addr;
-    uint32_t cur_prd_len;
+    uint32_t cur_prd_last; /* fixme delete */
+    uint32_t cur_prd_addr; /* fixme delete */
+    uint32_t cur_prd_len; /* fixme delete */
     IDEState *ide_if;
     BlockDriverCompletionFunc *dma_cb;
     BlockDriverAIOCB *aiocb;
+    QEMUPciDmaSg sg[IDE_DMA_BUF_SECTORS];
+    BlockDriverAIOIOV *bdrv_aio_iov;
+    int dma_to_memory;
 } BMDMAState;
 
 typedef struct PCIIDEState {
@@ -870,7 +874,7 @@
 }
 
 /* return 0 if buffer completed */
-static int dma_buf_rw(BMDMAState *bm, int is_write)
+static int dma_buf_rw(BMDMAState *bm, int is_write) /* fixme delete */
 {
     IDEState *s = bm->ide_if;
     struct {
@@ -917,61 +921,80 @@
     return 1;
 }
 
-static void ide_read_dma_cb(void *opaque, int ret)
+static int build_dma_sg(BMDMAState *bm)
 {
-    BMDMAState *bm = opaque;
-    IDEState *s = bm->ide_if;
-    int n;
-    int64_t sector_num;
+    struct {
+        uint32_t addr;
+        uint32_t size;
+    } prd;
+    int len;
+    int idx;
 
-    if (ret < 0) {
-       ide_dma_error(s);
-       return;
+    for (idx = 1; idx <= IDE_DMA_BUF_SECTORS; idx++) {
+       cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
+       bm->cur_addr += 8;
+       bm->sg[idx-1].addr = le32_to_cpu(prd.addr);
+       prd.size = le32_to_cpu(prd.size);
+       len = prd.size & 0xfffe;
+       if (len == 0)
+           len = 0x10000;
+       bm->sg[idx-1].len = len;
+       /* end of table (with a fail safe of one page) */
+       if ((prd.size & 0x80000000) ||
+           (bm->cur_addr - bm->addr) >= 4096)
+           break;
     }
+    if (idx > IDE_DMA_BUF_SECTORS)
+       printf("build_dma_sg: too many sg entries\n");
+    return idx;
+}
 
-    n = s->io_buffer_size >> 9;
-    sector_num = ide_get_sector(s);
-    if (n > 0) {
-        sector_num += n;
-        ide_set_sector(s, sector_num);
-        s->nsector -= n;
-        if (dma_buf_rw(bm, 1) == 0)
-            goto eot;
-    }
+static void ide_dma_complete(void *opaque, int ret)
+{
+    BMDMAState *bm = opaque;
+    IDEState *s = bm->ide_if;
 
+    bm->bdrv_aio_iov = NULL;
+    bm->ide_if = NULL;
+    bm->aiocb = NULL;
     /* end of transfer ? */
-    if (s->nsector == 0) {
+    if (s->nsector == 0 && !ret) {
         s->status = READY_STAT | SEEK_STAT;
         ide_set_irq(s);
-    eot:
         bm->status &= ~BM_STATUS_DMAING;
         bm->status |= BM_STATUS_INT;
-        bm->dma_cb = NULL;
-        bm->ide_if = NULL;
-        bm->aiocb = NULL;
-        return;
+    } else {
+       ide_dma_error(s);
+       printf("ide_dma_complete error: nsector %d err %d\n", s->nsector, ret);
     }
+}
 
-    /* launch next transfer */
-    n = s->nsector;
-    if (n > IDE_DMA_BUF_SECTORS)
-        n = IDE_DMA_BUF_SECTORS;
-    s->io_buffer_index = 0;
-    s->io_buffer_size = n * 512;
+static int ide_dma_submit(void *opaque, struct iovec *dma_iov,
+                         int iovcnt, size_t len,
+                         BlockDriverCompletionFunc dma_cb,
+                         void *dma_cb_param)
+{
+    BMDMAState *bm = opaque;
+    IDEState *s = bm->ide_if;
+    size_t sectors;
+    int64_t sector_num;
+
+    sectors = len >> 9;
+    if (s->nsector < sectors)
+       return -3000;
+    sector_num = ide_get_sector(s);
+    ide_set_sector(s, sector_num  + sectors);
+    s->nsector -= sectors;
+
 #ifdef DEBUG_AIO
-    printf("aio_read: sector_num=%lld n=%d\n", sector_num, n);
+    printf("ide_dma_submit: sector_num=%lld n=%zd\n", sector_num, sectors);
 #endif
-    bm->aiocb = bdrv_aio_read(s->bs, sector_num, s->io_buffer, n,
-                              ide_read_dma_cb, bm);
-    ide_dma_submit_check(s, ide_read_dma_cb, bm);
-}
+    bm->aiocb = bm->bdrv_aio_iov(s->bs, sector_num, dma_iov, iovcnt, len,
+                                dma_cb, dma_cb_param);
+    if (!bm->aiocb)
+       return -3001;
 
-static void ide_sector_read_dma(IDEState *s)
-{
-    s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
-    s->io_buffer_index = 0;
-    s->io_buffer_size = 0;
-    ide_dma_start(s, ide_read_dma_cb);
+    return 0;
 }
 
 static void ide_sector_write_timer_cb(void *opaque)
@@ -1028,62 +1051,31 @@
     }
 }
 
-static void ide_write_dma_cb(void *opaque, int ret)
+static void ide_sector_dma_start(BMDMAState *bm)
 {
-    BMDMAState *bm = opaque;
-    IDEState *s = bm->ide_if;
-    int n;
-    int64_t sector_num;
-
-    if (ret < 0) {
-       ide_dma_error(s);
-       return;
-    }
-
-    n = s->io_buffer_size >> 9;
-    sector_num = ide_get_sector(s);
-    if (n > 0) {
-        sector_num += n;
-        ide_set_sector(s, sector_num);
-        s->nsector -= n;
-    }
-
-    /* end of transfer ? */
-    if (s->nsector == 0) {
-        s->status = READY_STAT | SEEK_STAT;
-        ide_set_irq(s);
-    eot:
-        bm->status &= ~BM_STATUS_DMAING;
-        bm->status |= BM_STATUS_INT;
-        bm->dma_cb = NULL;
-        bm->ide_if = NULL;
-        bm->aiocb = NULL;
-        return;
-    }
-
-    /* launch next transfer */
-    n = s->nsector;
-    if (n > IDE_DMA_BUF_SECTORS)
-        n = IDE_DMA_BUF_SECTORS;
-    s->io_buffer_index = 0;
-    s->io_buffer_size = n * 512;
-
-    if (dma_buf_rw(bm, 0) == 0)
-        goto eot;
-#ifdef DEBUG_AIO
-    printf("aio_write: sector_num=%lld n=%d\n", sector_num, n);
-#endif
-    bm->aiocb = bdrv_aio_write(s->bs, sector_num, s->io_buffer, n,
-                               ide_write_dma_cb, bm);
-    ide_dma_submit_check(s, ide_write_dma_cb, bm);
+    int iovcnt = build_dma_sg(bm);
+    pci_dma_sg((PCIDevice *)bm->pci_dev, bm->sg, iovcnt,
+              ide_dma_submit, ide_dma_complete, bm,
+              bm->dma_to_memory, 512);
 }
 
-static void ide_sector_write_dma(IDEState *s)
+static void ide_sector_dma(IDEState *s, BlockDriverAIOIOV *bdrv_aio_iov,
+                          int dma_to_memory)
 {
+    BMDMAState *bm = s->bmdma;
+    if(!bm)
+       goto err;
+
     s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
-    s->io_buffer_index = 0;
-    s->io_buffer_size = 0;
-    ide_dma_start(s, ide_write_dma_cb);
+    bm->ide_if = s;
+    bm->bdrv_aio_iov = bdrv_aio_iov;
+    bm->dma_to_memory = dma_to_memory;
+    if (bm->status & BM_STATUS_DMAING)
+       ide_sector_dma_start(bm);
+    return;
+
+err:
+    ide_dma_error(s);
 }
 
 static void ide_atapi_cmd_ok(IDEState *s)
@@ -2219,7 +2211,7 @@
             if (!s->bs)
                 goto abort_cmd;
            ide_cmd_lba48_transform(s, lba48);
-            ide_sector_read_dma(s);
+            ide_sector_dma(s, bdrv_aio_readv, 1);
             break;
        case WIN_WRITEDMA_EXT:
            lba48 = 1;
@@ -2228,7 +2220,7 @@
             if (!s->bs)
                 goto abort_cmd;
            ide_cmd_lba48_transform(s, lba48);
-            ide_sector_write_dma(s);
+            ide_sector_dma(s, bdrv_aio_writev, 0);
             s->media_changed = 1;
             break;
         case WIN_READ_NATIVE_MAX_EXT:
@@ -2852,6 +2844,7 @@
         /* cancel DMA request */
         bm->ide_if = NULL;
         bm->dma_cb = NULL;
+       bm->bdrv_aio_iov = NULL;
         if (bm->aiocb) {
 #ifdef DEBUG_AIO
             printf("aio_cancel\n");
@@ -2876,7 +2869,9 @@
         if (!(bm->status & BM_STATUS_DMAING)) {
             bm->status |= BM_STATUS_DMAING;
             /* start dma transfer if possible */
-            if (bm->dma_cb)
+           if (bm->bdrv_aio_iov)
+               ide_sector_dma_start(bm);
+            else if (bm->dma_cb)
                 bm->dma_cb(bm, 0);
         }
         bm->cmd = val & 0x09;
Index: cpu-all.h
===================================================================
--- cpu-all.h   (revision 5818)
+++ cpu-all.h   (working copy)
@@ -891,14 +891,19 @@
 CPUReadMemoryFunc **cpu_get_io_memory_read(int io_index);
 
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write);
+                            size_t len, int is_write);
+void cpu_physical_memory_write_post_dma(target_phys_addr_t addr,
+                                       size_t len);
+uint8_t *cpu_physical_memory_can_dma(target_phys_addr_t addr,
+                                    size_t len, int is_write,
+                                    int alignment);
 static inline void cpu_physical_memory_read(target_phys_addr_t addr,
-                                            uint8_t *buf, int len)
+                                            uint8_t *buf, size_t len)
 {
     cpu_physical_memory_rw(addr, buf, len, 0);
 }
 static inline void cpu_physical_memory_write(target_phys_addr_t addr,
-                                             const uint8_t *buf, int len)
+                                             const uint8_t *buf, size_t len)
 {
     cpu_physical_memory_rw(addr, (uint8_t *)buf, len, 1);
 }
diff --git a/qemu/hw/pci_dma.c b/qemu/hw/pci_dma.c
new file mode 100644
index 0000000..48762a8
--- /dev/null
+++ b/qemu/hw/pci_dma.c
@@ -0,0 +1,366 @@
+/*
+ * QEMU PCI DMA operations
+ *
+ * Copyright (c) 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include "pci_dma.h"
+
+#define MAX_DMA_BOUNCE_BUFFER (1024*1024)
+//#define DEBUG_BOUNCE
+//#define MAX_DMA_BOUNCE_BUFFER (512)
+//#define DISABLE_IOVEC_CACHE
+
+typedef struct QEMUPciDmaSgParam {
+    QEMUPciDmaSgSubmit *pci_dma_sg_submit;
+    QEMUPciDmaSgComplete *pci_dma_sg_complete;
+    void *pci_dma_sg_opaque;
+    int dma_to_memory;
+    int alignment;
+    uint8_t *bounce;
+    QEMUPciDmaSg *sg;
+    int iovcnt;
+    int restart_iovcnt;
+    size_t restart_offset;
+    int curr_restart_iovcnt;
+    size_t curr_restart_offset;
+    size_t curr_len;
+#ifndef DISABLE_IOVEC_CACHE
+    struct QEMUPciDmaSgParam *next;
+#endif
+    struct iovec iov;
+} QEMUPciDmaSgParam;
+
+#ifndef DISABLE_IOVEC_CACHE
+/*
+ * Too many entries will run slower and waste memory, this is really
+ * only about the fast path so it must be small, slow path is fine to
+ * allocate dynamically. Max memory used is slightly more than
+ * MAX_IOVEC_ENTRIES * MAX_IOVEC_IOVCNT * sizeof(struct iovec).
+ */
+#define MAX_IOVEC_ENTRIES 10
+
+/*
+ * Don't cache exceptionally large iovcnt used for huge DMA transfers
+ * as the DMA transfer may take much longer than malloc and huge
+ * memory could be wasted if it happens only once in a while.
+ */
+#define MAX_IOVEC_IOVCNT 2048
+
+static QEMUPciDmaSgParam *sg_list;
+static long max_sg_in_flight, sg_in_flight;
+
+static QEMUPciDmaSgParam *qemu_get_pci_dma_sg(int iovcnt)
+{
+    QEMUPciDmaSgParam *entry = sg_list, *last;
+
+    while (entry) {
+       if (entry->iovcnt >= iovcnt) {
+           if (entry == sg_list)
+               sg_list = sg_list->next;
+           else
+               last->next = entry->next;
+           goto found;
+       }
+       last = entry;
+       entry = entry->next;
+    }
+
+    entry = qemu_malloc(sizeof(QEMUPciDmaSgParam) +
+                       sizeof(struct iovec) * (iovcnt-1));
+    if (!entry)
+       return NULL;
+
+    if (iovcnt <= MAX_IOVEC_IOVCNT) {
+    found:
+       sg_in_flight += 1;
+       if (sg_in_flight > max_sg_in_flight)
+           max_sg_in_flight = sg_in_flight;
+    }
+    return entry;
+}
+
+static void qemu_release_pci_dma_sg(QEMUPciDmaSgParam *this)
+{
+    QEMUPciDmaSgParam *min_entry = NULL, *entry = sg_list;
+    QEMUPciDmaSgParam *min_last = NULL, *last = NULL;
+    unsigned int min_iovcnt = -1;
+    int nr = 0, tot;
+
+    if (this->iovcnt > MAX_IOVEC_IOVCNT) {
+       qemu_free(this);
+       return;
+    }
+
+    while (entry) {
+       nr += 1;
+       if ((unsigned int)entry->iovcnt <= min_iovcnt) {
+           min_entry = entry;
+           min_last = last;
+           min_iovcnt = entry->iovcnt;
+       }
+       last = entry;
+       entry = entry->next;
+    }
+
+    assert(max_sg_in_flight > 0);
+    assert(sg_in_flight > 0);
+    tot = nr+sg_in_flight; 
+    if (tot > max_sg_in_flight || tot > MAX_IOVEC_ENTRIES) {
+       /* detail: replace even if it's equal as it's cache hot */
+       if ((unsigned int)this->iovcnt < min_iovcnt)
+           qemu_free(this);
+       else {
+           assert(nr > 0);
+           if (min_entry == sg_list) {
+               this->next = sg_list->next;
+           } else {
+               min_last->next = min_entry->next;
+               this->next = sg_list;
+           }
+           sg_list = this;
+           qemu_free(min_entry);
+       }
+    } else {
+       this->next = sg_list;
+       sg_list = this;
+    }
+    sg_in_flight -= 1;
+    assert(sg_in_flight >= 0);
+}
+#else /* DISABLE_IOVEC_CACHE */
+#define qemu_get_pci_dma_sg(iovcnt) qemu_malloc(sizeof(QEMUPciDmaSgParam)+(sizeof(struct iovec)*((iovcnt)-1)))
+#define qemu_release_pci_dma_sg(param) qemu_free(param)
+#endif /* DISABLE_IOVEC_CACHE */
+
+static int pci_dma_sg_map_direct(QEMUPciDmaSg *sg,
+                                int iovcnt,
+                                int dma_to_memory,
+                                int alignment,
+                                size_t *len,
+                                struct iovec *dma_iov)
+{
+    int idx = 0;
+    size_t _len = 0;
+
+#ifdef DEBUG_BOUNCE
+    return 0;
+#endif
+
+    for (idx = 0; idx < iovcnt; idx++) {
+       void * addr;
+
+       if (_len + sg[idx].len <= _len)
+           return 0;
+       _len += sg[idx].len;
+
+       addr = cpu_physical_memory_can_dma(sg[idx].addr,
+                                          sg[idx].len,
+                                          dma_to_memory,
+                                          alignment);
+       if (!addr)
+           return 0;
+
+       dma_iov[idx].iov_base = addr;
+       dma_iov[idx].iov_len = sg[idx].len;
+    }
+
+    *len = _len;
+    return 1;
+}
+
+static int pci_dma_sg_map_bounce(QEMUPciDmaSgParam *param)
+{
+    int idx;
+    size_t len = 0;
+
+    param->curr_restart_iovcnt = param->restart_iovcnt;
+    param->curr_restart_offset = param->restart_offset;
+
+    for (idx = param->restart_iovcnt; idx < param->iovcnt; idx++) {
+       if (len + param->sg[idx].len <= len)
+           return 0;
+       len += param->sg[idx].len - param->restart_offset;
+       param->restart_offset = 0;
+       if (len > MAX_DMA_BOUNCE_BUFFER) {
+           size_t leftover = len - MAX_DMA_BOUNCE_BUFFER;
+           param->restart_offset = param->sg[idx].len - leftover;
+           len = MAX_DMA_BOUNCE_BUFFER;
+           break;
+       }
+    }
+    param->restart_iovcnt = idx;
+    param->curr_len = len;
+
+    if (len & (param->alignment-1))
+       return 0;
+
+    param->iov.iov_len = len;
+    if (!param->bounce) {
+       param->bounce = qemu_memalign(param->alignment, len);
+       if (!param->bounce)
+           return 0;
+       param->iov.iov_base = param->bounce;
+    }
+
+    if (!param->dma_to_memory) {
+       int idx;
+       size_t offset = 0;
+       for (idx = param->curr_restart_iovcnt;
+            idx < param->iovcnt && offset < len; idx++) {
+           size_t copy_len = param->sg[idx].len - param->curr_restart_offset;
+           if (offset+copy_len > len)
+               copy_len = len;
+           cpu_physical_memory_read(param->sg[idx].addr + 
+                                    param->curr_restart_offset,
+                                    param->bounce + offset,
+                                    copy_len);
+           param->curr_restart_offset = 0;
+           offset += copy_len;
+       }
+    }
+
+    return 1;
+}
+
+static void pci_dma_sg_unmap_direct(QEMUPciDmaSgParam *param, int ret)
+{
+    if (!ret && param->dma_to_memory) {
+       int idx;
+       QEMUPciDmaSg *sg = param->sg;
+       for (idx = 0; idx < param->iovcnt; idx++)
+           cpu_physical_memory_write_post_dma(sg[idx].addr,
+                                              sg[idx].len);
+    }
+}
+
+static int pci_dma_sg_unmap_bounce(QEMUPciDmaSgParam *param, int ret)
+{
+    if (!ret && param->dma_to_memory) {
+       int idx;
+       size_t offset = 0;
+       for (idx = param->curr_restart_iovcnt;
+            idx < param->iovcnt && offset < param->curr_len; idx++) {
+           size_t copy_len = param->sg[idx].len - param->curr_restart_offset;
+           if (offset+copy_len > param->curr_len)
+               copy_len = param->curr_len;
+           cpu_physical_memory_write(param->sg[idx].addr +
+                                     param->curr_restart_offset,
+                                     param->bounce + offset,
+                                     copy_len);
+           param->curr_restart_offset = 0;
+           offset += copy_len;
+       }
+    }
+    if (param->restart_iovcnt == param->iovcnt || ret) {
+       qemu_free(param->bounce);
+       return 0;
+    }
+    return 1;
+}
+
+static void pci_dma_sg_cb(void *opaque, int ret)
+{
+    QEMUPciDmaSgParam *param = opaque;
+    int restart = 0;
+
+    if (!param->bounce)
+       pci_dma_sg_unmap_direct(param, ret);
+    else
+       restart = pci_dma_sg_unmap_bounce(param, ret);
+
+    if (restart) {
+       ret = -1000;
+       if (!pci_dma_sg_map_bounce(param)) {
+           qemu_free(param->bounce);
+           goto out_free;
+       }
+       ret = param->pci_dma_sg_submit(param->pci_dma_sg_opaque,
+                                      &param->iov, 1,
+                                      param->curr_len,
+                                      pci_dma_sg_cb,
+                                      param);
+    }
+    if (ret || !restart) {
+    out_free:
+       param->pci_dma_sg_complete(param->pci_dma_sg_opaque, ret);
+       qemu_release_pci_dma_sg(param);
+    }
+}
+
+/* PCIDevice is there in case we want to emulate an iommu later */
+void pci_dma_sg(PCIDevice *pci_dev,
+               QEMUPciDmaSg *sg, int iovcnt,
+               QEMUPciDmaSgSubmit pci_dma_sg_submit,
+               QEMUPciDmaSgComplete pci_dma_sg_complete,
+               void *pci_dma_sg_opaque,
+               int dma_to_memory, int alignment)
+{
+    int ret;
+    QEMUPciDmaSgParam *param;
+
+    ret = -2000;
+    if ((unsigned int) dma_to_memory > 1)
+       goto err;
+    if ((unsigned int) alignment > MAX_DMA_BOUNCE_BUFFER)
+       goto err;
+    if (iovcnt < 1)
+       goto err;
+
+    param = qemu_get_pci_dma_sg(iovcnt);
+    if (!param)
+       goto err;
+
+    param->pci_dma_sg_submit = pci_dma_sg_submit;
+    param->pci_dma_sg_complete = pci_dma_sg_complete;
+    param->pci_dma_sg_opaque = pci_dma_sg_opaque;
+    param->dma_to_memory = dma_to_memory;
+    param->alignment = alignment;
+    param->bounce = NULL;
+    param->sg = sg;
+    param->iovcnt = iovcnt;
+    param->restart_offset = param->restart_iovcnt = 0;
+
+    /* map the sg */
+    if (!pci_dma_sg_map_direct(sg, iovcnt,
+                              dma_to_memory, alignment,
+                              &param->curr_len, &param->iov)) {
+       ret = -2004;
+       if (!pci_dma_sg_map_bounce(param))
+           goto out_free;
+       iovcnt = 1;
+    }
+
+    /* run the I/O */
+    ret = pci_dma_sg_submit(pci_dma_sg_opaque,
+                           &param->iov, iovcnt, param->curr_len,
+                           pci_dma_sg_cb,
+                           param);
+    if (ret)
+    out_free:
+       pci_dma_sg_cb(param, ret);
+    return;
+
+ err:
+    pci_dma_sg_complete(pci_dma_sg_opaque, ret);
+    return;
+}
diff --git a/qemu/hw/pci_dma.h b/qemu/hw/pci_dma.h
new file mode 100644
index 0000000..5cc8413
--- /dev/null
+++ b/qemu/hw/pci_dma.h
@@ -0,0 +1,29 @@
+#ifndef QEMU_PCI_DMA_H
+#define QEMU_PCI_DMA_H
+
+#include "qemu-common.h"
+#include "block.h"
+#include <sys/uio.h> /* struct iovec */
+
+typedef int QEMUPciDmaSgSubmit(void *pci_dma_sg_opaque,
+                              struct iovec *iov, int iovcnt,
+                              size_t len,
+                              BlockDriverCompletionFunc dma_cb,
+                              void *dma_cb_param);
+
+typedef void QEMUPciDmaSgComplete(void *pci_dma_sg_opaque, int ret);
+
+typedef struct QEMUPciDmaSg {
+    target_phys_addr_t addr;
+    size_t len;
+} QEMUPciDmaSg;
+
+/* pci_dma.c */
+void pci_dma_sg(PCIDevice *pci_dev,
+               QEMUPciDmaSg *sg, int iovcnt,
+               QEMUPciDmaSgSubmit *pci_dma_sg_submit,
+               QEMUPciDmaSgComplete *pci_dma_sg_complete,
+               void *pci_dma_sg_opaque,
+               int dma_to_memory, int alignment);
+
+#endif



