qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [RFC 1/2] pci-dma-api-v1


From: Andrea Arcangeli
Subject: [Qemu-devel] [RFC 1/2] pci-dma-api-v1
Date: Thu, 27 Nov 2008 13:35:38 +0100

Hello everyone,

One major limitation for KVM today is the lack of a proper way to
write drivers in a way that allows the host OS to use direct DMA to
the guest physical memory to avoid any intermediate copy. The only API
provided to drivers seems to be the cpu_physical_memory_rw and that
enforces all drivers to bounce and trash cpu caches and be memory
bound. This new DMA API instead allows drivers to use a pci_dma_sg
method for SG I/O that will translate the guest physical addresses to
host virtual addresses and it will call two operations, one is a submit
method and one is the complete method. The pci_dma_sg may have to
bounce buffer internally and to limit the max bounce size it may have
to submit I/O in pieces with multiple submit calls. The patch adapts
the ide.c HD driver to use this. Once the cdrom code is converted too,
dma_buf_rw can be eliminated. As you can see the new ide_dma_submit
and ide_dma_complete code is much more readable than the previous
rearming callback.

This is only tested with KVM so far but qemu builds, in general
there's nothing kvm specific here (with the exception of a single
kvm_enabled), so it should all work well for both.

All we care about is the performance of the direct path, so I tried to
avoid dynamic allocations there to avoid entering glibc, the current
logic doesn't satisfy me yet but it should be at least faster than
calling malloc (but I'm still working on it to avoid memory waste to
detect when more than one iov should be cached). But in case of
instabilities I recommend first thing to set MAX_IOVEC_IOVCNT 0 to
disable that logic ;). I recommend to test with DEBUG_BOUNCE and with
a 512 max bounce buffer too. It's running stable in all modes so
far. However if ide.c ends up calling aio_cancel, things will likely
fall apart but this is all because of bdrv_aio_readv/writev, and the
astonishing lack of aio_readv/writev in glibc!

Once we finish fixing storage performance with a real
bdrv_aio_readv/writev (now a blocker issue), a pci_dma_single can be
added for zero copy networking (one NIC per VM, or VMDq, IOV
etc..). The DMA API should allow for that too.

Signed-off-by: Andrea Arcangeli <address@hidden>

diff --git a/qemu/hw/pci_dma.c b/qemu/hw/pci_dma.c
new file mode 100644
index 0000000..2e5f100
--- /dev/null
+++ b/qemu/hw/pci_dma.c
@@ -0,0 +1,346 @@
+/*
+ * QEMU PCI DMA operations
+ *
+ * Copyright (c) 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "pci_dma.h"
+
+#define MAX_DMA_BOUNCE_BUFFER (1024*1024)
+//#define DEBUG_BOUNCE
+//#define MAX_DMA_BOUNCE_BUFFER (512)
+
+/*
+ * Too many entries will run slower and waste memory, this is really
+ * only about the fast path so it must be small, slow path is fine to
+ * allocate dynamically. Max memory used is around MAX_IOVEC_ENTRIES *
+ * MAX_IOVEC_IOVCNT * sizeof(struct iovec).
+ */
+#define MAX_IOVEC_ENTRIES 5
+
+/*
+ * Don't cache exceptionally large iovcnt used for huge DMA transfers
+ * as the DMA transfer may take much longer than malloc
+ * and huge memory could be wasted if it happens only once in a while. 
+ */
+#define MAX_IOVEC_IOVCNT 2048
+
+/*
+ * Cache element for iovec arrays.  The iov member is the first of
+ * iovcnt contiguous struct iovec slots (the array overflows the
+ * struct, see the malloc in qemu_get_iovec()).  A small singly
+ * linked list of these avoids entering glibc's malloc on the DMA
+ * fast path.
+ */
+struct iovec_entry {
+    int iovcnt;
+    struct iovec_entry *next;
+    struct iovec iov;
+};
+
+/* head of the cached iovec list -- NOTE(review): no locking, assumes
+   single-threaded access from the I/O emulation; confirm */
+struct iovec_entry *iovec_list;
+
+/*
+ * Return an iovec array with room for at least @iovcnt entries,
+ * recycled from the cache when possible, freshly allocated otherwise.
+ * Returns NULL on allocation failure.  Pair with qemu_release_iovec().
+ */
+static struct iovec *qemu_get_iovec(int iovcnt)
+{
+    struct iovec_entry *prev = NULL, *entry = iovec_list;
+
+    while (entry) {
+       if (entry->iovcnt >= iovcnt) {
+           /*
+            * Unlink the entry before handing it out: without this the
+            * same cached iovec could be returned to two concurrent DMA
+            * operations, and qemu_release_iovec() would insert a
+            * duplicate node into the list.
+            */
+           if (prev)
+               prev->next = entry->next;
+           else
+               iovec_list = entry->next;
+           return &entry->iov;
+       }
+       prev = entry;
+       entry = entry->next;
+    }
+
+    /* iovcnt-1 because struct iovec_entry already contains one iov */
+    entry = qemu_malloc(sizeof(struct iovec_entry) +
+                       sizeof(struct iovec) * (iovcnt-1));
+    if (!entry)
+       return NULL;
+    /*
+     * Record the capacity: qemu_release_iovec() reads it to decide
+     * whether to cache or free the entry.  It was left uninitialized
+     * before, so the release path operated on garbage.
+     */
+    entry->iovcnt = iovcnt;
+
+    return &entry->iov;
+}
+
+/*
+ * Give an iovec obtained from qemu_get_iovec() back to the cache.
+ * Oversized arrays (more than MAX_IOVEC_IOVCNT entries) are freed
+ * immediately so a one-off huge transfer doesn't pin memory.
+ * Otherwise the entry is inserted at the head of the list; when the
+ * cache already holds more than MAX_IOVEC_ENTRIES entries, the
+ * smallest entry is evicted to bound total memory usage.
+ */
+static void qemu_release_iovec(struct iovec *iov)
+{
+    struct iovec_entry *this, *min_entry = NULL, *entry = iovec_list;
+    struct iovec_entry *min_last = NULL, *last = NULL;
+    unsigned int min_iovcnt = -1, nr = 0;
+
+    if (!iov)
+       return;
+
+    /* recover the containing iovec_entry from the embedded iov
+       (open-coded container_of/offsetof) */
+    this = (struct iovec_entry *)
+       (((char*)iov)-(unsigned long)(&((struct iovec_entry *)0)->iov));
+
+    if (this->iovcnt > MAX_IOVEC_IOVCNT) {
+       qemu_free(this);
+       return;
+    }
+
+    /* single pass: count the entries and find the smallest one, plus
+       its predecessor (needed to unlink it below) */
+    while (entry) {
+       nr += 1;
+       if ((unsigned int)entry->iovcnt < min_iovcnt) {
+           min_entry = entry;
+           min_last = last;
+           min_iovcnt = entry->iovcnt;
+       }
+       last = entry;
+       entry = entry->next;
+    }
+
+    if (nr > MAX_IOVEC_ENTRIES) {
+       /* detail: replace even if it's equal as it's cache hot */
+       if (this->iovcnt < min_iovcnt)
+           qemu_free(this);
+       else {
+           /* evict min_entry and insert this at the head */
+           if (min_entry == iovec_list) {
+               this->next = iovec_list->next;
+               iovec_list = this;
+           } else {
+               min_last->next = min_entry->next;
+               this->next = iovec_list;
+               iovec_list = this;
+           }
+           qemu_free(min_entry);
+       }
+    } else {
+       /* room left in the cache: plain push at the head */
+       if (!iovec_list) {
+           this->next = NULL;
+           iovec_list = this;
+       } else {
+           this->next = iovec_list;
+           iovec_list = this;
+       }
+    }
+}
+
+/*
+ * Try to map the whole sg list for zero-copy DMA: every element must
+ * be directly accessible host memory (see
+ * cpu_physical_memory_can_dma()).  On success returns an iovec array
+ * of host virtual addresses and stores the total byte count in *len;
+ * on failure returns NULL and the caller falls back to bouncing.
+ */
+static struct iovec *pci_dma_sg_map_direct(QEMUPciDmaSg *sg,
+                                          int iovcnt,
+                                          int dma_to_memory,
+                                          int alignment,
+                                          size_t *len)
+{
+    int idx = 0;
+    struct iovec *dma_iov;
+    size_t _len = 0;
+
+#ifdef DEBUG_BOUNCE
+    /* force the bounce path for testing */
+    return NULL;
+#endif
+
+    /* fixme: must not call malloc and cache them in some faster queue */
+    dma_iov = qemu_get_iovec(iovcnt);
+    if (!dma_iov)
+       goto out;
+
+    for (idx = 0; idx < iovcnt; idx++) {
+       void * addr;
+
+       /* reject wraparound of the running total (and zero-length
+          elements, which would make _len stay equal) */
+       if (_len + sg[idx].len <= _len)
+               goto err;
+       _len += sg[idx].len;
+
+       addr = cpu_physical_memory_can_dma(sg[idx].addr,
+                                          sg[idx].len,
+                                          dma_to_memory,
+                                          alignment);
+       if (!addr)
+           goto err;
+
+       dma_iov[idx].iov_base = addr;
+       dma_iov[idx].iov_len = sg[idx].len;
+    }
+
+    *len = _len;
+ out:
+    return dma_iov;
+err:
+    qemu_release_iovec(dma_iov);
+    dma_iov = NULL;
+    goto out;
+}
+
+/*
+ * Linearize (part of) the sg list into a bounce buffer of at most
+ * MAX_DMA_BOUNCE_BUFFER bytes.  If the sg list doesn't fit in one
+ * chunk, the restart_* fields record where the next call must resume
+ * and pci_dma_sg_cb() invokes us again for the next chunk.
+ *
+ * For DMA from memory (guest writes to the device) the guest data is
+ * copied into the bounce buffer here; for DMA to memory the copy back
+ * happens in pci_dma_sg_unmap_bounce() after I/O completion.
+ *
+ * Returns a single-element iovec describing the bounce buffer, or
+ * NULL on error.
+ */
+static struct iovec *pci_dma_sg_map_bounce(QEMUPciDmaSgParam *param)
+{
+    int idx;
+    size_t len = 0;
+
+    /* remember where this chunk starts for the completion copy-back */
+    param->curr_restart_iovcnt = param->restart_iovcnt;
+    param->curr_restart_offset = param->restart_offset;
+
+    for (idx = param->restart_iovcnt; idx < param->iovcnt; idx++) {
+       /* each submitted chunk must keep the requested alignment */
+       if (len & (param->alignment-1))
+           return NULL;
+       /* reject wraparound of the running total */
+       if (len + param->sg[idx].len <= len)
+           return NULL;
+       len += param->sg[idx].len - param->restart_offset;
+       param->restart_offset = 0;
+       if (len > MAX_DMA_BOUNCE_BUFFER) {
+           /* cap this chunk and remember how far into sg[idx] we got */
+           size_t leftover = len - MAX_DMA_BOUNCE_BUFFER;
+           param->restart_offset = param->sg[idx].len - leftover;
+           len = MAX_DMA_BOUNCE_BUFFER;
+           break;
+       }
+    }
+    param->restart_iovcnt = idx;
+    param->curr_len = len;
+
+    param->linearized.iov_len = len;
+    if (!param->bounce) {
+       /* allocated once: the first chunk is the largest, any restart
+          transfers at most the same amount */
+       param->bounce = qemu_memalign(param->alignment, len);
+       if (!param->bounce)
+           return NULL;
+       param->linearized.iov_base = param->bounce;
+    }
+
+    if (!param->dma_to_memory) {
+       int idx;
+       size_t offset = 0;
+       for (idx = param->curr_restart_iovcnt;
+            idx < param->iovcnt && offset < len; idx++) {
+           size_t copy_len = param->sg[idx].len - param->curr_restart_offset;
+           /* clamp to the space left in the bounce buffer; the
+              previous "copy_len = len" overran the buffer end
+              whenever offset was nonzero */
+           if (offset+copy_len > len)
+               copy_len = len - offset;
+           cpu_physical_memory_read(param->sg[idx].addr +
+                                    param->curr_restart_offset,
+                                    param->bounce + offset,
+                                    copy_len);
+           param->curr_restart_offset = 0;
+           offset += copy_len;
+       }
+    }
+
+    return &param->linearized;
+}
+
+/*
+ * Complete a zero-copy transfer.  For successful DMA to memory (the
+ * device wrote into guest RAM) run the post-DMA fixups (dirty bits,
+ * TB invalidation) on every sg element, then return the iovec to the
+ * cache.
+ */
+static void pci_dma_sg_unmap_direct(QEMUPciDmaSgParam *param, int ret)
+{
+    if (!ret && param->dma_to_memory) {
+       int idx;
+       QEMUPciDmaSg *sg = param->sg;
+       for (idx = 0; idx < param->iovcnt; idx++)
+           cpu_physical_memory_write_post_dma(sg[idx].addr,
+                                              sg[idx].len);
+    }
+    qemu_release_iovec(param->dma_iov);
+}
+
+/*
+ * Complete one bounced chunk.  For successful DMA to memory (the
+ * device read into the bounce buffer) copy the data back to the
+ * guest sg entries covered by this chunk.
+ *
+ * Returns 1 if more chunks remain and the caller must restart the
+ * I/O, 0 if the transfer is finished (or failed) and the bounce
+ * buffer has been freed.
+ *
+ * Made static: it is only called by pci_dma_sg_cb() and is not
+ * declared in pci_dma.h, matching the other helpers in this file.
+ */
+static int pci_dma_sg_unmap_bounce(QEMUPciDmaSgParam *param, int ret)
+{
+    if (!ret && param->dma_to_memory) {
+       int idx;
+       size_t offset = 0;
+       for (idx = param->curr_restart_iovcnt;
+            idx < param->iovcnt && offset < param->curr_len; idx++) {
+           size_t copy_len = param->sg[idx].len - param->curr_restart_offset;
+           /* clamp to what is left in this chunk; the previous
+              "copy_len = param->curr_len" wrote past the end of the
+              last sg element whenever offset was nonzero, corrupting
+              guest memory */
+           if (offset+copy_len > param->curr_len)
+               copy_len = param->curr_len - offset;
+           cpu_physical_memory_write(param->sg[idx].addr +
+                                     param->curr_restart_offset,
+                                     param->bounce + offset,
+                                     copy_len);
+           param->curr_restart_offset = 0;
+           offset += copy_len;
+       }
+    }
+    if (param->restart_iovcnt == param->iovcnt || ret) {
+       qemu_free(param->bounce);
+       return 0;
+    }
+    return 1;
+}
+
+/*
+ * AIO completion callback for pci_dma_sg() transfers.  Unmaps the
+ * finished chunk; if the transfer was bounced and more chunks remain,
+ * maps and submits the next chunk, otherwise reports completion to
+ * the driver and frees the transfer state.
+ */
+static void pci_dma_sg_cb(void *opaque, int ret)
+{
+    QEMUPciDmaSgParam *param = opaque;
+    int restart = 0;
+
+    /* bounce is NULL iff the transfer went through the direct path */
+    if (!param->bounce)
+       pci_dma_sg_unmap_direct(param, ret);
+    else
+       restart = pci_dma_sg_unmap_bounce(param, ret);
+
+    if (restart) {
+       ret = -1;
+       param->dma_iov = pci_dma_sg_map_bounce(param);
+       if (!param->dma_iov)
+           goto out_free;
+       /* resubmit the next chunk; on success this callback fires
+          again when it completes */
+       ret = param->pci_dma_sg_submit(param->pci_dma_sg_opaque,
+                                      param->dma_iov, 1,
+                                      param->curr_len,
+                                      pci_dma_sg_cb,
+                                      param);
+    }
+    /* note: the goto above jumps into this if-block's body, which is
+       legal C -- taken when mapping the next chunk failed */
+    if (ret || !restart) {
+    out_free:
+       param->pci_dma_sg_complete(param->pci_dma_sg_opaque, ret);
+       qemu_free(param);
+    }
+}
+
+/* PCIDevice is there in case we want to emulate an iommu later */
+/*
+ * Start an asynchronous DMA transfer described by the guest-physical
+ * sg list.  Tries zero-copy first; if any element is not directly
+ * mappable, falls back to a (possibly multi-chunk) bounce buffer.
+ * pci_dma_sg_submit starts the actual I/O; pci_dma_sg_complete is
+ * invoked exactly once with the final status.  dma_to_memory is 1
+ * for device-to-guest transfers, 0 for guest-to-device; alignment
+ * is the device's required transfer alignment in bytes.
+ */
+void pci_dma_sg(PCIDevice *pci_dev,
+               QEMUPciDmaSg *sg, int iovcnt,
+               QEMUPciDmaSgSubmit pci_dma_sg_submit,
+               QEMUPciDmaSgComplete pci_dma_sg_complete,
+               void *pci_dma_sg_opaque,
+               int dma_to_memory, int alignment)
+{
+    int ret = -1;
+    QEMUPciDmaSgParam *param;
+
+    /* parameter sanity: dma_to_memory must be 0 or 1 */
+    if ((unsigned int) dma_to_memory > 1)
+       goto err;
+    if (alignment < 0)
+       goto err;
+    if (iovcnt < 1)
+       goto err;
+
+    param = qemu_malloc(sizeof(QEMUPciDmaSgParam));
+    if (!param)
+       goto err;
+
+    param->pci_dma_sg_submit = pci_dma_sg_submit;
+    param->pci_dma_sg_complete = pci_dma_sg_complete;
+    param->pci_dma_sg_opaque = pci_dma_sg_opaque;
+    param->dma_to_memory = dma_to_memory;
+    param->alignment = alignment;
+    param->bounce = NULL;
+    param->sg = sg;
+    param->iovcnt = iovcnt;
+    param->restart_offset = param->restart_iovcnt = 0;
+
+    /* map the sg */
+    param->dma_iov = pci_dma_sg_map_direct(sg, iovcnt,
+                                          dma_to_memory, alignment,
+                                          &param->curr_len);
+    if (!param->dma_iov) {
+       /* direct mapping impossible: linearize into a bounce buffer,
+          which is always submitted as a single iovec */
+       param->dma_iov = pci_dma_sg_map_bounce(param);
+       if (!param->dma_iov)
+           goto out_free;
+       iovcnt = 1;
+    }
+
+    /* run the I/O */
+    ret = pci_dma_sg_submit(pci_dma_sg_opaque,
+                           param->dma_iov, iovcnt, param->curr_len,
+                           pci_dma_sg_cb,
+                           param);
+    /* the goto above jumps to the labelled statement below even when
+       ret is 0 -- legal C; the callback handles cleanup/completion */
+    if (ret)
+    out_free:
+       pci_dma_sg_cb(param, ret);
+    return;
+
+ err:
+    pci_dma_sg_complete(pci_dma_sg_opaque, ret);
+    return;
+}
diff --git a/qemu/hw/pci_dma.h b/qemu/hw/pci_dma.h
new file mode 100644
index 0000000..9ea606d
--- /dev/null
+++ b/qemu/hw/pci_dma.h
@@ -0,0 +1,47 @@
+#ifndef QEMU_PCI_DMA_H
+#define QEMU_PCI_DMA_H
+
+#include "qemu-common.h"
+#include "block.h"
+#include <sys/uio.h> /* struct iovec */
+
+/*
+ * Submit callback: start the actual I/O on the translated iovec.
+ * Returns 0 on success, nonzero on error.  May be invoked more than
+ * once per pci_dma_sg() call when bouncing splits the transfer into
+ * chunks; len is the byte count of the current chunk.
+ */
+typedef int QEMUPciDmaSgSubmit(void *pci_dma_sg_opaque,
+                              struct iovec *iov, int iovcnt,
+                              size_t len,
+                              BlockDriverCompletionFunc dma_cb,
+                              void *dma_cb_param);
+
+/* Completion callback: invoked exactly once with the final status. */
+typedef void QEMUPciDmaSgComplete(void *pci_dma_sg_opaque, int ret);
+
+/* one scatter/gather element, in guest physical address space */
+typedef struct QEMUPciDmaSg {
+    target_phys_addr_t addr;
+    size_t len;
+} QEMUPciDmaSg;
+
+/*
+ * Per-transfer state, allocated by pci_dma_sg() and freed by
+ * pci_dma_sg_cb() when the transfer completes.  bounce is NULL on
+ * the zero-copy path; the restart_* fields track progress through
+ * the sg list when the transfer is bounced in multiple chunks, and
+ * the curr_* fields describe the chunk currently in flight.
+ */
+typedef struct QEMUPciDmaSgParam {
+    QEMUPciDmaSgSubmit *pci_dma_sg_submit;
+    QEMUPciDmaSgComplete *pci_dma_sg_complete;
+    void *pci_dma_sg_opaque;
+    int dma_to_memory;
+    int alignment;
+    uint8_t *bounce;
+    QEMUPciDmaSg *sg;
+    int iovcnt;
+    int restart_iovcnt;
+    size_t restart_offset;
+    int curr_restart_iovcnt;
+    size_t curr_restart_offset;
+    size_t curr_len;
+    struct iovec *dma_iov;
+    struct iovec linearized;
+} QEMUPciDmaSgParam;
+
+/* pci_dma.c */
+void pci_dma_sg(PCIDevice *pci_dev,
+               QEMUPciDmaSg *sg, int iovcnt,
+               QEMUPciDmaSgSubmit *pci_dma_sg_submit,
+               QEMUPciDmaSgComplete *pci_dma_sg_complete,
+               void *pci_dma_sg_opaque,
+               int dma_to_memory, int alignment);
+
+#endif
Index: block_int.h
===================================================================
--- block_int.h (revision 5799)
+++ block_int.h (working copy)
@@ -55,6 +55,8 @@
         int64_t sector_num, const uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque);
     void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb);
+    BlockDriverAIOIOV *bdrv_aio_readv;
+    BlockDriverAIOIOV *bdrv_aio_writev;
     int aiocb_size;
 
     const char *protocol_name;
Index: Makefile.target
===================================================================
--- Makefile.target     (revision 5799)
+++ Makefile.target     (working copy)
@@ -659,7 +659,7 @@
 
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
-OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
+OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o pci_dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o
@@ -668,7 +668,7 @@
 ifeq ($(TARGET_BASE_ARCH), ppc)
 CPPFLAGS += -DHAS_AUDIO -DHAS_AUDIO_CHOICE
 # shared objects
-OBJS+= ppc.o ide.o vga.o $(SOUND_HW) dma.o openpic.o
+OBJS+= ppc.o ide.o vga.o $(SOUND_HW) dma.o openpic.o pci_dma.o
 # PREP target
 OBJS+= pckbd.o ps2.o serial.o i8259.o i8254.o fdc.o m48t59.o mc146818rtc.o
 OBJS+= prep_pci.o ppc_prep.o
@@ -686,7 +686,7 @@
 OBJS+= mips_timer.o mips_int.o dma.o vga.o serial.o i8254.o i8259.o rc4030.o
 OBJS+= g364fb.o jazz_led.o
 OBJS+= ide.o gt64xxx.o pckbd.o ps2.o fdc.o mc146818rtc.o usb-uhci.o acpi.o ds1225y.o
-OBJS+= piix_pci.o parallel.o cirrus_vga.o pcspk.o $(SOUND_HW)
+OBJS+= piix_pci.o parallel.o cirrus_vga.o pcspk.o pci_dma.o $(SOUND_HW)
 OBJS+= mipsnet.o
 OBJS+= pflash_cfi01.o
 CPPFLAGS += -DHAS_AUDIO -DHAS_AUDIO_CHOICE
@@ -710,7 +710,7 @@
 else
 OBJS+= sun4m.o tcx.o pcnet.o iommu.o m48t59.o slavio_intctl.o
 OBJS+= slavio_timer.o slavio_serial.o slavio_misc.o fdc.o sparc32_dma.o
-OBJS+= cs4231.o ptimer.o eccmemctl.o sbi.o sun4c_intctl.o
+OBJS+= cs4231.o ptimer.o eccmemctl.o sbi.o sun4c_intctl.o pci_dma.o
 endif
 endif
 ifeq ($(TARGET_BASE_ARCH), arm)
@@ -731,7 +731,7 @@
 OBJS+= nseries.o blizzard.o onenand.o vga.o cbus.o tusb6010.o usb-musb.o
 OBJS+= tsc2005.o bt-hci-csr.o
 OBJS+= mst_fpga.o mainstone.o
-OBJS+= musicpal.o pflash_cfi02.o
+OBJS+= musicpal.o pflash_cfi02.o pci_dma.o
 CPPFLAGS += -DHAS_AUDIO
 endif
 ifeq ($(TARGET_BASE_ARCH), sh4)
Index: exec.c
===================================================================
--- exec.c      (revision 5799)
+++ exec.c      (working copy)
@@ -2807,7 +2807,7 @@
 /* physical memory access (slow version, mainly for debug) */
 #if defined(CONFIG_USER_ONLY)
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write)
+                            size_t len, int is_write)
 {
     int l, flags;
     target_ulong page;
@@ -2848,7 +2848,7 @@
 
 #else
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write)
+                            size_t len, int is_write)
 {
     int l, io_index;
     uint8_t *ptr;
@@ -2938,6 +2938,111 @@
     }
 }
 
+/*
+ * Check whether [addr, addr+len) can be accessed directly by host
+ * DMA: the whole range must be RAM for writes (RAM or ROM for
+ * reads), physically contiguous, and both len and the start address
+ * must satisfy the requested alignment.  Returns the host virtual
+ * address of the start of the range, or NULL when the caller must
+ * fall back to a bounce buffer.
+ */
+uint8_t *cpu_physical_memory_can_dma(target_phys_addr_t addr,
+                                    size_t len, int is_write,
+                                    int alignment)
+{
+    int l, first = 1;
+    uint8_t *ptr = NULL;
+    target_phys_addr_t page;
+    unsigned long pd, pd_first = 0;
+    PhysPageDesc *p;
+
+    /* misaligned length: return NULL (ptr is still NULL here) */
+    if (len & (alignment-1))
+       goto out;
+
+    while (len > 0) {
+       page = addr & TARGET_PAGE_MASK;
+       p = phys_page_find(page >> TARGET_PAGE_BITS);
+
+       /* bytes covered by this iteration: up to the end of the page */
+       l = (page + TARGET_PAGE_SIZE) - addr;
+        if (l > len)
+            l = len;
+
+        if (!p)
+            pd = IO_MEM_UNASSIGNED;
+        else
+            pd = p->phys_offset;
+
+        if (is_write) {
+            if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM)
+               return NULL;
+        } else {
+            if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM &&
+                !(pd & IO_MEM_ROMD))
+               return NULL;
+        }
+
+       if (first) {
+           first = 0;
+           ptr = phys_ram_base + (pd & TARGET_PAGE_MASK) +
+               (addr & ~TARGET_PAGE_MASK);
+           if ((unsigned long)ptr & (alignment-1))
+               return NULL;
+           pd_first = pd;
+       }
+
+       /* nonlinear range */
+       /* pd_first advances one page per iteration; any page whose
+          phys_offset doesn't follow on contiguously fails here */
+       if (pd_first != pd)
+           return NULL;
+       pd_first += TARGET_PAGE_SIZE;
+
+        len -= l;
+        addr += l;
+    }
+
+out:
+    return ptr;
+}
+
+/*
+ * Post-DMA fixups after the host wrote directly into guest RAM:
+ * invalidate translated blocks overlapping the range, set the dirty
+ * bits, and flush the instruction cache when running under kvm
+ * (which executes guest code natively).
+ */
+void cpu_physical_memory_write_post_dma(target_phys_addr_t addr,
+                                       size_t len)
+{
+    int l;
+    uint8_t *ptr;
+    target_phys_addr_t page;
+    unsigned long pd;
+    PhysPageDesc *p;
+
+    while (len > 0) {
+        page = addr & TARGET_PAGE_MASK;
+
+        l = (page + TARGET_PAGE_SIZE) - addr;
+        if (l > len)
+            l = len;
+
+        p = phys_page_find(page >> TARGET_PAGE_BITS);
+        if (!p)
+            pd = IO_MEM_UNASSIGNED;
+        else
+            pd = p->phys_offset;
+
+       if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
+           /* cpu_physical_memory_can_dma() validated the range as RAM
+              before the DMA started, so this can only trigger if the
+              memory layout changed under us.  Report and skip the
+              page: the previous "continue" here looped forever
+              because len was never decremented. */
+           printf("ERROR cpu_physical_memory_post_dma: memory layout changed\n");
+       } else {
+           unsigned long addr1;
+           addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
+           /* RAM case */
+           ptr = phys_ram_base + addr1;
+           if (!cpu_physical_memory_is_dirty(addr1)) {
+               /* invalidate code */
+               tb_invalidate_phys_page_range(addr1, addr1 + l, 0);
+               /* set dirty bit */
+               phys_ram_dirty[addr1 >> TARGET_PAGE_BITS] |=
+                   (0xff & ~CODE_DIRTY_FLAG);
+           }
+           /* qemu doesn't execute guest code directly, but kvm does,
+              therefore flush instruction caches */
+           if (kvm_enabled())
+               flush_icache_range((unsigned long)ptr,
+                                  ((unsigned long)ptr)+l);
+        }
+        len -= l;
+        addr += l;
+    }
+}
+
 /* used for ROM loading : can write in RAM and ROM */
 void cpu_physical_memory_write_rom(target_phys_addr_t addr,
                                    const uint8_t *buf, int len)
Index: block.c
===================================================================
--- block.c     (revision 5799)
+++ block.c     (working copy)
@@ -1291,7 +1307,51 @@
     drv->bdrv_aio_cancel(acb);
 }
 
+/*
+ * Asynchronous vectored read.  len is the total byte count described
+ * by the iovec; sector_num is the starting sector.  Returns NULL if
+ * the driver is missing or does not implement vectored AIO.
+ */
+BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+                                struct iovec *iov, int iovcnt, size_t len,
+                                BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+
+    /* not every driver implements the vectored entry points yet:
+       the unchecked call would dereference a NULL function pointer */
+    if (!drv || !drv->bdrv_aio_readv)
+        return NULL;
+
+    ret = drv->bdrv_aio_readv(bs, sector_num, iov, iovcnt, len, cb, opaque);
+
+    if (ret) {
+       /* Update stats even though technically transfer has not happened. */
+       bs->rd_bytes += (unsigned) len;
+       bs->rd_ops ++;
+    }
+
+    return ret;
+}
+
+/*
+ * Asynchronous vectored write.  len is the total byte count described
+ * by the iovec; sector_num is the starting sector.  Returns NULL if
+ * the driver is missing, does not implement vectored AIO, or the
+ * device is read-only (matching bdrv_aio_write()).
+ */
+BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+                                 struct iovec *iov, int iovcnt, size_t len,
+                                 BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+
+    /* guard the function pointer: not every driver implements the
+       vectored entry points yet */
+    if (!drv || !drv->bdrv_aio_writev)
+        return NULL;
+    if (bs->read_only)
+        return NULL;
+
+    ret = drv->bdrv_aio_writev(bs, sector_num, iov, iovcnt, len, cb, opaque);
+
+    if (ret) {
+       /* Update stats even though technically transfer has not happened.
+          Fixed copy-paste bug: this is a write, so account wr_*, not
+          rd_* as before. */
+       bs->wr_bytes += (unsigned) len;
+       bs->wr_ops ++;
+    }
+
+    return ret;
+}
+
+
 /**************************************************************/
 /* async block device emulation */
 
Index: block.h
===================================================================
--- block.h     (revision 5799)
+++ block.h     (working copy)
@@ -83,6 +83,13 @@
 /* async block I/O */
 typedef struct BlockDriverAIOCB BlockDriverAIOCB;
 typedef void BlockDriverCompletionFunc(void *opaque, int ret);
+typedef BlockDriverAIOCB *BlockDriverAIOIOV(BlockDriverState *bs,
+                                           int64_t sector_num,
+                                           struct iovec *iov,
+                                           int iovnct,
+                                           size_t len,
+                                           BlockDriverCompletionFunc *cb,
+                                           void *opaque);
 
 BlockDriverAIOCB *bdrv_aio_read(BlockDriverState *bs, int64_t sector_num,
                                 uint8_t *buf, int nb_sectors,
@@ -91,6 +98,12 @@
                                  const uint8_t *buf, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque);
 void bdrv_aio_cancel(BlockDriverAIOCB *acb);
+BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+                                struct iovec *iov, int iovnct, size_t len,
+                                BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+                                 struct iovec *iov, int iovnct, size_t len,
+                                 BlockDriverCompletionFunc *cb, void *opaque);
 
 int qemu_key_check(BlockDriverState *bs, const char *name);
 
Index: hw/ide.c
===================================================================
--- hw/ide.c    (revision 5799)
+++ hw/ide.c    (working copy)
@@ -31,6 +31,7 @@
 #include "qemu-timer.h"
 #include "sysemu.h"
 #include "ppc_mac.h"
+#include "pci_dma.h"
 
 /* debug IDE devices */
 //#define DEBUG_IDE
@@ -480,12 +481,14 @@
     struct PCIIDEState *pci_dev;
     /* current transfer state */
     uint32_t cur_addr;
-    uint32_t cur_prd_last;
-    uint32_t cur_prd_addr;
-    uint32_t cur_prd_len;
+    uint32_t cur_prd_last; /* fixme delete */
+    uint32_t cur_prd_addr; /* fixme delete */
+    uint32_t cur_prd_len; /* fixme delete */
     IDEState *ide_if;
     BlockDriverCompletionFunc *dma_cb;
     BlockDriverAIOCB *aiocb;
+    QEMUPciDmaSg sg[IDE_DMA_BUF_SECTORS];
+    BlockDriverAIOIOV *bdrv_aio_iov;
 } BMDMAState;
 
 typedef struct PCIIDEState {
@@ -870,7 +873,7 @@
 }
 
 /* return 0 if buffer completed */
-static int dma_buf_rw(BMDMAState *bm, int is_write)
+static int dma_buf_rw(BMDMAState *bm, int is_write) /* fixme delete */
 {
     IDEState *s = bm->ide_if;
     struct {
@@ -917,61 +920,80 @@
     return 1;
 }
 
-static void ide_read_dma_cb(void *opaque, int ret)
+static int build_dma_sg(BMDMAState *bm)
 {
-    BMDMAState *bm = opaque;
-    IDEState *s = bm->ide_if;
-    int n;
-    int64_t sector_num;
+    struct {
+        uint32_t addr;
+        uint32_t size;
+    } prd;
+    int len;
+    int idx;
 
-    if (ret < 0) {
-       ide_dma_error(s);
-       return;
+    for (idx = 1; idx <= IDE_DMA_BUF_SECTORS; idx++) {
+       cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
+       bm->cur_addr += 8;
+       bm->sg[idx-1].addr = le32_to_cpu(prd.addr);
+       prd.size = le32_to_cpu(prd.size);
+       len = prd.size & 0xfffe;
+       if (len == 0)
+           len = 0x10000;
+       bm->sg[idx-1].len = len;
+       /* end of table (with a fail safe of one page) */
+       if ((prd.size & 0x80000000) ||
+           (bm->cur_addr - bm->addr) >= 4096)
+           break;
     }
+    return idx;
+}
 
-    n = s->io_buffer_size >> 9;
-    sector_num = ide_get_sector(s);
-    if (n > 0) {
-        sector_num += n;
-        ide_set_sector(s, sector_num);
-        s->nsector -= n;
-        if (dma_buf_rw(bm, 1) == 0)
-            goto eot;
-    }
+static void ide_dma_complete(void *opaque, int ret)
+{
+    BMDMAState *bm = opaque;
+    IDEState *s = bm->ide_if;
 
+    bm->bdrv_aio_iov = NULL;
+    bm->dma_cb = NULL;
+    bm->ide_if = NULL;
+    bm->aiocb = NULL;
     /* end of transfer ? */
-    if (s->nsector == 0) {
+    if (s->nsector == 0 && !ret) {
         s->status = READY_STAT | SEEK_STAT;
         ide_set_irq(s);
-    eot:
         bm->status &= ~BM_STATUS_DMAING;
         bm->status |= BM_STATUS_INT;
-        bm->dma_cb = NULL;
-        bm->ide_if = NULL;
-        bm->aiocb = NULL;
-        return;
+    } else {
+       ide_dma_error(s);
+       printf("ide_dma_complete error: nsector %d err %d\n", s->nsector, ret);
     }
+}
 
-    /* launch next transfer */
-    n = s->nsector;
-    if (n > IDE_DMA_BUF_SECTORS)
-        n = IDE_DMA_BUF_SECTORS;
-    s->io_buffer_index = 0;
-    s->io_buffer_size = n * 512;
+static int ide_dma_submit(void *opaque, struct iovec *dma_iov,
+                         int iovcnt, size_t len,
+                         BlockDriverCompletionFunc dma_cb,
+                         void *dma_cb_param)
+{
+    BMDMAState *bm = opaque;
+    IDEState *s = bm->ide_if;
+    uint32_t sectors;
+    int64_t sector_num;
+
+    sectors = len >> 9;
+    if (s->nsector < sectors || !s->nsector)
+       return -1;
+
+    sector_num = ide_get_sector(s);
+    ide_set_sector(s, sector_num  + sectors);
+    s->nsector -= sectors;
+
 #ifdef DEBUG_AIO
-    printf("aio_read: sector_num=%lld n=%d\n", sector_num, n);
+    printf("aio_write: sector_num=%lld n=%d\n", sector_num, sectors);
 #endif
-    bm->aiocb = bdrv_aio_read(s->bs, sector_num, s->io_buffer, n,
-                              ide_read_dma_cb, bm);
-    ide_dma_submit_check(s, ide_read_dma_cb, bm);
-}
+    bm->aiocb = bm->bdrv_aio_iov(s->bs, sector_num, dma_iov, iovcnt, len,
+                                dma_cb, dma_cb_param);
+    if (!bm->aiocb)
+       return -1;
 
-static void ide_sector_read_dma(IDEState *s)
-{
-    s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
-    s->io_buffer_index = 0;
-    s->io_buffer_size = 0;
-    ide_dma_start(s, ide_read_dma_cb);
+    return 0;
 }
 
 static void ide_sector_write_timer_cb(void *opaque)
@@ -1028,64 +1050,29 @@
     }
 }
 
-static void ide_write_dma_cb(void *opaque, int ret)
+static void ide_sector_dma(IDEState *s, BlockDriverAIOIOV *bdrv_aio_iov,
+                          int dma_to_memory)
 {
-    BMDMAState *bm = opaque;
-    IDEState *s = bm->ide_if;
-    int n;
-    int64_t sector_num;
+    int iovcnt;
+    BMDMAState *bm = s->bmdma;
+    if(!bm)
+       goto err;
 
-    if (ret < 0) {
-       ide_dma_error(s);
-       return;
-    }
+    s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
+    bm->ide_if = s;
+    iovcnt = build_dma_sg(s->bmdma);
+    bm->bdrv_aio_iov = bdrv_aio_iov;
+    if (iovcnt > IDE_DMA_BUF_SECTORS)
+       goto err;
+    pci_dma_sg((PCIDevice *)bm->pci_dev, bm->sg, iovcnt,
+              ide_dma_submit, ide_dma_complete, bm,
+              dma_to_memory, 512);
+    return;
 
-    n = s->io_buffer_size >> 9;
-    sector_num = ide_get_sector(s);
-    if (n > 0) {
-        sector_num += n;
-        ide_set_sector(s, sector_num);
-        s->nsector -= n;
-    }
-
-    /* end of transfer ? */
-    if (s->nsector == 0) {
-        s->status = READY_STAT | SEEK_STAT;
-        ide_set_irq(s);
-    eot:
-        bm->status &= ~BM_STATUS_DMAING;
-        bm->status |= BM_STATUS_INT;
-        bm->dma_cb = NULL;
-        bm->ide_if = NULL;
-        bm->aiocb = NULL;
-        return;
-    }
-
-    /* launch next transfer */
-    n = s->nsector;
-    if (n > IDE_DMA_BUF_SECTORS)
-        n = IDE_DMA_BUF_SECTORS;
-    s->io_buffer_index = 0;
-    s->io_buffer_size = n * 512;
-
-    if (dma_buf_rw(bm, 0) == 0)
-        goto eot;
-#ifdef DEBUG_AIO
-    printf("aio_write: sector_num=%lld n=%d\n", sector_num, n);
-#endif
-    bm->aiocb = bdrv_aio_write(s->bs, sector_num, s->io_buffer, n,
-                               ide_write_dma_cb, bm);
-    ide_dma_submit_check(s, ide_write_dma_cb, bm);
+err:
+    ide_dma_error(s);
 }
 
-static void ide_sector_write_dma(IDEState *s)
-{
-    s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
-    s->io_buffer_index = 0;
-    s->io_buffer_size = 0;
-    ide_dma_start(s, ide_write_dma_cb);
-}
-
 static void ide_atapi_cmd_ok(IDEState *s)
 {
     s->error = 0;
@@ -2219,7 +2206,7 @@
             if (!s->bs)
                 goto abort_cmd;
            ide_cmd_lba48_transform(s, lba48);
-            ide_sector_read_dma(s);
+            ide_sector_dma(s, bdrv_aio_readv, 1);
             break;
        case WIN_WRITEDMA_EXT:
            lba48 = 1;
@@ -2228,7 +2215,7 @@
             if (!s->bs)
                 goto abort_cmd;
            ide_cmd_lba48_transform(s, lba48);
-            ide_sector_write_dma(s);
+            ide_sector_dma(s, bdrv_aio_writev, 0);
             s->media_changed = 1;
             break;
         case WIN_READ_NATIVE_MAX_EXT:
Index: cpu-all.h
===================================================================
--- cpu-all.h   (revision 5799)
+++ cpu-all.h   (working copy)
@@ -891,14 +891,19 @@
 CPUReadMemoryFunc **cpu_get_io_memory_read(int io_index);
 
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write);
+                            size_t len, int is_write);
+void cpu_physical_memory_write_post_dma(target_phys_addr_t addr,
+                                       size_t len);
+uint8_t *cpu_physical_memory_can_dma(target_phys_addr_t addr,
+                                    size_t len, int is_write,
+                                    int alignment);
 static inline void cpu_physical_memory_read(target_phys_addr_t addr,
-                                            uint8_t *buf, int len)
+                                            uint8_t *buf, size_t len)
 {
     cpu_physical_memory_rw(addr, buf, len, 0);
 }
 static inline void cpu_physical_memory_write(target_phys_addr_t addr,
-                                             const uint8_t *buf, int len)
+                                             const uint8_t *buf, size_t len)
 {
     cpu_physical_memory_rw(addr, (uint8_t *)buf, len, 1);
 }




reply via email to

[Prev in Thread] Current Thread [Next in Thread]