qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH 4 of 5] dma api


From: Andrea Arcangeli
Subject: [Qemu-devel] [PATCH 4 of 5] dma api
Date: Fri, 12 Dec 2008 19:16:46 +0100

From: Andrea Arcangeli <address@hidden>

One major limitation for KVM today is the lack of a proper way to write drivers
in a way that allows the host OS to use direct DMA to the guest physical memory
to avoid any intermediate copy. The only API provided to drivers seems to be
the cpu_physical_memory_rw and that enforces all drivers to bounce and thrash
cpu caches and be memory bound. This new DMA API instead allows drivers to use
a pci_dma_sg method for SG I/O that will translate the guest physical addresses
to host virtual addresses and it will call two operations, one is a submit
method and one is the complete method. The pci_dma_sg may have to bounce buffer
internally and to limit the max bounce size it may have to submit I/O in pieces
with multiple submit calls.

All we care about is the performance of the direct path, so I tried to
avoid dynamic allocations there to avoid entering glibc.

Signed-off-by: Andrea Arcangeli <address@hidden>
---

diff --git a/Makefile.target b/Makefile.target
--- a/Makefile.target
+++ b/Makefile.target
@@ -629,7 +629,7 @@ OBJS += e1000.o
 
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
-OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) isa_dma.o
+OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) isa_dma.o dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o
@@ -640,7 +640,7 @@ ifeq ($(TARGET_BASE_ARCH), ppc)
 ifeq ($(TARGET_BASE_ARCH), ppc)
 CPPFLAGS += -DHAS_AUDIO -DHAS_AUDIO_CHOICE
 # shared objects
-OBJS+= ppc.o ide.o vga.o $(SOUND_HW) isa_dma.o openpic.o
+OBJS+= ppc.o ide.o vga.o $(SOUND_HW) isa_dma.o openpic.o dma.o
 # PREP target
 OBJS+= pckbd.o ps2.o serial.o i8259.o i8254.o fdc.o m48t59.o mc146818rtc.o
 OBJS+= prep_pci.o ppc_prep.o
@@ -658,7 +658,7 @@ ifeq ($(TARGET_BASE_ARCH), mips)
 ifeq ($(TARGET_BASE_ARCH), mips)
 OBJS+= mips_r4k.o mips_jazz.o mips_malta.o mips_mipssim.o
 OBJS+= mips_timer.o mips_int.o isa_dma.o vga.o serial.o i8254.o i8259.o 
rc4030.o
-OBJS+= g364fb.o jazz_led.o
+OBJS+= g364fb.o jazz_led.o dma.o
 OBJS+= ide.o gt64xxx.o pckbd.o ps2.o fdc.o mc146818rtc.o usb-uhci.o acpi.o 
ds1225y.o
 OBJS+= piix_pci.o parallel.o cirrus_vga.o pcspk.o $(SOUND_HW)
 OBJS+= mipsnet.o
@@ -667,7 +667,7 @@ endif
 endif
 ifeq ($(TARGET_BASE_ARCH), cris)
 OBJS+= etraxfs.o
-OBJS+= etraxfs_dma.o
+OBJS+= etraxfs_dma.o dma.o
 OBJS+= etraxfs_pic.o
 OBJS+= etraxfs_eth.o
 OBJS+= etraxfs_timer.o
@@ -678,13 +678,13 @@ endif
 endif
 ifeq ($(TARGET_BASE_ARCH), sparc)
 ifeq ($(TARGET_ARCH), sparc64)
-OBJS+= sun4u.o ide.o pckbd.o ps2.o vga.o apb_pci.o
+OBJS+= sun4u.o ide.o pckbd.o ps2.o vga.o apb_pci.o dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o m48t59.o
 OBJS+= cirrus_vga.o parallel.o ptimer.o
 else
 OBJS+= sun4m.o tcx.o pcnet.o iommu.o m48t59.o slavio_intctl.o
 OBJS+= slavio_timer.o slavio_serial.o slavio_misc.o fdc.o sparc32_dma.o
-OBJS+= cs4231.o ptimer.o eccmemctl.o sbi.o sun4c_intctl.o
+OBJS+= cs4231.o ptimer.o eccmemctl.o sbi.o sun4c_intctl.o dma.o
 endif
 endif
 ifeq ($(TARGET_BASE_ARCH), arm)
@@ -700,7 +700,7 @@ OBJS+= pflash_cfi01.o gumstix.o
 OBJS+= pflash_cfi01.o gumstix.o
 OBJS+= zaurus.o ide.o serial.o nand.o ecc.o spitz.o tosa.o tc6393xb.o
 OBJS+= omap1.o omap_lcdc.o omap_dma.o omap_clk.o omap_mmc.o omap_i2c.o
-OBJS+= omap2.o omap_dss.o soc_dma.o
+OBJS+= omap2.o omap_dss.o soc_dma.o dma.o
 OBJS+= palm.o tsc210x.o
 OBJS+= nseries.o blizzard.o onenand.o vga.o cbus.o tusb6010.o usb-musb.o
 OBJS+= tsc2005.o bt-hci-csr.o
@@ -711,11 +711,11 @@ ifeq ($(TARGET_BASE_ARCH), sh4)
 ifeq ($(TARGET_BASE_ARCH), sh4)
 OBJS+= shix.o r2d.o sh7750.o sh7750_regnames.o tc58128.o
 OBJS+= sh_timer.o ptimer.o sh_serial.o sh_intc.o sh_pci.o sm501.o serial.o
-OBJS+= ide.o
+OBJS+= ide.o dma.o
 endif
 ifeq ($(TARGET_BASE_ARCH), m68k)
 OBJS+= an5206.o mcf5206.o ptimer.o mcf_uart.o mcf_intc.o mcf5208.o mcf_fec.o
-OBJS+= m68k-semi.o dummy_m68k.o
+OBJS+= m68k-semi.o dummy_m68k.o dma.o
 endif
 ifdef CONFIG_GDBSTUB
 OBJS+=gdbstub.o gdbstub-xml.o
diff --git a/block.h b/block.h
--- a/block.h
+++ b/block.h
@@ -2,6 +2,7 @@
 #define BLOCK_H
 
 #include "qemu-aio.h"
+#include <sys/uio.h> /* struct iovec */
 
 /* block.c */
 typedef struct BlockDriver BlockDriver;
diff --git a/hw/dma.c b/hw/dma.c
new file mode 100644
--- /dev/null
+++ b/hw/dma.c
@@ -0,0 +1,366 @@
+/*
+ * QEMU PCI DMA operations
+ *
+ * Copyright (c) 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include "dma.h"
+
+#define MAX_DMA_BOUNCE_BUFFER (1024*1024)
+//#define DEBUG_BOUNCE
+//#define MAX_DMA_BOUNCE_BUFFER (512)
+//#define DISABLE_IOVEC_CACHE
+
+/*
+ * Per-request state for one pci_dma_sg() operation.  Allocated with
+ * enough trailing space that "iov" really is an array of "iovcnt"
+ * struct iovec entries (pre-C99 flexible-array idiom, hence the
+ * "(iovcnt-1)" in the allocation sites).
+ */
+typedef struct QEMUPciDmaSgParam {
+    QEMUPciDmaSgSubmit *pci_dma_sg_submit;     /* driver hook that starts the I/O */
+    QEMUPciDmaSgComplete *pci_dma_sg_complete; /* driver hook run at completion */
+    void *pci_dma_sg_opaque;  /* opaque cookie passed to both hooks */
+    int dma_to_memory;        /* 1 = device writes guest RAM, 0 = device reads it */
+    int alignment;            /* required buffer alignment (power of two) */
+    uint8_t *bounce;          /* bounce buffer; NULL while on the zerocopy path */
+    QEMUPciDmaSg *sg;         /* caller's guest-physical scatter-gather list */
+    int iovcnt;               /* number of entries in sg[] (and iov capacity) */
+    int restart_iovcnt;       /* sg index where the NEXT bounced chunk resumes */
+    size_t restart_offset;    /* byte offset into sg[restart_iovcnt] for resume */
+    int curr_restart_iovcnt;     /* sg index where the CURRENT chunk started */
+    size_t curr_restart_offset;  /* offset where the current chunk started */
+    size_t curr_len;          /* byte length of the currently submitted chunk */
+#ifndef DISABLE_IOVEC_CACHE
+    struct QEMUPciDmaSgParam *next;  /* link in the free-entry cache (sg_list) */
+#endif
+    struct iovec iov;         /* must stay last: grows to iovcnt entries */
+} QEMUPciDmaSgParam;
+
+#ifndef DISABLE_IOVEC_CACHE
+/*
+ * Too many entries will run slower and waste memory, this is really
+ * only about the fast path so it must be small, slow path is fine to
+ * allocate dynamically. Max memory used is slightly more than
+ * MAX_IOVEC_ENTRIES * MAX_IOVEC_IOVCNT * sizeof(struct iovec).
+ */
+#define MAX_IOVEC_ENTRIES 10
+
+/*
+ * Don't cache exceptionally large iovcnt used for huge DMA transfers
+ * as the DMA transfer may take much longer than malloc and huge
+ * memory could be wasted if it happens only once in a while.
+ */
+#define MAX_IOVEC_IOVCNT 2048
+
+static QEMUPciDmaSgParam *sg_list;
+static long max_sg_in_flight, sg_in_flight;
+
+/*
+ * Return a QEMUPciDmaSgParam with room for at least "iovcnt" iovec
+ * entries, preferably recycled from the sg_list cache so the fast path
+ * avoids entering glibc.  Returns NULL only on allocation failure.
+ */
+static QEMUPciDmaSgParam *qemu_get_pci_dma_sg(int iovcnt)
+{
+    QEMUPciDmaSgParam *entry = sg_list, *last;
+
+    /* first-fit scan: any cached entry with capacity >= iovcnt will do */
+    while (entry) {
+        if (entry->iovcnt >= iovcnt) {
+            /* unlink it; "last" is always set before use here because
+               entry != sg_list implies at least one prior iteration */
+            if (entry == sg_list)
+                sg_list = sg_list->next;
+            else
+                last->next = entry->next;
+            goto found;
+        }
+        last = entry;
+        entry = entry->next;
+    }
+
+    /* cache miss: "iov" itself provides one entry, so allocate only
+       iovcnt-1 extra iovecs after the struct */
+    entry = qemu_malloc(sizeof(QEMUPciDmaSgParam) +
+                        sizeof(struct iovec) * (iovcnt-1));
+    if (!entry)
+        return NULL;
+
+    /* oversized requests bypass the in-flight accounting entirely:
+       qemu_release_pci_dma_sg() frees them instead of caching them */
+    if (iovcnt <= MAX_IOVEC_IOVCNT) {
+    found:
+        sg_in_flight += 1;
+        if (sg_in_flight > max_sg_in_flight)
+            max_sg_in_flight = sg_in_flight;
+    }
+    return entry;
+}
+
+/*
+ * Give "this" back after completion: either park it in the sg_list
+ * cache or free it.  When the cache would exceed its budget (the
+ * observed maximum of concurrently in-flight requests, hard-capped by
+ * MAX_IOVEC_ENTRIES), the smallest-capacity entry is evicted so the
+ * cache keeps the largest buffers.
+ */
+static void qemu_release_pci_dma_sg(QEMUPciDmaSgParam *this)
+{
+    QEMUPciDmaSgParam *min_entry = NULL, *entry = sg_list;
+    QEMUPciDmaSgParam *min_last = NULL, *last = NULL;
+    unsigned int min_iovcnt = -1;  /* UINT_MAX: any entry compares <= */
+    int nr = 0, tot;
+
+    /* huge transfers are never cached, see MAX_IOVEC_IOVCNT comment */
+    if (this->iovcnt > MAX_IOVEC_IOVCNT) {
+        qemu_free(this);
+        return;
+    }
+
+    /* count cached entries and remember the smallest one (and its
+       predecessor, for unlinking) */
+    while (entry) {
+        nr += 1;
+        if ((unsigned int)entry->iovcnt <= min_iovcnt) {
+            min_entry = entry;
+            min_last = last;
+            min_iovcnt = entry->iovcnt;
+        }
+        last = entry;
+        entry = entry->next;
+    }
+
+    assert(max_sg_in_flight > 0);
+    assert(sg_in_flight > 0);
+    tot = nr+sg_in_flight; 
+    if (tot > max_sg_in_flight || tot > MAX_IOVEC_ENTRIES) {
+        /* detail: replace even if it's equal as it's cache hot */
+        if ((unsigned int)this->iovcnt < min_iovcnt)
+            qemu_free(this);
+        else {
+            /* cache "this" at the head and evict the smallest entry */
+            assert(nr > 0);
+            if (min_entry == sg_list) {
+                this->next = sg_list->next;
+            } else {
+                min_last->next = min_entry->next;
+                this->next = sg_list;
+            }
+            sg_list = this;
+            qemu_free(min_entry);
+        }
+    } else {
+        this->next = sg_list;
+        sg_list = this;
+    }
+    sg_in_flight -= 1;
+    assert(sg_in_flight >= 0);
+}
+#else /* DISABLE_IOVEC_CACHE */
+/* no cache: plain allocation/free for every request.  The #define was
+   split across two lines by mail wrapping; a multi-line macro needs a
+   backslash continuation to stay valid. */
+#define qemu_get_pci_dma_sg(iovcnt) \
+    qemu_malloc(sizeof(QEMUPciDmaSgParam)+(sizeof(struct iovec)*((iovcnt)-1)))
+#define qemu_release_pci_dma_sg(param) qemu_free(param)
+#endif /* DISABLE_IOVEC_CACHE */
+
+/*
+ * Try the zerocopy path: translate every sg entry to a host virtual
+ * address and fill dma_iov (which must have at least iovcnt entries,
+ * see the trailing-iovec layout of QEMUPciDmaSgParam).  Returns 1 and
+ * sets *len on success; returns 0 if any entry cannot be mapped
+ * directly, in which case the caller falls back to bouncing.
+ */
+static int pci_dma_sg_map_direct(QEMUPciDmaSg *sg,
+                                 int iovcnt,
+                                 int dma_to_memory,
+                                 int alignment,
+                                 size_t *len,
+                                 struct iovec *dma_iov)
+{
+    int idx = 0;
+    size_t _len = 0;
+
+#ifdef DEBUG_BOUNCE
+    /* force the bounce path for testing */
+    return 0;
+#endif
+
+    for (idx = 0; idx < iovcnt; idx++) {
+        void * addr;
+
+        /* reject total-length wraparound (also rejects zero-length entries) */
+        if (_len + sg[idx].len <= _len)
+            return 0;
+        _len += sg[idx].len;
+
+        addr = cpu_physical_memory_can_dma(sg[idx].addr,
+                                           sg[idx].len,
+                                           dma_to_memory,
+                                           alignment);
+        if (!addr)
+            return 0;
+
+        dma_iov[idx].iov_base = addr;
+        dma_iov[idx].iov_len = sg[idx].len;
+    }
+
+    *len = _len;
+    return 1;
+}
+
+/*
+ * Map the next chunk of the sg list (at most MAX_DMA_BOUNCE_BUFFER
+ * bytes) through the bounce buffer, resuming from restart_iovcnt /
+ * restart_offset.  For guest-to-device transfers the guest data is
+ * copied into the bounce buffer here.  Returns 1 on success, 0 on
+ * length wraparound, misalignment or allocation failure.
+ */
+static int pci_dma_sg_map_bounce(QEMUPciDmaSgParam *param)
+{
+    int idx;
+    size_t len = 0;
+
+    /* remember where this chunk starts for the completion-side copy */
+    param->curr_restart_iovcnt = param->restart_iovcnt;
+    param->curr_restart_offset = param->restart_offset;
+
+    for (idx = param->restart_iovcnt; idx < param->iovcnt; idx++) {
+        if (len + param->sg[idx].len <= len)
+            return 0;
+        len += param->sg[idx].len - param->restart_offset;
+        param->restart_offset = 0;
+        if (len > MAX_DMA_BOUNCE_BUFFER) {
+            /* chunk full: record where the next chunk must resume */
+            size_t leftover = len - MAX_DMA_BOUNCE_BUFFER;
+            param->restart_offset = param->sg[idx].len - leftover;
+            len = MAX_DMA_BOUNCE_BUFFER;
+            break;
+        }
+    }
+    param->restart_iovcnt = idx;
+    param->curr_len = len;
+
+    if (len & (param->alignment-1))
+        return 0;
+
+    param->iov.iov_len = len;
+    if (!param->bounce) {
+        /* first chunk is the largest, so one allocation serves all chunks */
+        param->bounce = qemu_memalign(param->alignment, len);
+        if (!param->bounce)
+            return 0;
+        param->iov.iov_base = param->bounce;
+    }
+
+    if (!param->dma_to_memory) {
+        /* device reads guest RAM: fill the bounce buffer now */
+        int idx;
+        size_t offset = 0;
+        for (idx = param->curr_restart_iovcnt;
+             idx < param->iovcnt && offset < len; idx++) {
+            size_t copy_len = param->sg[idx].len - param->curr_restart_offset;
+            /* clamp the last entry to the space left in this chunk; the
+               original "copy_len = len" copied up to a full chunk past
+               bounce+offset, overflowing the bounce buffer whenever
+               offset > 0 */
+            if (offset+copy_len > len)
+                copy_len = len - offset;
+            cpu_physical_memory_read(param->sg[idx].addr +
+                                     param->curr_restart_offset,
+                                     param->bounce + offset,
+                                     copy_len);
+            param->curr_restart_offset = 0;
+            offset += copy_len;
+        }
+    }
+
+    return 1;
+}
+
+/*
+ * Finish a zerocopy transfer.  After a successful device-to-memory DMA
+ * it calls cpu_physical_memory_write_post_dma() on every sg entry —
+ * presumably to notify the memory layer that those guest pages were
+ * written (dirty tracking); confirm against the API patch in this
+ * series.  Nothing to do for guest-to-device or failed transfers.
+ */
+static void pci_dma_sg_unmap_direct(QEMUPciDmaSgParam *param, int ret)
+{
+    if (!ret && param->dma_to_memory) {
+        int idx;
+        QEMUPciDmaSg *sg = param->sg;
+        for (idx = 0; idx < param->iovcnt; idx++)
+            cpu_physical_memory_write_post_dma(sg[idx].addr,
+                                               sg[idx].len);
+    }
+}
+
+/*
+ * Finish one bounced chunk: after a successful device-to-memory DMA,
+ * copy the bounce buffer back into the guest's sg entries.  Returns 1
+ * if more chunks remain and the caller must restart the I/O, 0 when
+ * the transfer is complete or failed (the bounce buffer is freed).
+ */
+static int pci_dma_sg_unmap_bounce(QEMUPciDmaSgParam *param, int ret)
+{
+    if (!ret && param->dma_to_memory) {
+        int idx;
+        size_t offset = 0;
+        for (idx = param->curr_restart_iovcnt;
+             idx < param->iovcnt && offset < param->curr_len; idx++) {
+            size_t copy_len = param->sg[idx].len - param->curr_restart_offset;
+            /* clamp the last entry to the bytes left in this chunk; the
+               original "copy_len = param->curr_len" wrote up to a full
+               chunk through the guest sg entry whenever offset > 0 */
+            if (offset+copy_len > param->curr_len)
+                copy_len = param->curr_len - offset;
+            cpu_physical_memory_write(param->sg[idx].addr +
+                                      param->curr_restart_offset,
+                                      param->bounce + offset,
+                                      copy_len);
+            param->curr_restart_offset = 0;
+            offset += copy_len;
+        }
+    }
+    if (param->restart_iovcnt == param->iovcnt || ret) {
+        qemu_free(param->bounce);
+        return 0;
+    }
+    return 1;
+}
+
+/*
+ * I/O completion callback.  On the direct path it only runs the
+ * post-DMA fixup; on the bounce path it copies data back (for reads)
+ * and, if the sg list is not fully consumed yet, maps and submits the
+ * next chunk with itself re-armed as the callback.  When the transfer
+ * is finished or has failed, it invokes the driver's completion hook
+ * and recycles the request state.
+ */
+static void pci_dma_sg_cb(void *opaque, int ret)
+{
+    QEMUPciDmaSgParam *param = opaque;
+    int restart = 0;
+
+    if (!param->bounce)
+        pci_dma_sg_unmap_direct(param, ret);
+    else
+        restart = pci_dma_sg_unmap_bounce(param, ret);
+
+    if (restart) {
+        ret = -1000;  /* reported if remapping the next chunk fails */
+        if (!pci_dma_sg_map_bounce(param)) {
+            qemu_free(param->bounce);
+            goto out_free;
+        }
+        ret = param->pci_dma_sg_submit(param->pci_dma_sg_opaque,
+                                       &param->iov, 1,
+                                       param->curr_len,
+                                       pci_dma_sg_cb,
+                                       param);
+    }
+    /* note: the "goto out_free" above jumps into this conditional body */
+    if (ret || !restart) {
+    out_free:
+        param->pci_dma_sg_complete(param->pci_dma_sg_opaque, ret);
+        qemu_release_pci_dma_sg(param);
+    }
+}
+
+/* PCIDevice is there in case we want to emulate an iommu later */
+/*
+ * Start SG DMA for a PCI device: translate the guest-physical sg list
+ * to host virtual addresses — zerocopy when every entry can be mapped,
+ * otherwise bounce-buffered in chunks of at most MAX_DMA_BOUNCE_BUFFER
+ * bytes — and drive the I/O through pci_dma_sg_submit.  The completion
+ * hook is always invoked exactly once, with 0 on success or a negative
+ * error code (-2000 family: invalid parameters / setup failure).
+ *   dma_to_memory: 1 = device writes guest RAM, 0 = device reads it
+ *   alignment: required host buffer alignment (power of two)
+ */
+void pci_dma_sg(PCIDevice *pci_dev,
+                QEMUPciDmaSg *sg, int iovcnt,
+                QEMUPciDmaSgSubmit pci_dma_sg_submit,
+                QEMUPciDmaSgComplete pci_dma_sg_complete,
+                void *pci_dma_sg_opaque,
+                int dma_to_memory, int alignment)
+{
+    int ret;
+    QEMUPciDmaSgParam *param;
+
+    /* parameter sanity checks (unsigned casts reject negatives too) */
+    ret = -2000;
+    if ((unsigned int) dma_to_memory > 1)
+        goto err;
+    if ((unsigned int) alignment > MAX_DMA_BOUNCE_BUFFER)
+        goto err;
+    if (iovcnt < 1)
+        goto err;
+
+    param = qemu_get_pci_dma_sg(iovcnt);
+    if (!param)
+        goto err;
+
+    param->pci_dma_sg_submit = pci_dma_sg_submit;
+    param->pci_dma_sg_complete = pci_dma_sg_complete;
+    param->pci_dma_sg_opaque = pci_dma_sg_opaque;
+    param->dma_to_memory = dma_to_memory;
+    param->alignment = alignment;
+    param->bounce = NULL;
+    param->sg = sg;
+    param->iovcnt = iovcnt;
+    param->restart_offset = param->restart_iovcnt = 0;
+
+    /* map the sg */
+    if (!pci_dma_sg_map_direct(sg, iovcnt,
+                               dma_to_memory, alignment,
+                               &param->curr_len, &param->iov)) {
+        /* fall back to bouncing: chunks go out one iovec at a time */
+        ret = -2004;
+        if (!pci_dma_sg_map_bounce(param))
+            goto out_free;
+        iovcnt = 1;
+    }
+
+    /* run the I/O */
+    ret = pci_dma_sg_submit(pci_dma_sg_opaque,
+                            &param->iov, iovcnt, param->curr_len,
+                            pci_dma_sg_cb,
+                            param);
+    /* note: "goto out_free" above jumps to the label inside this if */
+    if (ret)
+    out_free:
+        pci_dma_sg_cb(param, ret);
+    return;
+
+err:
+    pci_dma_sg_complete(pci_dma_sg_opaque, ret);
+    return;
+}
diff --git a/hw/dma.h b/hw/dma.h
new file mode 100644
--- /dev/null
+++ b/hw/dma.h
@@ -0,0 +1,28 @@
+#ifndef QEMU_PCI_DMA_H
+#define QEMU_PCI_DMA_H
+
+#include "qemu-common.h"
+#include "block.h"
+
+/*
+ * Submit callback supplied by the driver: start the actual I/O on
+ * iov/iovcnt (len total bytes) and arrange for dma_cb(dma_cb_param,
+ * ret) to run at completion.  A nonzero return means submission
+ * failed and is reported through the completion hook.
+ */
+typedef int QEMUPciDmaSgSubmit(void *pci_dma_sg_opaque,
+                               struct iovec *iov, int iovcnt,
+                               size_t len,
+                               BlockDriverCompletionFunc dma_cb,
+                               void *dma_cb_param);
+
+/* Completion callback: ret is 0 on success, negative on error. */
+typedef void QEMUPciDmaSgComplete(void *pci_dma_sg_opaque, int ret);
+
+/* One scatter-gather element: guest-physical address plus byte length. */
+typedef struct QEMUPciDmaSg {
+    target_phys_addr_t addr;
+    size_t len;
+} QEMUPciDmaSg;
+
+/* dma.c */
+void pci_dma_sg(PCIDevice *pci_dev,
+                QEMUPciDmaSg *sg, int iovcnt,
+                QEMUPciDmaSgSubmit *pci_dma_sg_submit,
+                QEMUPciDmaSgComplete *pci_dma_sg_complete,
+                void *pci_dma_sg_opaque,
+                int dma_to_memory, int alignment);
+
+#endif




reply via email to

[Prev in Thread] Current Thread [Next in Thread]