From: Paolo Bonzini
Subject: Re: [Qemu-devel] [PATCH] hw/block/nvme: Add doorbell buffer config support
Date: Mon, 30 Apr 2018 12:07:58 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Thunderbird/52.7.0
Ping... Keith, can you review this?
Thanks,
Paolo
On 05/03/2018 20:49, Huaicheng Li wrote:
> This patch adds Doorbell Buffer Config support (NVMe 1.3) to QEMU NVMe,
> based on Mihai Rusu / Lin Ming's Google vendor extension patch [1]. The
> basic idea of this optimization is to use a shared buffer between guest
> OS and QEMU to reduce # of MMIO operations (doorbell writes). This patch
> ports the original code to work under current QEMU and makes it work
> with SPDK as well.
>
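> For context, a minimal sketch of the guest-side logic that makes this
> pay off (illustrative only, modeled on the Linux driver's
> nvme_dbbuf_need_event(); not part of this patch). The guest always
> writes the new doorbell value to the shadow buffer, and falls back to
> an MMIO doorbell write only when the new value crosses the event index
> the device last published:
>
>     /* True iff event_idx lies in the half-open window [old, new);
>      * relies on uint16_t wrap-around arithmetic. */
>     static inline bool nvme_dbbuf_need_event(uint16_t event_idx,
>                                              uint16_t new, uint16_t old)
>     {
>         return (uint16_t)(new - event_idx - 1) < (uint16_t)(new - old);
>     }
>
>     /* Hypothetical SQ doorbell ring: shadow write first, MMIO only if
>      * the device asked to be notified (real drivers add barriers). */
>     static void nvme_ring_sq_doorbell(uint16_t new_tail, uint16_t old_tail,
>                                       volatile uint32_t *shadow_db,
>                                       volatile uint32_t *event_idx,
>                                       volatile uint32_t *mmio_db)
>     {
>         *shadow_db = new_tail;       /* cheap: plain memory write */
>         if (nvme_dbbuf_need_event((uint16_t)*event_idx, new_tail, old_tail)) {
>             *mmio_db = new_tail;     /* expensive: causes a VM exit */
>         }
>     }
>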
> Unlike the Linux kernel NVMe driver, which builds the shadow buffer first
> and then creates the SQs/CQs, SPDK first creates the SQs/CQs and then
> issues this command to create the shadow buffer. Thus, in this
> implementation, we also associate a shadow buffer entry with each SQ/CQ
> during queue initialization (see the offset sketch below).
>
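> For reference, the shadow and eventidx slots mirror the layout of the
> regular doorbell registers, with stride = 4 << CAP.DSTRD. The offsets
> used throughout this patch are:
>
>     uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
>
>     sq->db_addr = dbs_addr + (2 * sqid) * stride;     /* SQ tail slot */
>     cq->db_addr = dbs_addr + (2 * cqid + 1) * stride; /* CQ head slot */
>     /* the eventidx buffer uses the same offsets from eis_addr */
>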
> [1] http://lists.nongnu.org/archive/html/qemu-devel/2015-11/msg04127.html
>
> Performance results, using a **ramdisk**-backed virtual NVMe device in
> guest Linux 4.14, are shown below:
>
> Note: "QEMU" represents stock QEMU and "+dbbuf" is QEMU with this patch.
> For psync, QD represents the number of threads used.
>
>
>            IOPS (Linux kernel NVMe driver)
>
>               psync               libaio
>     QD     QEMU  +dbbuf       QEMU  +dbbuf
>      1      47k     50k        45k     47k
>      4      86k    107k        59k    143k
>     16      95k    198k        58k    185k
>     64      97k    259k        59k    216k
>
>
>            IOPS (SPDK)
>
>     QD     QEMU  +dbbuf
>      1      62k     71k
>      4      61k    191k
>     16      60k    319k
>     64      62k    364k
>
> We can see that this patch greatly increases IOPS (and lowers latency,
> not shown): at QD=64, 2.7x for psync, 3.7x for libaio, and 5.9x for SPDK.
>
> ==Setup==:
>
> (1) VM script:
> x86_64-softmmu/qemu-system-x86_64 \
> -name "nvme-FEMU-test" \
> -enable-kvm \
> -cpu host \
> -smp 4 \
> -m 8G \
> -drive file=$IMGDIR/u14s.qcow2,if=ide,aio=native,cache=none,format=qcow2,id=hd0 \
> -drive file=/mnt/tmpfs/test1.raw,if=none,aio=threads,format=raw,id=id0 \
> -device nvme,drive=id0,serial=serial0,id=nvme0 \
> -net user,hostfwd=tcp::8080-:22 \
> -net nic,model=virtio \
> -nographic \
>
> (2) FIO configuration:
>
> [global]
> ioengine=libaio
> filename=/dev/nvme0n1
> thread=1
> group_reporting=1
> direct=1
> verify=0
> time_based=1
> ramp_time=0
> runtime=30
> ;size=1G
> iodepth=16
> rw=randread
> bs=4k
>
> [test]
> numjobs=1
>
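> Assuming the above is saved as, e.g., randread.fio, it is run with:
>
>     $ fio randread.fio
>
> (for the psync rows, switch ioengine to psync and vary numjobs rather
> than iodepth).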
>
> Signed-off-by: Huaicheng Li <address@hidden>
> ---
> hw/block/nvme.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++---
> hw/block/nvme.h | 7 ++++
> include/block/nvme.h | 2 ++
> 3 files changed, 102 insertions(+), 4 deletions(-)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 85d2406400..3882037e36 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -9,7 +9,7 @@
> */
>
> /**
> - * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
> + * Reference Specs: http://www.nvmexpress.org, 1.3, 1.2, 1.1, 1.0e
> *
> * http://www.nvmexpress.org/resources/
> */
> @@ -33,6 +33,7 @@
> #include "qapi/error.h"
> #include "qapi/visitor.h"
> #include "sysemu/block-backend.h"
> +#include "exec/memory.h"
>
> #include "qemu/log.h"
> #include "trace.h"
> @@ -244,6 +245,14 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> return status;
> }
>
> +static void nvme_update_cq_head(NvmeCQueue *cq)
> +{
> + if (cq->db_addr) {
> + pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
> + sizeof(cq->head));
> + }
> +}
> +
> static void nvme_post_cqes(void *opaque)
> {
> NvmeCQueue *cq = opaque;
> @@ -254,6 +263,8 @@ static void nvme_post_cqes(void *opaque)
> NvmeSQueue *sq;
> hwaddr addr;
>
> + nvme_update_cq_head(cq);
> +
> if (nvme_cq_full(cq)) {
> break;
> }
> @@ -461,6 +472,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
> static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
> uint16_t sqid, uint16_t cqid, uint16_t size)
> {
> + uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
> int i;
> NvmeCQueue *cq;
>
> @@ -480,6 +492,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
> }
> sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
>
> + if (sqid && n->dbbuf_dbs && n->dbbuf_eis) {
> + sq->db_addr = n->dbbuf_dbs + 2 * sqid * stride;
> + sq->ei_addr = n->dbbuf_eis + 2 * sqid * stride;
> + }
> +
> assert(n->cq[cqid]);
> cq = n->cq[cqid];
> QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
> @@ -559,6 +576,8 @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
> static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
> uint16_t cqid, uint16_t vector, uint16_t size, uint16_t irq_enabled)
> {
> + uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
> +
> cq->ctrl = n;
> cq->cqid = cqid;
> cq->size = size;
> @@ -569,11 +588,51 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
> cq->head = cq->tail = 0;
> QTAILQ_INIT(&cq->req_list);
> QTAILQ_INIT(&cq->sq_list);
> + if (cqid && n->dbbuf_dbs && n->dbbuf_eis) {
> + cq->db_addr = n->dbbuf_dbs + (2 * cqid + 1) * stride;
> + cq->ei_addr = n->dbbuf_eis + (2 * cqid + 1) * stride;
> + }
> msix_vector_use(&n->parent_obj, cq->vector);
> n->cq[cqid] = cq;
> cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
> }
>
> +static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeCmd *cmd)
> +{
> + uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
> + uint64_t dbs_addr = le64_to_cpu(cmd->prp1);
> + uint64_t eis_addr = le64_to_cpu(cmd->prp2);
> + int i;
> +
> + /* Address should not be NULL and should be page aligned */
> + if (dbs_addr == 0 || dbs_addr & (n->page_size - 1) ||
> + eis_addr == 0 || eis_addr & (n->page_size - 1)) {
> + return NVME_INVALID_FIELD | NVME_DNR;
> + }
> +
> + /* Save shadow buffer base addr for use during queue creation */
> + n->dbbuf_dbs = dbs_addr;
> + n->dbbuf_eis = eis_addr;
> +
> + for (i = 1; i < n->num_queues; i++) {
> + NvmeSQueue *sq = n->sq[i];
> + NvmeCQueue *cq = n->cq[i];
> +
> + if (sq) {
> + /* Submission queue tail pointer location, 2 * QID * stride */
> + sq->db_addr = dbs_addr + 2 * i * stride;
> + sq->ei_addr = eis_addr + 2 * i * stride;
> + }
> +
> + if (cq) {
> + /* Completion queue head pointer location, (2 * QID + 1) * stride */
> + cq->db_addr = dbs_addr + (2 * i + 1) * stride;
> + cq->ei_addr = eis_addr + (2 * i + 1) * stride;
> + }
> + }
> + return NVME_SUCCESS;
> +}
> +
> static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
> {
> NvmeCQueue *cq;
> @@ -753,12 +812,30 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> return nvme_set_feature(n, cmd, req);
> case NVME_ADM_CMD_GET_FEATURES:
> return nvme_get_feature(n, cmd, req);
> + case NVME_ADM_CMD_DBBUF_CONFIG:
> + return nvme_dbbuf_config(n, cmd);
> default:
> trace_nvme_err_invalid_admin_opc(cmd->opcode);
> return NVME_INVALID_OPCODE | NVME_DNR;
> }
> }
>
> +static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
> +{
> + if (sq->ei_addr) {
> + pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
> + sizeof(sq->tail));
> + }
> +}
> +
> +static void nvme_update_sq_tail(NvmeSQueue *sq)
> +{
> + if (sq->db_addr) {
> + pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
> + sizeof(sq->tail));
> + }
> +}
> +
> static void nvme_process_sq(void *opaque)
> {
> NvmeSQueue *sq = opaque;
> @@ -770,6 +847,8 @@ static void nvme_process_sq(void *opaque)
> NvmeCmd cmd;
> NvmeRequest *req;
>
> + nvme_update_sq_tail(sq);
> +
> while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
> addr = sq->dma_addr + sq->head * n->sqe_size;
> nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd));
> @@ -787,6 +866,9 @@ static void nvme_process_sq(void *opaque)
> req->status = status;
> nvme_enqueue_req_completion(cq, req);
> }
> +
> + nvme_update_sq_eventidx(sq);
> + nvme_update_sq_tail(sq);
> }
> }
>
> @@ -1105,7 +1187,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> }
>
> start_sqs = nvme_cq_full(cq) ? 1 : 0;
> - cq->head = new_head;
> + if (!cq->db_addr) {
> + cq->head = new_head;
> + }
> if (start_sqs) {
> NvmeSQueue *sq;
> QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
> @@ -1142,7 +1226,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> return;
> }
>
> - sq->tail = new_tail;
> + if (!sq->db_addr) {
> + sq->tail = new_tail;
> + }
> timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> }
> }
> @@ -1256,7 +1342,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
> id->ieee[0] = 0x00;
> id->ieee[1] = 0x02;
> id->ieee[2] = 0xb3;
> - id->oacs = cpu_to_le16(0);
> + id->oacs = cpu_to_le16(NVME_OACS_DBBUF);
> id->frmw = 7 << 1;
> id->lpa = 1 << 0;
> id->sqes = (0x6 << 4) | 0x6;
> @@ -1320,6 +1406,9 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
> cpu_to_le64(n->ns_size >>
> id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas)].ds);
> }
> +
> + n->dbbuf_dbs = 0;
> + n->dbbuf_eis = 0;
> }
>
> static void nvme_exit(PCIDevice *pci_dev)
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 8f3981121d..b532dbe160 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -33,6 +33,8 @@ typedef struct NvmeSQueue {
> QTAILQ_HEAD(sq_req_list, NvmeRequest) req_list;
> QTAILQ_HEAD(out_req_list, NvmeRequest) out_req_list;
> QTAILQ_ENTRY(NvmeSQueue) entry;
> + uint64_t db_addr;
> + uint64_t ei_addr;
> } NvmeSQueue;
>
> typedef struct NvmeCQueue {
> @@ -48,6 +50,8 @@ typedef struct NvmeCQueue {
> QEMUTimer *timer;
> QTAILQ_HEAD(sq_list, NvmeSQueue) sq_list;
> QTAILQ_HEAD(cq_req_list, NvmeRequest) req_list;
> + uint64_t db_addr;
> + uint64_t ei_addr;
> } NvmeCQueue;
>
> typedef struct NvmeNamespace {
> @@ -88,6 +92,9 @@ typedef struct NvmeCtrl {
> NvmeSQueue admin_sq;
> NvmeCQueue admin_cq;
> NvmeIdCtrl id_ctrl;
> +
> + uint64_t dbbuf_dbs;
> + uint64_t dbbuf_eis;
> } NvmeCtrl;
>
> #endif /* HW_NVME_H */
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 849a6f3fa3..4890aaf491 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -235,6 +235,7 @@ enum NvmeAdminCommands {
> NVME_ADM_CMD_ASYNC_EV_REQ = 0x0c,
> NVME_ADM_CMD_ACTIVATE_FW = 0x10,
> NVME_ADM_CMD_DOWNLOAD_FW = 0x11,
> + NVME_ADM_CMD_DBBUF_CONFIG = 0x7c,
> NVME_ADM_CMD_FORMAT_NVM = 0x80,
> NVME_ADM_CMD_SECURITY_SEND = 0x81,
> NVME_ADM_CMD_SECURITY_RECV = 0x82,
> @@ -572,6 +573,7 @@ enum NvmeIdCtrlOacs {
> NVME_OACS_SECURITY = 1 << 0,
> NVME_OACS_FORMAT = 1 << 1,
> NVME_OACS_FW = 1 << 2,
> + NVME_OACS_DBBUF = 1 << 8,
> };
>
> enum NvmeIdCtrlOncs {