From: Stefan Hajnoczi
Subject: Re: [Qemu-devel] [PATCH v4 3/9] block: Add VFIO based NVMe driver
Date: Wed, 10 Jan 2018 18:33:05 +0000
User-agent: Mutt/1.9.1 (2017-09-22)

On Wed, Jan 10, 2018 at 05:18:40PM +0800, Fam Zheng wrote:

There are several memory and lock leaks in this patch.  Please work with
Paolo to get the __attribute__((cleanup(...))) patch series merged so
this class of bugs can be eliminated:
https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg01648.html
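
For example, the unlock then happens automatically on every exit path.
A minimal sketch using the plain GCC/clang cleanup attribute (not
necessarily the exact helpers from that series):

    /* Called automatically when the guard variable goes out of scope */
    static void cleanup_unlock(QemuMutex **m)
    {
        if (*m) {
            qemu_mutex_unlock(*m);
        }
    }

    ...
    __attribute__((cleanup(cleanup_unlock))) QemuMutex *guard = &q->lock;
    qemu_mutex_lock(guard);
    /* every return below now unlocks q->lock automatically */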

> +typedef struct {
> +    CoQueue     free_req_queue;
> +    QemuMutex   lock;
> +
> +    /* Fields protected by BQL */
> +    int         index;
> +    uint8_t     *prp_list_pages;
> +
> +    /* Fields protected by @lock */

Does this lock serve any purpose?  I didn't see a place where these
fields are accessed from multiple threads.  Perhaps you're trying to
prepare for multiqueue, but then other things like the
BDRVNVMeState->inflight counter aren't protected so it doesn't make
sense.

> +static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
> +{
> +    int i;
> +    NVMeRequest *req = NULL;
> +
> +    qemu_mutex_lock(&q->lock);
> +    while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
> +        /* We have to leave one slot empty as that is the full queue case
> +         * (head == tail + 1). */
> +        if (qemu_in_coroutine()) {
> +            trace_nvme_free_req_queue_wait(q);
> +            qemu_mutex_unlock(&q->lock);
> +            qemu_co_queue_wait(&q->free_req_queue, NULL);
> +            qemu_mutex_lock(&q->lock);
> +        } else {
> +            return NULL;

q->lock is held.
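
i.e. the mutex needs to be dropped before bailing out, something along
the lines of:

        } else {
            qemu_mutex_unlock(&q->lock);
            return NULL;
        }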

> +static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
> +{
> +    BDRVNVMeState *s = bs->opaque;
> +    NvmeIdCtrl *idctrl;
> +    NvmeIdNs *idns;
> +    uint8_t *resp;
> +    int r;
> +    uint64_t iova;
> +    NvmeCmd cmd = {
> +        .opcode = NVME_ADM_CMD_IDENTIFY,
> +        .cdw10 = cpu_to_le32(0x1),
> +    };
> +
> +    resp = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl));
> +    if (!resp) {
> +        error_setg(errp, "Cannot allocate buffer for identify response");
> +        goto out;
> +    }
> +    idctrl = (NvmeIdCtrl *)resp;
> +    idns = (NvmeIdNs *)resp;
> +    r = qemu_vfio_dma_map(s->vfio, resp, sizeof(NvmeIdCtrl), true, &iova);
> +    if (r) {
> +        error_setg(errp, "Cannot map buffer for DMA");
> +        goto out;
> +    }
> +    cmd.prp1 = cpu_to_le64(iova);
> +
> +    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
> +        error_setg(errp, "Failed to identify controller");
> +        goto out;
> +    }
> +
> +    if (le32_to_cpu(idctrl->nn) < namespace) {
> +        error_setg(errp, "Invalid namespace");
> +        goto out;
> +    }
> +    s->write_cache = le32_to_cpu(idctrl->vwc) & 0x1;
> +    s->max_transfer = (idctrl->mdts ? 1 << idctrl->mdts : 0) * s->page_size;
> +    /* For now the page list buffer per command is one page, to hold at most
> +     * s->page_size / sizeof(uint64_t) entries. */
> +    s->max_transfer = MIN_NON_ZERO(s->max_transfer,
> +                          s->page_size / sizeof(uint64_t) * s->page_size);
> +
> +    memset(resp, 0, 4096);
> +
> +    cmd.cdw10 = 0;
> +    cmd.nsid = namespace;

Missing cpu_to_le32().
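
i.e.:

    cmd.nsid = cpu_to_le32(namespace);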

> +static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
> +                     Error **errp)
> +{
> +    BDRVNVMeState *s = bs->opaque;
> +    int ret;
> +    uint64_t cap;
> +    uint64_t timeout_ms;
> +    uint64_t deadline, now;
> +    Error *local_err = NULL;
> +
> +    qemu_co_mutex_init(&s->dma_map_lock);
> +    qemu_co_queue_init(&s->dma_flush_queue);
> +    s->nsid = namespace;
> +    s->aio_context = qemu_get_current_aio_context();

Why not bdrv_get_aio_context(bs)?
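
i.e.:

    s->aio_context = bdrv_get_aio_context(bs);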

> +    ret = event_notifier_init(&s->irq_notifier, 0);
> +    if (ret) {
> +        error_setg(errp, "Failed to init event notifier");
> +        return ret;

dma_map_lock should be destroyed.

> +    }
> +
> +    s->vfio = qemu_vfio_open_pci(device, errp);
> +    if (!s->vfio) {
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +
> +    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, errp);
> +    if (!s->regs) {
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +
> +    /* Perform initialize sequence as described in NVMe spec "7.6.1
> +     * Initialization". */
> +
> +    cap = le64_to_cpu(s->regs->cap);
> +    if (!(cap & (1ULL << 37))) {
> +        error_setg(errp, "Device doesn't support NVMe command set");
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +
> +    s->page_size = MAX(4096, 1 << (12 + ((cap >> 48) & 0xF)));
> +    s->doorbell_scale = (4 << (((cap >> 32) & 0xF))) / sizeof(uint32_t);
> +    bs->bl.opt_mem_alignment = s->page_size;
> +    timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000);
> +
> +    /* Reset device to get a clean state. */
> +    s->regs->cc = cpu_to_le32(le32_to_cpu(s->regs->cc) & 0xFE);
> +    /* Wait for CSTS.RDY = 0. */
> +    deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * 1000000ULL;
> +    while (le32_to_cpu(s->regs->csts) & 0x1) {
> +        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
> +            error_setg(errp, "Timeout while waiting for device to reset (%ld 
> ms)",
> +                       timeout_ms);
> +            ret = -ETIMEDOUT;
> +            goto fail;
> +        }
> +    }
> +
> +    /* Set up admin queue. */
> +    s->queues = g_new(NVMeQueuePair *, 1);
> +    s->nr_queues = 1;
> +    s->queues[0] = nvme_create_queue_pair(bs, 0, NVME_QUEUE_SIZE, errp);
> +    if (!s->queues[0]) {
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +    QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
> +    s->regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE);
> +    s->regs->asq = cpu_to_le64(s->queues[0]->sq.iova);
> +    s->regs->acq = cpu_to_le64(s->queues[0]->cq.iova);
> +
> +    /* After setting up all control registers we can enable device now. */
> +    s->regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) |
> +                              (ctz32(NVME_SQ_ENTRY_BYTES) << 16) |
> +                              0x1);
> +    /* Wait for CSTS.RDY = 1. */
> +    now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +    deadline = now + timeout_ms * 1000000;
> +    while (!(le32_to_cpu(s->regs->csts) & 0x1)) {
> +        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
> +            error_setg(errp, "Timeout while waiting for device to start (%ld 
> ms)",
> +                       timeout_ms);
> +            ret = -ETIMEDOUT;
> +            goto fail_queue;
> +        }
> +    }
> +
> +    ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier,
> +                                 VFIO_PCI_MSIX_IRQ_INDEX, errp);
> +    if (ret) {
> +        goto fail_queue;
> +    }
> +    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
> +                           false, nvme_handle_event, nvme_poll_cb);
> +
> +    nvme_identify(bs, namespace, errp);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        ret = -EIO;
> +        goto fail_handler;
> +    }
> +
> +    /* Set up command queues. */
> +    if (!nvme_add_io_queue(bs, errp)) {
> +        ret = -EIO;
> +        goto fail_handler;
> +    }
> +    return 0;
> +
> +fail_handler:
> +    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
> +                           false, NULL, NULL);
> +fail_queue:
> +    nvme_free_queue_pair(bs, s->queues[0]);
> +fail:

s->queues is not freed.
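
e.g. (freeing right after the queue pair, since fail_queue is only
reached once s->queues has been allocated):

    fail_queue:
        nvme_free_queue_pair(bs, s->queues[0]);
        g_free(s->queues);
    fail: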

> +    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs);
> +    qemu_vfio_close(s->vfio);
> +    event_notifier_cleanup(&s->irq_notifier);

dma_map_lock should be destroyed.

> +static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
> +                          Error **errp)
> +{
> +    const char *device;
> +    QemuOpts *opts;
> +    int namespace;
> +
> +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
> +    qemu_opts_absorb_qdict(opts, options, &error_abort);
> +    device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
> +    if (!device) {
> +        error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
> +        return -EINVAL;

opts is leaked.
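
e.g.:

    if (!device) {
        error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
        qemu_opts_del(opts);
        return -EINVAL;
    }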
