From: Stefan Hajnoczi
Subject: Re: [Qemu-devel] [PATCH v7 RFC] block/vxhs: Initial commit to add Veritas HyperScale VxHS block device support
Date: Wed, 28 Sep 2016 22:45:10 +0100
User-agent: Mutt/1.7.0 (2016-08-17)

On Tue, Sep 27, 2016 at 09:09:49PM -0700, Ashish Mittal wrote:

Review of .bdrv_open() and .bdrv_aio_writev() code paths.

The big issues I see in this driver and libqnio:

1. Showstoppers like broken .bdrv_open() and leaking memory on every
   reply message.
2. Insecure due to missing input validation (network packets and
   configuration) and incorrect string handling.
3. Not fully asynchronous so QEMU and the guest may hang.

Please think about the whole codebase and not just the lines I've
pointed out in this review when fixing these sorts of issues.  There may
be similar instances of these bugs elsewhere and it's important that
they are fixed so that this can be merged.

> +/*
> + * Structure per vDisk maintained for state
> + */
> +typedef struct BDRVVXHSState {
> +    int                     fds[2];
> +    int64_t                 vdisk_size;
> +    int64_t                 vdisk_blocks;
> +    int64_t                 vdisk_flags;
> +    int                     vdisk_aio_count;
> +    int                     event_reader_pos;
> +    VXHSAIOCB               *qnio_event_acb;
> +    void                    *qnio_ctx;
> +    QemuSpin                vdisk_lock; /* Lock to protect BDRVVXHSState */
> +    QemuSpin                vdisk_acb_lock;  /* Protects ACB */

These comments are insufficient for documenting locking.  Not all fields
are actually protected by these locks.  Please order fields according to
lock coverage:

typedef struct VXHSAIOCB {
    ...

    /* Protected by BDRVVXHSState->vdisk_acb_lock */
    int                 segments;
    ...
};

typedef struct BDRVVXHSState {
    ...

    /* Protected by vdisk_lock */
    QemuSpin                vdisk_lock;
    int                     vdisk_aio_count;
    QSIMPLEQ_HEAD(aio_retryq, VXHSAIOCB) vdisk_aio_retryq;
    ...
}

> +static void vxhs_qnio_iio_close(BDRVVXHSState *s, int idx)
> +{
> +    /*
> +     * Close vDisk device
> +     */
> +    if (s->vdisk_hostinfo[idx].vdisk_rfd >= 0) {
> +        iio_devclose(s->qnio_ctx, 0, s->vdisk_hostinfo[idx].vdisk_rfd);

libqnio comment:
Why does iio_devclose() take an unused cfd argument?  Perhaps it can be
dropped.

> +        s->vdisk_hostinfo[idx].vdisk_rfd = -1;
> +    }
> +
> +    /*
> +     * Close QNIO channel against cached channel-fd
> +     */
> +    if (s->vdisk_hostinfo[idx].qnio_cfd >= 0) {
> +        iio_close(s->qnio_ctx, s->vdisk_hostinfo[idx].qnio_cfd);

libqnio comment:
Why does iio_devclose() take an int32_t cfd argument but iio_close()
takes a uint32_t cfd argument?

> +        s->vdisk_hostinfo[idx].qnio_cfd = -1;
> +    }
> +}
> +
> +static int vxhs_qnio_iio_open(int *cfd, const char *of_vsa_addr,
> +                              int *rfd, const char *file_name)
> +{
> +    /*
> +     * Open qnio channel to storage agent if not opened before.
> +     */
> +    if (*cfd < 0) {
> +        *cfd = iio_open(global_qnio_ctx, of_vsa_addr, 0);

libqnio comments:

1.
There is a buffer overflow in qnio_create_channel().  strncpy() is
used incorrectly, so a long hostname or port (both can be up to 99
characters) will overflow channel->name[] (64 characters) or
channel->port[] (8 characters):

    strncpy(channel->name, hostname, strlen(hostname) + 1);
    strncpy(channel->port, port, strlen(port) + 1);

The third argument must be the size of the *destination* buffer, not
the source buffer.  Also note that strncpy() doesn't NUL-terminate
the destination string, so you must do that manually to ensure there
is a NUL byte at the end of the buffer.  (A corrected sketch follows
this list of comments.)

2.
channel is leaked in the "Failed to open single connection" error case
in qnio_create_channel().

3.
If the host is longer than 63 characters, the ioapi_ctx->channels and
qnio_ctx->channels maps will use different keys due to string
truncation in qnio_create_channel().  This means the "Channel already
exists" check in qnio_create_channel(), and possibly other things,
will not work as expected.
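
Regarding point 1, a minimal sketch of a safe copy, keeping the
existing channel->name[64] and channel->port[8] buffers:

    strncpy(channel->name, hostname, sizeof(channel->name) - 1);
    channel->name[sizeof(channel->name) - 1] = '\0';
    strncpy(channel->port, port, sizeof(channel->port) - 1);
    channel->port[sizeof(channel->port) - 1] = '\0';

(Ideally the inputs would also be validated against the buffer sizes
rather than silently truncated, which is what point 3 is about.)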

> +        if (*cfd < 0) {
> +            trace_vxhs_qnio_iio_open(of_vsa_addr);
> +            return -ENODEV;
> +        }
> +    }
> +
> +    /*
> +     * Open vdisk device
> +     */
> +    *rfd = iio_devopen(global_qnio_ctx, *cfd, file_name, 0);

libqnio comment:
Buffer overflow in iio_devopen() since chandev[128] is not large enough
to hold channel[100] + " " + devpath[arbitrary length] chars:

    sprintf(chandev, "%s %s", channel, devpath);
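
A bounded, truncation-checked sketch, keeping the existing
chandev[128] buffer:

    int n = snprintf(chandev, sizeof(chandev), "%s %s", channel, devpath);
    if (n < 0 || (size_t)n >= sizeof(chandev)) {
        return -1; /* devpath too long for chandev[], fail the open */
    }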

> +
> +    if (*rfd < 0) {
> +        if (*cfd >= 0) {

This check is always true: otherwise the return -ENODEV above would
already have been taken.  The if statement isn't necessary.

> +static void vxhs_check_failover_status(int res, void *ctx)
> +{
> +    BDRVVXHSState *s = ctx;
> +
> +    if (res == 0) {
> +        /* found failover target */
> +        s->vdisk_cur_host_idx = s->vdisk_ask_failover_idx;
> +        s->vdisk_ask_failover_idx = 0;
> +        trace_vxhs_check_failover_status(
> +                   s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
> +                   s->vdisk_guid);
> +        qemu_spin_lock(&s->vdisk_lock);
> +        OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s);
> +        qemu_spin_unlock(&s->vdisk_lock);
> +        vxhs_handle_queued_ios(s);
> +    } else {
> +        /* keep looking */
> +        trace_vxhs_check_failover_status_retry(s->vdisk_guid);
> +        s->vdisk_ask_failover_idx++;
> +        if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
> +            /* pause and cycle through list again */
> +            sleep(QNIO_CONNECT_RETRY_SECS);

This code is called from a QEMU thread via vxhs_aio_rw().  It is not
permitted to call sleep() since it will freeze QEMU and probably the
guest.

If you need a timer you can use QEMU's timer APIs.  See aio_timer_new(),
timer_new_ns(), timer_mod(), timer_del(), timer_free().
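
A rough sketch, assuming a timer field (called retry_timer here, not
in the patch) is added to BDRVVXHSState:

    static void vxhs_failover_timer_cb(void *opaque)
    {
        BDRVVXHSState *s = opaque;

        s->vdisk_ask_failover_idx = 0;
        vxhs_switch_storage_agent(s);
    }

    /* created once, e.g. in .bdrv_open(): */
    s->retry_timer = aio_timer_new(bdrv_get_aio_context(bs),
                                   QEMU_CLOCK_REALTIME, SCALE_NS,
                                   vxhs_failover_timer_cb, s);

    /* instead of sleep(): arm the timer and return immediately */
    timer_mod(s->retry_timer,
              qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
              QNIO_CONNECT_RETRY_SECS * NANOSECONDS_PER_SECOND);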

> +            s->vdisk_ask_failover_idx = 0;
> +        }
> +        res = vxhs_switch_storage_agent(s);
> +    }
> +}
> +
> +static int vxhs_failover_io(BDRVVXHSState *s)
> +{
> +    int res = 0;
> +
> +    trace_vxhs_failover_io(s->vdisk_guid);
> +
> +    s->vdisk_ask_failover_idx = 0;
> +    res = vxhs_switch_storage_agent(s);
> +
> +    return res;
> +}
> +
> +static void vxhs_iio_callback(int32_t rfd, uint32_t reason, void *ctx,
> +                       uint32_t error, uint32_t opcode)

This function is doing too much.  The failover code in particular
should run in the AioContext, since it is complex.  Don't do failover
here: this function runs outside the AioContext lock.  Do it from the
AioContext using a QEMUBH, as block/rbd.c does.
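
A sketch of the BH approach (the callback name and failover_bh field
are illustrative, not from the patch):

    static void vxhs_failover_bh_cb(void *opaque)
    {
        BDRVVXHSState *s = opaque;

        /* now we are in the AioContext, under its lock */
        vxhs_failover_io(s);
        qemu_bh_delete(s->failover_bh);
    }

    /* from the libqnio callback thread (bs would need to be
     * reachable from s): */
    s->failover_bh = aio_bh_new(bdrv_get_aio_context(bs),
                                vxhs_failover_bh_cb, s);
    qemu_bh_schedule(s->failover_bh);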

> +static int32_t
> +vxhs_qnio_iio_writev(void *qnio_ctx, uint32_t rfd, QEMUIOVector *qiov,
> +                     uint64_t offset, void *ctx, uint32_t flags)
> +{
> +    struct iovec cur;
> +    uint64_t cur_offset = 0;
> +    uint64_t cur_write_len = 0;
> +    int segcount = 0;
> +    int ret = 0;
> +    int i, nsio = 0;
> +    int iovcnt = qiov->niov;
> +    struct iovec *iov = qiov->iov;
> +
> +    errno = 0;
> +    cur.iov_base = 0;
> +    cur.iov_len = 0;
> +
> +    ret = iio_writev(qnio_ctx, rfd, iov, iovcnt, offset, ctx, flags);

libqnio comments:

1.
There are blocking connect(2) and getaddrinfo(3) calls in iio_writev()
so this may hang for arbitrary amounts of time.  This is not permitted
in .bdrv_aio_readv()/.bdrv_aio_writev().  Please make qnio actually
asynchronous.

2.
Where does client_callback() free reply?  It looks like every reply
message causes a memory leak!

3.
Buffer overflow in iio_writev() since device[128] cannot fit the device
string generated from the vdisk_guid.

4.
Buffer overflow in iio_writev() due to
strncpy(msg->hinfo.target,device,strlen(device)) where device[128] is
larger than target[64].  Also note the previous comments about strncpy()
usage.

5.
I don't see any endianness handling or portable alignment of struct
fields in the network protocol code.  Binary network protocols need
to take care of these issues for portability.  As it stands, libqnio
compiled for different architectures will not interoperate.  Do you
plan to support any other architectures besides x86?  (A conversion
sketch follows this list of comments.)

6.
The networking code doesn't look robust: kvset uses assert() on input
from the network so the other side of the connection could cause SIGABRT
(coredump), the client uses the msg pointer as the cookie for the
response packet so the server can easily crash the client by sending a
bogus cookie value, etc.  Even on the client side these things are
troublesome but on a server they are guaranteed security issues.  I
didn't look into it deeply.  Please audit the code.
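
Regarding point 5, the usual approach is to convert explicitly at the
wire boundary; a sketch with an illustrative header field:

    #include <arpa/inet.h>   /* htonl()/ntohl() */

    /* sender: */
    hdr.payload_size = htonl(payload_size);

    /* receiver: */
    payload_size = ntohl(hdr.payload_size);

Fixed-width types (uint32_t and friends) plus explicit padding in the
on-wire structs take care of the alignment side.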

> +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s,
> +                              int *cfd, int *rfd, Error **errp)
> +{
> +    QDict *backing_options = NULL;
> +    QemuOpts *opts, *tcp_opts;
> +    const char *vxhs_filename;
> +    char *of_vsa_addr = NULL;
> +    Error *local_err = NULL;
> +    const char *vdisk_id_opt;
> +    char *file_name = NULL;
> +    size_t num_servers = 0;
> +    char *str = NULL;
> +    int ret = 0;
> +    int i;
> +
> +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
> +    qemu_opts_absorb_qdict(opts, options, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +
> +    vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME);
> +    if (vxhs_filename) {
> +        trace_vxhs_qemu_init_filename(vxhs_filename);
> +    }
> +
> +    vdisk_id_opt = qemu_opt_get(opts, VXHS_OPT_VDISK_ID);
> +    if (!vdisk_id_opt) {
> +        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID);
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    s->vdisk_guid = g_strdup(vdisk_id_opt);
> +    trace_vxhs_qemu_init_vdisk(vdisk_id_opt);
> +
> +    num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
> +    if (num_servers < 1) {
> +        error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
> +        ret = -EINVAL;
> +        goto out;
> +    } else if (num_servers > VXHS_MAX_HOSTS) {
> +        error_setg(&local_err, QERR_INVALID_PARAMETER, "server");
> +        error_append_hint(errp, "Maximum %d servers allowed.\n",
> +                          VXHS_MAX_HOSTS);
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    trace_vxhs_qemu_init_numservers(num_servers);
> +
> +    for (i = 0; i < num_servers; i++) {
> +        str = g_strdup_printf(VXHS_OPT_SERVER"%d.", i);
> +        qdict_extract_subqdict(options, &backing_options, str);
> +
> +        /* Create opts info from runtime_tcp_opts list */
> +        tcp_opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0,
> +                                    &error_abort);
> +        qemu_opts_absorb_qdict(tcp_opts, backing_options, &local_err);
> +        if (local_err) {
> +            qdict_del(backing_options, str);

backing_options is leaked and there's no need to delete the str key.

> +            qemu_opts_del(tcp_opts);
> +            g_free(str);
> +            ret = -EINVAL;
> +            goto out;
> +        }
> +
> +        s->vdisk_hostinfo[i].hostip = g_strdup(qemu_opt_get(tcp_opts,
> +                                                            VXHS_OPT_HOST));
> +        s->vdisk_hostinfo[i].port = g_ascii_strtoll(qemu_opt_get(tcp_opts,
> +                                                                 VXHS_OPT_PORT),
> +                                                    NULL, 0);

This will segfault if the port option was missing.
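
A sketch of the missing check, mirroring the surrounding error path:

    const char *port = qemu_opt_get(tcp_opts, VXHS_OPT_PORT);

    if (!port) {
        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_PORT);
        qemu_opts_del(tcp_opts);
        g_free(str);
        ret = -EINVAL;
        goto out;
    }
    s->vdisk_hostinfo[i].port = g_ascii_strtoll(port, NULL, 0);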

> +
> +        s->vdisk_hostinfo[i].qnio_cfd = -1;
> +        s->vdisk_hostinfo[i].vdisk_rfd = -1;
> +        trace_vxhs_qemu_init(s->vdisk_hostinfo[i].hostip,
> +                             s->vdisk_hostinfo[i].port);

It's not safe to use the %s format specifier for a trace event with a
NULL value.  In the case where hostip is NULL this could crash on some
systems.

> +
> +        qdict_del(backing_options, str);
> +        qemu_opts_del(tcp_opts);
> +        g_free(str);
> +    }

backing_options is leaked.

> +
> +    s->vdisk_nhosts = i;
> +    s->vdisk_cur_host_idx = 0;
> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
> +                                  s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
> +                                  s->vdisk_hostinfo[s->vdisk_cur_host_idx].port);

Can we get here with num_servers == 0?  In that case this would access
uninitialized memory.  I guess num_servers == 0 does not make sense and
there should be an error case for it.

> +
> +    /*
> +     * .bdrv_open() and .bdrv_create() run under the QEMU global mutex.
> +     */
> +    if (global_qnio_ctx == NULL) {
> +        global_qnio_ctx = vxhs_setup_qnio();

libqnio comment:
The client epoll thread should mask all signals (like
qemu_thread_create()).  Otherwise it may receive signals that it cannot
deal with.
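
For reference, the pattern used by qemu_thread_create() in
util/qemu-thread-posix.c (the worker function name is illustrative):

    sigset_t set, oldset;

    sigfillset(&set);
    pthread_sigmask(SIG_SETMASK, &set, &oldset);
    rc = pthread_create(&tid, NULL, qnio_epoll_worker, ctx);
    pthread_sigmask(SIG_SETMASK, &oldset, NULL);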

> +        if (global_qnio_ctx == NULL) {
> +            error_setg(&local_err, "Failed vxhs_setup_qnio");
> +            ret = -EINVAL;
> +            goto out;
> +        }
> +    }
> +
> +    ret = vxhs_qnio_iio_open(cfd, of_vsa_addr, rfd, file_name);
> +    if (!ret) {
> +        error_setg(&local_err, "Failed qnio_iio_open");
> +        ret = -EIO;
> +    }

The return value of vxhs_qnio_iio_open() is 0 for success or -errno for
error.

I guess you never ran this code!  The block driver won't even open
successfully.
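
In other words the test is inverted; it needs to be something like:

    ret = vxhs_qnio_iio_open(cfd, of_vsa_addr, rfd, file_name);
    if (ret < 0) {
        error_setg(&local_err, "Failed qnio_iio_open");
        goto out;
    }

(ret already holds a meaningful -errno, so overriding it with -EIO is
unnecessary.)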

> +
> +out:
> +    g_free(file_name);
> +    g_free(of_vsa_addr);
> +    qemu_opts_del(opts);
> +
> +    if (ret < 0) {
> +        for (i = 0; i < num_servers; i++) {
> +            g_free(s->vdisk_hostinfo[i].hostip);
> +        }
> +        g_free(s->vdisk_guid);
> +        s->vdisk_guid = NULL;
> +        errno = -ret;

There is no need to set errno here.  The return value already contains
the error and the caller doesn't look at errno.

> +    }
> +    error_propagate(errp, local_err);
> +
> +    return ret;
> +}
> +
> +static int vxhs_open(BlockDriverState *bs, QDict *options,
> +              int bdrv_flags, Error **errp)
> +{
> +    BDRVVXHSState *s = bs->opaque;
> +    AioContext *aio_context;
> +    int qemu_qnio_cfd = -1;
> +    int device_opened = 0;
> +    int qemu_rfd = -1;
> +    int ret = 0;
> +    int i;
> +
> +    ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp);
> +    if (ret < 0) {
> +        trace_vxhs_open_fail(ret);
> +        return ret;
> +    }
> +
> +    device_opened = 1;
> +    s->qnio_ctx = global_qnio_ctx;
> +    s->vdisk_hostinfo[0].qnio_cfd = qemu_qnio_cfd;
> +    s->vdisk_hostinfo[0].vdisk_rfd = qemu_rfd;
> +    s->vdisk_size = 0;
> +    QSIMPLEQ_INIT(&s->vdisk_aio_retryq);
> +
> +    /*
> +     * Create a pipe for communicating between two threads in different
> +     * context. Set handler for read event, which gets triggered when
> +     * IO completion is done by non-QEMU context.
> +     */
> +    ret = qemu_pipe(s->fds);
> +    if (ret < 0) {
> +        trace_vxhs_open_epipe('.');
> +        ret = -errno;
> +        goto errout;

This leaks s->vdisk_guid, s->vdisk_hostinfo[i].hostip, etc.
bdrv_close() will not be called, so this function must do the cleanup
itself.
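
Something along these lines before returning, mirroring the error
path at the end of vxhs_qemu_init():

    for (i = 0; i < s->vdisk_nhosts; i++) {
        g_free(s->vdisk_hostinfo[i].hostip);
    }
    g_free(s->vdisk_guid);
    s->vdisk_guid = NULL;
    return ret;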

> +    }
> +    fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK);
> +
> +    aio_context = bdrv_get_aio_context(bs);
> +    aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ],
> +                       false, vxhs_aio_event_reader, NULL, s);
> +
> +    /*
> +     * Initialize the spin-locks.
> +     */
> +    qemu_spin_init(&s->vdisk_lock);
> +    qemu_spin_init(&s->vdisk_acb_lock);
> +
> +    return 0;
> +
> +errout:
> +    /*
> +     * Close remote vDisk device if it was opened earlier
> +     */
> +    if (device_opened) {

This is always true.  The device_opened variable can be removed.

> +/*
> + * This allocates QEMU-VXHS callback for each IO
> + * and is passed to QNIO. When QNIO completes the work,
> + * it will be passed back through the callback.
> + */
> +static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs,
> +                                int64_t sector_num, QEMUIOVector *qiov,
> +                                int nb_sectors,
> +                                BlockCompletionFunc *cb,
> +                                void *opaque, int iodir)
> +{
> +    VXHSAIOCB *acb = NULL;
> +    BDRVVXHSState *s = bs->opaque;
> +    size_t size;
> +    uint64_t offset;
> +    int iio_flags = 0;
> +    int ret = 0;
> +    void *qnio_ctx = s->qnio_ctx;
> +    uint32_t rfd = s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd;
> +
> +    offset = sector_num * BDRV_SECTOR_SIZE;
> +    size = nb_sectors * BDRV_SECTOR_SIZE;
> +
> +    acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
> +    /*
> +     * Setup or initialize VXHSAIOCB.
> +     * Every single field should be initialized since
> +     * acb will be picked up from the slab without
> +     * initializing with zero.
> +     */
> +    acb->io_offset = offset;
> +    acb->size = size;
> +    acb->ret = 0;
> +    acb->flags = 0;
> +    acb->aio_done = VXHS_IO_INPROGRESS;
> +    acb->segments = 0;
> +    acb->buffer = 0;
> +    acb->qiov = qiov;
> +    acb->direction = iodir;
> +
> +    qemu_spin_lock(&s->vdisk_lock);
> +    if (OF_VDISK_FAILED(s)) {
> +        trace_vxhs_aio_rw(s->vdisk_guid, iodir, size, offset);
> +        qemu_spin_unlock(&s->vdisk_lock);
> +        goto errout;
> +    }
> +    if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
> +        s->vdisk_aio_retry_qd++;
> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
> +        qemu_spin_unlock(&s->vdisk_lock);
> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 1);
> +        goto out;
> +    }
> +    s->vdisk_aio_count++;
> +    qemu_spin_unlock(&s->vdisk_lock);
> +
> +    iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
> +
> +    switch (iodir) {
> +    case VDISK_AIO_WRITE:
> +            vxhs_inc_acb_segment_count(acb, 1);
> +            ret = vxhs_qnio_iio_writev(qnio_ctx, rfd, qiov,
> +                                       offset, (void *)acb, iio_flags);
> +            break;
> +    case VDISK_AIO_READ:
> +            vxhs_inc_acb_segment_count(acb, 1);
> +            ret = vxhs_qnio_iio_readv(qnio_ctx, rfd, qiov,
> +                                       offset, (void *)acb, iio_flags);
> +            break;
> +    default:
> +            trace_vxhs_aio_rw_invalid(iodir);
> +            goto errout;

s->vdisk_aio_count must be decremented before returning.

> +static void vxhs_close(BlockDriverState *bs)
> +{
> +    BDRVVXHSState *s = bs->opaque;
> +    int i;
> +
> +    trace_vxhs_close(s->vdisk_guid);
> +    close(s->fds[VDISK_FD_READ]);
> +    close(s->fds[VDISK_FD_WRITE]);
> +
> +    /*
> +     * Clearing all the event handlers for oflame registered to QEMU
> +     */
> +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
> +                       false, NULL, NULL, NULL);

Please remove the event handler before closing the fd.  I don't think it
matters in this case but in other scenarios there could be race
conditions if another thread opens an fd and the file descriptor number
is reused.
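
That is, simply swap the order:

    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
                       false, NULL, NULL, NULL);
    close(s->fds[VDISK_FD_READ]);
    close(s->fds[VDISK_FD_WRITE]);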
