Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm


From: Kevin Wolf
Subject: Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm
Date: Tue, 08 Nov 2011 09:41:06 +0100
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:7.0) Gecko/20110927 Thunderbird/7.0

On 08.11.2011 05:34, Zhi Yong Wu wrote:
> On Mon, Nov 7, 2011 at 11:18 PM, Kevin Wolf <address@hidden> wrote:
>> On 03.11.2011 09:57, Zhi Yong Wu wrote:
>>> Signed-off-by: Zhi Yong Wu <address@hidden>
>>> Signed-off-by: Stefan Hajnoczi <address@hidden>
>>> ---
>>>  block.c     |  220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  block.h     |    1 +
>>>  block_int.h |    1 +
>>>  3 files changed, 222 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/block.c b/block.c
>>> index 79e7f09..b2af48f 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
>>>                                                 bool is_write);
>>>  static void coroutine_fn bdrv_co_do_rw(void *opaque);
>>>
>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>> +        bool is_write, double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>> +        double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>> +        bool is_write, int64_t *wait);
>>> +
>>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>
>>> @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
>>>  #endif
>>>
>>>  /* throttling disk I/O limits */
>>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>>> +{
>>> +    bs->io_limits_enabled = false;
>>> +
>>> +    while (qemu_co_queue_next(&bs->throttled_reqs));
>>> +
>>> +    if (bs->block_timer) {
>>> +        qemu_del_timer(bs->block_timer);
>>> +        qemu_free_timer(bs->block_timer);
>>> +        bs->block_timer = NULL;
>>> +    }
>>> +
>>> +    bs->slice_start = 0;
>>> +    bs->slice_end   = 0;
>>> +    bs->slice_time  = 0;
>>> +    memset(&bs->io_base, 0, sizeof(bs->io_base));
>>> +}
>>> +
>>>  static void bdrv_block_timer(void *opaque)
>>>  {
>>>      BlockDriverState *bs = opaque;
>>> @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
>>>           || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
>>>  }
>>>
>>> +static void bdrv_io_limits_intercept(BlockDriverState *bs,
>>> +                                     bool is_write, int nb_sectors)
>>> +{
>>> +    int64_t wait_time = -1;
>>> +
>>> +    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
>>> +        qemu_co_queue_wait(&bs->throttled_reqs);
>>> +    }
>>> +
>>> +    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
>>> +     * throttled requests will not be dequeued until the current request is
>>> +     * allowed to be serviced. So if the current request still exceeds the
>>> +     * limits, it will be inserted to the head. All requests followed it will
>>> +     * be still in throttled_reqs queue.
>>> +     */
>>> +
>>> +    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
>>> +        qemu_mod_timer(bs->block_timer,
>>> +                       wait_time + qemu_get_clock_ns(vm_clock));
>>> +        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
>>> +    }
>>> +
>>> +    qemu_co_queue_next(&bs->throttled_reqs);
>>> +}
>>> +
>>>  /* check if the path starts with "<protocol>:" */
>>>  static int path_has_protocol(const char *path)
>>>  {
>>> @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>>          bdrv_dev_change_media_cb(bs, true);
>>>      }
>>>
>>> +    /* throttling disk I/O limits */
>>> +    if (bs->io_limits_enabled) {
>>> +        bdrv_io_limits_enable(bs);
>>> +    }
>>> +
>>>      return 0;
>>>
>>>  unlink_and_fail:
>>> @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
>>>
>>>          bdrv_dev_change_media_cb(bs, false);
>>>      }
>>> +
>>> +    /*throttling disk I/O limits*/
>>> +    if (bs->io_limits_enabled) {
>>> +        bdrv_io_limits_disable(bs);
>>> +    }
>>>  }
>>>
>>>  void bdrv_close_all(void)
>>> @@ -1291,6 +1351,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>>>          return -EIO;
>>>      }
>>>
>>> +    /* throttling disk read I/O */
>>> +    if (bs->io_limits_enabled) {
>>> +        bdrv_io_limits_intercept(bs, false, nb_sectors);
>>> +    }
>>> +
>>>      return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
>>>  }
>>>
>>> @@ -1321,6 +1386,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>>>          return -EIO;
>>>      }
>>>
>>> +    /* throttling disk write I/O */
>>> +    if (bs->io_limits_enabled) {
>>> +        bdrv_io_limits_intercept(bs, true, nb_sectors);
>>> +    }
>>> +
>>>      ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>>>
>>>      if (bs->dirty_bitmap) {
>>> @@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>>>      acb->pool->cancel(acb);
>>>  }
>>>
>>> +/* block I/O throttling */
>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>> +                 bool is_write, double elapsed_time, uint64_t *wait) {
>>> +    uint64_t bps_limit = 0;
>>> +    double   bytes_limit, bytes_base, bytes_res;
>>> +    double   slice_time, wait_time;
>>> +
>>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>>> +    } else if (bs->io_limits.bps[is_write]) {
>>> +        bps_limit = bs->io_limits.bps[is_write];
>>> +    } else {
>>> +        if (wait) {
>>> +            *wait = 0;
>>> +        }
>>> +
>>> +        return false;
>>> +    }
>>> +
>>> +    slice_time = bs->slice_end - bs->slice_start;
>>> +    slice_time /= (NANOSECONDS_PER_SECOND);
>>> +    bytes_limit = bps_limit * slice_time;
>>> +    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
>>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>> +        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
>>> +    }
>>> +
>>> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>>> +
>>> +    if (bytes_base + bytes_res <= bytes_limit) {
>>> +        if (wait) {
>>> +            *wait = 0;
>>> +        }
>>> +
>>> +        return false;
>>> +    }
>>> +
>>> +    /* Calc approx time to dispatch */
>>> +    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>>> +
>>> +    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>> +    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>>> +    if (wait) {
>>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>> +    }
>>
>> I'm not quite sure what bs->slice_end really is and what these
>> calculations do exactly. Looks like magic. Can you add some comments
>> that explain why slice_end is increased?
> As you know, when the I/O rate at runtime exceeds the limits,
> bs->slice_end needs to be extended so that the current statistics
> are kept until the timer fires; the amount by which it is increased
> was tuned based on experiments.
> 
>> and how you estimate *wait?
> The wait time is calculated based on the bps and iops history:
> 
> bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
> 
> 1.) bytes_base is the number of bytes that have already been
> read/written; it is obtained from the history statistics.
> 2.) bytes_res is the remaining number of bytes that the current
> request needs to read/write.
> 3.) (bytes_base + bytes_res) / bps_limit is the total time needed to
> finish reading/writing all of that data at the limit rate.
> 
> I'm not sure whether this makes it clear.

Yes, I think this makes sense to me.
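
Just to check my understanding with some made-up numbers (not taken
from the patch): assume bps_limit = 1 MB/s, bytes_base = 900 KB already
accounted for in the current slice, a new 256 KB request and
elapsed_time = 0.9 s. Then

    wait_time = (921600 + 262144) / 1048576 - 0.9 ~= 0.23 s

i.e. the request is delayed until roughly the point where dispatching
it no longer pushes the slice above 1 MB/s. That part I can follow.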

However, I don't understand why things like 10 * BLOCK_IO_SLICE_TIME or
3 * BLOCK_IO_SLICE_TIME appear in the code. These numbers look like
magic to me. Are they more or less arbitrary values that happen to work
well?

Kevin


