qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm


From: Zhi Yong Wu
Subject: Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm
Date: Tue, 8 Nov 2011 16:57:52 +0800

On Tue, Nov 8, 2011 at 4:41 PM, Kevin Wolf <address@hidden> wrote:
> Am 08.11.2011 05:34, schrieb Zhi Yong Wu:
>> On Mon, Nov 7, 2011 at 11:18 PM, Kevin Wolf <address@hidden> wrote:
>>> Am 03.11.2011 09:57, schrieb Zhi Yong Wu:
>>>> Signed-off-by: Zhi Yong Wu <address@hidden>
>>>> Signed-off-by: Stefan Hajnoczi <address@hidden>
>>>> ---
>>>>  block.c     |  220 
>>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  block.h     |    1 +
>>>>  block_int.h |    1 +
>>>>  3 files changed, 222 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/block.c b/block.c
>>>> index 79e7f09..b2af48f 100644
>>>> --- a/block.c
>>>> +++ b/block.c
>>>> @@ -74,6 +74,13 @@ static BlockDriverAIOCB 
>>>> *bdrv_co_aio_rw_vector(BlockDriverState *bs,
>>>>                                                 bool is_write);
>>>>  static void coroutine_fn bdrv_co_do_rw(void *opaque);
>>>>
>>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>>> +        bool is_write, double elapsed_time, uint64_t *wait);
>>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>>> +        double elapsed_time, uint64_t *wait);
>>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>>> +        bool is_write, int64_t *wait);
>>>> +
>>>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>>
>>>> @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
>>>>  #endif
>>>>
>>>>  /* throttling disk I/O limits */
>>>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>>>> +{
>>>> +    bs->io_limits_enabled = false;
>>>> +
>>>> +    while (qemu_co_queue_next(&bs->throttled_reqs));
>>>> +
>>>> +    if (bs->block_timer) {
>>>> +        qemu_del_timer(bs->block_timer);
>>>> +        qemu_free_timer(bs->block_timer);
>>>> +        bs->block_timer = NULL;
>>>> +    }
>>>> +
>>>> +    bs->slice_start = 0;
>>>> +    bs->slice_end   = 0;
>>>> +    bs->slice_time  = 0;
>>>> +    memset(&bs->io_base, 0, sizeof(bs->io_base));
>>>> +}
>>>> +
>>>>  static void bdrv_block_timer(void *opaque)
>>>>  {
>>>>      BlockDriverState *bs = opaque;
>>>> @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
>>>>           || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
>>>>  }
>>>>
>>>> +static void bdrv_io_limits_intercept(BlockDriverState *bs,
>>>> +                                     bool is_write, int nb_sectors)
>>>> +{
>>>> +    int64_t wait_time = -1;
>>>> +
>>>> +    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
>>>> +        qemu_co_queue_wait(&bs->throttled_reqs);
>>>> +    }
>>>> +
>>>> +    /* In fact, we hope to keep each request's timing, in FIFO mode. The 
>>>> next
>>>> +     * throttled requests will not be dequeued until the current request 
>>>> is
>>>> +     * allowed to be serviced. So if the current request still exceeds the
>>>> +     * limits, it will be inserted to the head. All requests followed it 
>>>> will
>>>> +     * be still in throttled_reqs queue.
>>>> +     */
>>>> +
>>>> +    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
>>>> +        qemu_mod_timer(bs->block_timer,
>>>> +                       wait_time + qemu_get_clock_ns(vm_clock));
>>>> +        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
>>>> +    }
>>>> +
>>>> +    qemu_co_queue_next(&bs->throttled_reqs);
>>>> +}
>>>> +
>>>>  /* check if the path starts with "<protocol>:" */
>>>>  static int path_has_protocol(const char *path)
>>>>  {
>>>> @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char 
>>>> *filename, int flags,
>>>>          bdrv_dev_change_media_cb(bs, true);
>>>>      }
>>>>
>>>> +    /* throttling disk I/O limits */
>>>> +    if (bs->io_limits_enabled) {
>>>> +        bdrv_io_limits_enable(bs);
>>>> +    }
>>>> +
>>>>      return 0;
>>>>
>>>>  unlink_and_fail:
>>>> @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
>>>>
>>>>          bdrv_dev_change_media_cb(bs, false);
>>>>      }
>>>> +
>>>> +    /*throttling disk I/O limits*/
>>>> +    if (bs->io_limits_enabled) {
>>>> +        bdrv_io_limits_disable(bs);
>>>> +    }
>>>>  }
>>>>
>>>>  void bdrv_close_all(void)
>>>> @@ -1291,6 +1351,11 @@ static int coroutine_fn 
>>>> bdrv_co_do_readv(BlockDriverState *bs,
>>>>          return -EIO;
>>>>      }
>>>>
>>>> +    /* throttling disk read I/O */
>>>> +    if (bs->io_limits_enabled) {
>>>> +        bdrv_io_limits_intercept(bs, false, nb_sectors);
>>>> +    }
>>>> +
>>>>      return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
>>>>  }
>>>>
>>>> @@ -1321,6 +1386,11 @@ static int coroutine_fn 
>>>> bdrv_co_do_writev(BlockDriverState *bs,
>>>>          return -EIO;
>>>>      }
>>>>
>>>> +    /* throttling disk write I/O */
>>>> +    if (bs->io_limits_enabled) {
>>>> +        bdrv_io_limits_intercept(bs, true, nb_sectors);
>>>> +    }
>>>> +
>>>>      ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>>>>
>>>>      if (bs->dirty_bitmap) {
>>>> @@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>>>>      acb->pool->cancel(acb);
>>>>  }
>>>>
>>>> +/* block I/O throttling */
>>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>>> +                 bool is_write, double elapsed_time, uint64_t *wait) {
>>>> +    uint64_t bps_limit = 0;
>>>> +    double   bytes_limit, bytes_base, bytes_res;
>>>> +    double   slice_time, wait_time;
>>>> +
>>>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>>> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>>>> +    } else if (bs->io_limits.bps[is_write]) {
>>>> +        bps_limit = bs->io_limits.bps[is_write];
>>>> +    } else {
>>>> +        if (wait) {
>>>> +            *wait = 0;
>>>> +        }
>>>> +
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    slice_time = bs->slice_end - bs->slice_start;
>>>> +    slice_time /= (NANOSECONDS_PER_SECOND);
>>>> +    bytes_limit = bps_limit * slice_time;
>>>> +    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
>>>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>>> +        bytes_base += bs->nr_bytes[!is_write] - 
>>>> bs->io_base.bytes[!is_write];
>>>> +    }
>>>> +
>>>> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>>>> +
>>>> +    if (bytes_base + bytes_res <= bytes_limit) {
>>>> +        if (wait) {
>>>> +            *wait = 0;
>>>> +        }
>>>> +
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    /* Calc approx time to dispatch */
>>>> +    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>>>> +
>>>> +    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>>> +    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>>>> +    if (wait) {
>>>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>>> +    }
>>>
>>> I'm not quite sure what bs->slice_end really is and what these
>>> calculations do exactly. Looks like magic. Can you add some comments
>>> that explain why slice_end is increased?
>> As you know, when the I/O rate at runtime exceeds the limits,
>> bs->slice_end needs to be extended so that the current statistics
>> can be kept until the timer fires; it is therefore increased, and the
>> amount was tuned based on experimental results.
>>
>>> and how you estimate *wait?
>> The wait time is calculated from the historical bps and iops statistics.
>>
>> bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>>
>> 1.) bytes_base is the number of bytes that have already been
>> read/written; it is obtained from the historical statistics.
>> 2.) bytes_res is the number of remaining bytes that need to be read/written.
>> 3.) (bytes_base + bytes_res) / bps_limit: this expression is used
>> to calculate the total time needed to finish reading/writing all the data.
>>
>> I'm not sure whether this explanation is clear to you.
>
> Yes, I think this makes sense to me.
>
> However, I don't understand why things like 10 * BLOCK_IO_SLICE_TIME or
10 * BLOCK_IO_SLICE_TIME is used to convert a value in seconds to
nanoseconds; it is actually equal to 1 s.
> 3 * BLOCK_IO_SLICE_TIME appear in the code. These numbers are magic for
> me. Are they more or less arbitrary values that happen to work well?
They are used to define the window size of one slice. The slice
determines how close the calculated runtime rate is to the real runtime
rate, so they are tunable values.

>
> Kevin
>



-- 
Regards,

Zhi Yong Wu



reply via email to

[Prev in Thread] Current Thread [Next in Thread]