[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm
From: |
Kevin Wolf |
Subject: |
Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm |
Date: |
Tue, 08 Nov 2011 09:41:06 +0100 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:7.0) Gecko/20110927 Thunderbird/7.0 |
Am 08.11.2011 05:34, schrieb Zhi Yong Wu:
> On Mon, Nov 7, 2011 at 11:18 PM, Kevin Wolf <address@hidden> wrote:
>> Am 03.11.2011 09:57, schrieb Zhi Yong Wu:
>>> Signed-off-by: Zhi Yong Wu <address@hidden>
>>> Signed-off-by: Stefan Hajnoczi <address@hidden>
>>> ---
>>> block.c | 220
>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>> block.h | 1 +
>>> block_int.h | 1 +
>>> 3 files changed, 222 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/block.c b/block.c
>>> index 79e7f09..b2af48f 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -74,6 +74,13 @@ static BlockDriverAIOCB
>>> *bdrv_co_aio_rw_vector(BlockDriverState *bs,
>>> bool is_write);
>>> static void coroutine_fn bdrv_co_do_rw(void *opaque);
>>>
>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>> + bool is_write, double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>> + double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>> + bool is_write, int64_t *wait);
>>> +
>>> static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>> QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>
>>> @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
>>> #endif
>>>
>>> /* throttling disk I/O limits */
>>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>>> +{
>>> + bs->io_limits_enabled = false;
>>> +
>>> + while (qemu_co_queue_next(&bs->throttled_reqs));
>>> +
>>> + if (bs->block_timer) {
>>> + qemu_del_timer(bs->block_timer);
>>> + qemu_free_timer(bs->block_timer);
>>> + bs->block_timer = NULL;
>>> + }
>>> +
>>> + bs->slice_start = 0;
>>> + bs->slice_end = 0;
>>> + bs->slice_time = 0;
>>> + memset(&bs->io_base, 0, sizeof(bs->io_base));
>>> +}
>>> +
>>> static void bdrv_block_timer(void *opaque)
>>> {
>>> BlockDriverState *bs = opaque;
>>> @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
>>> || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
>>> }
>>>
>>> +static void bdrv_io_limits_intercept(BlockDriverState *bs,
>>> + bool is_write, int nb_sectors)
>>> +{
>>> + int64_t wait_time = -1;
>>> +
>>> + if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
>>> + qemu_co_queue_wait(&bs->throttled_reqs);
>>> + }
>>> +
>>> + /* In fact, we hope to keep each request's timing, in FIFO mode. The
>>> next
>>> + * throttled requests will not be dequeued until the current request is
>>> + * allowed to be serviced. So if the current request still exceeds the
>>> + * limits, it will be inserted to the head. All requests followed it
>>> will
>>> + * be still in throttled_reqs queue.
>>> + */
>>> +
>>> + while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
>>> + qemu_mod_timer(bs->block_timer,
>>> + wait_time + qemu_get_clock_ns(vm_clock));
>>> + qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
>>> + }
>>> +
>>> + qemu_co_queue_next(&bs->throttled_reqs);
>>> +}
>>> +
>>> /* check if the path starts with "<protocol>:" */
>>> static int path_has_protocol(const char *path)
>>> {
>>> @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char
>>> *filename, int flags,
>>> bdrv_dev_change_media_cb(bs, true);
>>> }
>>>
>>> + /* throttling disk I/O limits */
>>> + if (bs->io_limits_enabled) {
>>> + bdrv_io_limits_enable(bs);
>>> + }
>>> +
>>> return 0;
>>>
>>> unlink_and_fail:
>>> @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
>>>
>>> bdrv_dev_change_media_cb(bs, false);
>>> }
>>> +
>>> + /*throttling disk I/O limits*/
>>> + if (bs->io_limits_enabled) {
>>> + bdrv_io_limits_disable(bs);
>>> + }
>>> }
>>>
>>> void bdrv_close_all(void)
>>> @@ -1291,6 +1351,11 @@ static int coroutine_fn
>>> bdrv_co_do_readv(BlockDriverState *bs,
>>> return -EIO;
>>> }
>>>
>>> + /* throttling disk read I/O */
>>> + if (bs->io_limits_enabled) {
>>> + bdrv_io_limits_intercept(bs, false, nb_sectors);
>>> + }
>>> +
>>> return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
>>> }
>>>
>>> @@ -1321,6 +1386,11 @@ static int coroutine_fn
>>> bdrv_co_do_writev(BlockDriverState *bs,
>>> return -EIO;
>>> }
>>>
>>> + /* throttling disk write I/O */
>>> + if (bs->io_limits_enabled) {
>>> + bdrv_io_limits_intercept(bs, true, nb_sectors);
>>> + }
>>> +
>>> ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>>>
>>> if (bs->dirty_bitmap) {
>>> @@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>>> acb->pool->cancel(acb);
>>> }
>>>
>>> +/* block I/O throttling */
>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>> + bool is_write, double elapsed_time, uint64_t *wait) {
>>> + uint64_t bps_limit = 0;
>>> + double bytes_limit, bytes_base, bytes_res;
>>> + double slice_time, wait_time;
>>> +
>>> + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>> + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>>> + } else if (bs->io_limits.bps[is_write]) {
>>> + bps_limit = bs->io_limits.bps[is_write];
>>> + } else {
>>> + if (wait) {
>>> + *wait = 0;
>>> + }
>>> +
>>> + return false;
>>> + }
>>> +
>>> + slice_time = bs->slice_end - bs->slice_start;
>>> + slice_time /= (NANOSECONDS_PER_SECOND);
>>> + bytes_limit = bps_limit * slice_time;
>>> + bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
>>> + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>> + bytes_base += bs->nr_bytes[!is_write] -
>>> bs->io_base.bytes[!is_write];
>>> + }
>>> +
>>> + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>>> +
>>> + if (bytes_base + bytes_res <= bytes_limit) {
>>> + if (wait) {
>>> + *wait = 0;
>>> + }
>>> +
>>> + return false;
>>> + }
>>> +
>>> + /* Calc approx time to dispatch */
>>> + wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>>> +
>>> + bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>> + bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>>> + if (wait) {
>>> + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>> + }
>>
>> I'm not quite sure what bs->slice_end really is and what these
>> calculations do exactly. Looks like magic. Can you add some comments
>> that explain why slice_end is increased?
> As you've known, when the I/O rate at runtime exceeds the limits,
> bs->slice_end needs to be extended in order that the current statistic
> info can be kept until the timer fires, so it is increased and tuned
> based on the result of experiment.
>
>> and how you estimate *wait?
> The wait time is calculated based on the history info of bps and iops.
>
> bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>
> 1.) bytes_base is the bytes of data which have been read/written; and
> it is obtained from the history statistic info.
> 2.) bytes_res is the remaining bytes of data which need to be read/written.
> 3.) (bytes_base + bytes_res) / bps_limit, this expression will be used
> to calculate the total time for completing reading/writing all data.
>
> I'm not sure whether this explanation makes it clear.
Yes, I think this makes sense to me.
However, I don't understand why things like 10 * BLOCK_IO_SLICE_TIME or
3 * BLOCK_IO_SLICE_TIME appear in the code. These numbers are magic for
me. Are they more or less arbitrary values that happen to work well?
Kevin
[Qemu-devel] [PATCH v12 4/5] hmp/qmp: add block_set_io_throttle, Zhi Yong Wu, 2011/11/03
[Qemu-devel] [PATCH v12 5/5] block: perf testing report based on block I/O throttling, Zhi Yong Wu, 2011/11/03