
[PATCH RFC v3] cpufreq: schedutil: Make iowait boost more energy efficient


Joel Fernandes

Jul 8, 2017, 6:20:08 AM
Currently the iowait_boost feature in schedutil makes the frequency go to max.
This is to handle a case that Peter described where the through put of
operations involving continuous I/O requests [1].

Instead of going to max, it's also possible to achieve the same effect by
ramping up to max if there are repeated iowait wakeups happening. This patch
is an attempt to do that. We start from the minimum frequency and double the
boost for every consecutive iowait update until we reach the maximum frequency.

I managed to find an Intel machine to test this patch and it is achieving the
desired effect. Also tested on an ARM platform and saw that transient iowait
requests aren't causing frequency spikes there.

[1] https://patchwork.kernel.org/patch/9735885/

Cc: Srinivas Pandruvada <srinivas....@linux.intel.com>
Cc: Len Brown <le...@kernel.org>
Cc: Rafael J. Wysocki <r...@rjwysocki.net>
Cc: Viresh Kumar <viresh...@linaro.org>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Signed-off-by: Joel Fernandes <joe...@google.com>
---
kernel/sched/cpufreq_schedutil.c | 47 +++++++++++++++++++++++++++++++++++-----
1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 622eed1b7658..4a2d424d0c58 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,7 +53,9 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;

+ bool prev_iowait_boost;
unsigned long iowait_boost;
+ unsigned long iowait_boost_min;
unsigned long iowait_boost_max;
u64 last_update;

@@ -168,22 +170,47 @@ static void sugov_get_util(unsigned long *util, unsigned long *max)
*max = cfs_max;
}

+static void sugov_decay_iowait_boost(struct sugov_cpu *sg_cpu)
+{
+ sg_cpu->iowait_boost >>= 1;
+
+ if (sg_cpu->iowait_boost < sg_cpu->iowait_boost_min)
+ sg_cpu->iowait_boost = 0;
+}
+
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
if (flags & SCHED_CPUFREQ_IOWAIT) {
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ /* Remember for next time that we did an iowait boost */
+ sg_cpu->prev_iowait_boost = true;
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost <<= 1;
+ sg_cpu->iowait_boost = min(sg_cpu->iowait_boost,
+ sg_cpu->iowait_boost_max);
+ } else {
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_min;
+ }
} else if (sg_cpu->iowait_boost) {
s64 delta_ns = time - sg_cpu->last_update;

/* Clear iowait_boost if the CPU apprears to have been idle. */
if (delta_ns > TICK_NSEC)
sg_cpu->iowait_boost = 0;
+
+ /*
+ * Since we don't decay iowait_boost when its consumed during
+ * the previous SCHED_CPUFREQ_IOWAIT update, decay it now.
+ */
+ if (sg_cpu->prev_iowait_boost) {
+ sugov_decay_iowait_boost(sg_cpu);
+ sg_cpu->prev_iowait_boost = false;
+ }
}
}

static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
- unsigned long *max)
+ unsigned long *max, unsigned int flags)
{
unsigned long boost_util = sg_cpu->iowait_boost;
unsigned long boost_max = sg_cpu->iowait_boost_max;
@@ -195,7 +222,16 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
*util = boost_util;
*max = boost_max;
}
- sg_cpu->iowait_boost >>= 1;
+
+ /*
+ * Incase iowait boost just happened on this CPU, don't reduce it right
+ * away since then the iowait boost will never increase on subsequent
+ * in_iowait wakeups.
+ */
+ if (flags & SCHED_CPUFREQ_IOWAIT && this_cpu_ptr(&sugov_cpu) == sg_cpu)
+ return;
+
+ sugov_decay_iowait_boost(sg_cpu);
}

#ifdef CONFIG_NO_HZ_COMMON
@@ -233,7 +269,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
next_f = policy->cpuinfo.max_freq;
} else {
sugov_get_util(&util, &max);
- sugov_iowait_boost(sg_cpu, &util, &max);
+ sugov_iowait_boost(sg_cpu, &util, &max, flags);
next_f = get_next_freq(sg_policy, util, max);
/*
* Do not reduce the frequency if the CPU has not been idle
@@ -279,7 +315,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
max = j_max;
}

- sugov_iowait_boost(j_sg_cpu, &util, &max);
+ sugov_iowait_boost(j_sg_cpu, &util, &max, flags);
}

return get_next_freq(sg_policy, util, max);
@@ -612,6 +648,7 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->sg_policy = sg_policy;
sg_cpu->flags = SCHED_CPUFREQ_RT;
+ sg_cpu->iowait_boost_min = policy->cpuinfo.min_freq;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
policy_is_shared(policy) ?
--
2.13.2.725.g09c95d1e9-goog

Joel Fernandes

Jul 8, 2017, 6:30:08 AM
Hi,

On Sat, Jul 8, 2017 at 3:17 AM, Joel Fernandes <joe...@google.com> wrote:
> Currently the iowait_boost feature in schedutil makes the frequency go to max.
> This is to handle a case that Peter described where the through put of
> operations involving continuous I/O requests [1].

Apologies for the incomplete line here, I'll rewrite the commit
message in a subsequent patch.

I meant to say "This is to handle a case that Peter described where
the throughput of operations involving continuous I/O requests is a
victim of low-frequency and ends up getting 'stuck' in a loop as he
described in [1]."

thanks,

-Joel

Joel Fernandes

Jul 9, 2017, 1:10:09 PM
Currently the iowait_boost feature in schedutil makes the frequency go to max.
This feature was added to handle a case that Peter described where the
throughput of operations involving continuous I/O requests [1] is reduced due
to running at a lower frequency; however, the lower throughput itself causes
utilization to be low, which in turn keeps the frequency low, so the workload
gets "stuck".

Instead of going to max, it's also possible to achieve the same effect by
ramping up to max if there are repeated in_iowait wakeups happening. This patch
is an attempt to do that. We start from a lower frequency (iowait_boost_min)
and double the boost for every consecutive iowait update until we reach the
maximum iowait boost frequency (iowait_boost_max).

I ran a synthetic test on an x86 machine with intel_pstate in passive mode
using schedutil. The patch achieves the same desired effect as the existing
behavior. Also tested on an ARM64 platform and saw that transient iowait
requests aren't causing frequency spikes there.

[1] https://patchwork.kernel.org/patch/9735885/

Cc: Srinivas Pandruvada <srinivas....@linux.intel.com>
Cc: Len Brown <le...@kernel.org>
Cc: Rafael J. Wysocki <r...@rjwysocki.net>
Cc: Viresh Kumar <viresh...@linaro.org>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Suggested-by: Peter Zijlstra <pet...@infradead.org>
Signed-off-by: Joel Fernandes <joe...@google.com>
---
Changes since v1:
- not using tunables to plainly turn off iowait boost anymore

kernel/sched/cpufreq_schedutil.c | 52 ++++++++++++++++++++++++++++++++++------
1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 622eed1b7658..4d9e8b96bed1 100644
@@ -245,7 +281,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
sugov_update_commit(sg_policy, time, next_f);
}

-static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
@@ -279,7 +316,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
max = j_max;
}

- sugov_iowait_boost(j_sg_cpu, &util, &max);
+ sugov_iowait_boost(j_sg_cpu, &util, &max, flags);
}

return get_next_freq(sg_policy, util, max);
@@ -308,7 +345,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
if (flags & SCHED_CPUFREQ_RT_DL)
next_f = sg_policy->policy->cpuinfo.max_freq;
else
- next_f = sugov_next_freq_shared(sg_cpu, time);
+ next_f = sugov_next_freq_shared(sg_cpu, time, flags);

sugov_update_commit(sg_policy, time, next_f);
}
@@ -612,6 +649,7 @@ static int sugov_start(struct cpufreq_policy *policy)

Juri Lelli

Jul 10, 2017, 7:00:07 AM
Hi Joel,
In this case we might just also want to reset prev_iowait_boost
unconditionally and return.

> +
> + /*
> + * Since we don't decay iowait_boost when its consumed during
> + * the previous SCHED_CPUFREQ_IOWAIT update, decay it now.
> + */

Code below seems pretty self-explaining to me. :)

> + if (sg_cpu->prev_iowait_boost) {
> + sugov_decay_iowait_boost(sg_cpu);
> + sg_cpu->prev_iowait_boost = false;
> + }
> }
> }
>
> static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
> - unsigned long *max)
> + unsigned long *max, unsigned int flags)
> {
> unsigned long boost_util = sg_cpu->iowait_boost;
> unsigned long boost_max = sg_cpu->iowait_boost_max;
> @@ -195,7 +222,16 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
> *util = boost_util;
> *max = boost_max;
> }
> - sg_cpu->iowait_boost >>= 1;
> +
> + /*
> + * Incase iowait boost just happened on this CPU, don't reduce it right
> + * away since then the iowait boost will never increase on subsequent
> + * in_iowait wakeups.
> + */
> + if (flags & SCHED_CPUFREQ_IOWAIT && this_cpu_ptr(&sugov_cpu) == sg_cpu)
> + return;
> +
> + sugov_decay_iowait_boost(sg_cpu);

Mmm, do we need to decay even when we are computing frequency requests
for a domain?

Considering it a per-cpu thing, isn't it enough that it gets bumped up or
decayed only when a CPU does an update (by using the above from
sugov_update_shared)?

If we go this way I think we will only need to reset prev_iowait_boost
if delta_ns > TICK_NSEC during the for_each_cpu() loop of
sugov_next_freq_shared().

Thanks,

- Juri

Joel Fernandes

Jul 11, 2017, 1:20:06 AM
Hi Juri,
[..]
Ok, will do.

>> +
>> + /*
>> + * Since we don't decay iowait_boost when its consumed during
>> + * the previous SCHED_CPUFREQ_IOWAIT update, decay it now.
>> + */
>
> Code below seems pretty self-explaining to me. :)
>

Ok :)
Actually the "decay" was already being done before (without this
patch), I am just preserving the same old behavior where we do decay.
Perhaps your proposal can be a separate match? Or did I miss something
else subtle here?

Thanks,

-Joel

Juri Lelli

Jul 11, 2017, 3:20:09 AM
On 10/07/17 22:12, Joel Fernandes wrote:
> Hi Juri,
>
> On Mon, Jul 10, 2017 at 3:55 AM, Juri Lelli <juri....@arm.com> wrote:
> > Hi Joel,
> >
> > On 09/07/17 10:08, Joel Fernandes wrote:

[...]
True, we are currently decaying anyway.

Looking again at this path made me however think if we really need to. I
guess we need currently, as we bump frequency to max and then decay it.
But, with your changes, I was wondering if we can simplify the thing and
decay only on the per-CPU update path.

The other reason for trying to simplify this is that I don't
particularly like adding and consuming a flags argument at this point, but
I guess we could refactor the code if this is really a problem.

Thanks,

- Juri

Viresh Kumar

Jul 11, 2017, 6:20:08 AM
I am not sure if boost should start from the min frequency, as the current
frequency will at least be equal to that. Which means that with no boost
initially, we will never increase the frequency for the first IOWAIT flag event.

> + }
> } else if (sg_cpu->iowait_boost) {
> s64 delta_ns = time - sg_cpu->last_update;
>
> /* Clear iowait_boost if the CPU apprears to have been idle. */
> if (delta_ns > TICK_NSEC)
> sg_cpu->iowait_boost = 0;
> +
> + /*
> + * Since we don't decay iowait_boost when its consumed during
> + * the previous SCHED_CPUFREQ_IOWAIT update, decay it now.
> + */
> + if (sg_cpu->prev_iowait_boost) {

SCHED_CPUFREQ_IOWAIT flag is set only by CFS from the enqueue_task() and in many
cases we call the util hook twice from the same enqueue_task() instance before
returning (2nd one after updating util). And in such cases we will set
iowait_boost as 0 on the second call.

Have you ever seen two consecutive calls to sugov_set_iowait_boost() with IOWAIT
flag set ? Can we get the ratio of that against the other case where we have
IOWAIT flag set in first call, followed by one or more non-IOWAIT calls and then
IOWAIT again ?

I am asking because if the calls with IOWAIT flag aren't consecutive then we
will make iowait_boost as 0 in the next non-IOWAIT call.

--
viresh

Peter Zijlstra

Jul 11, 2017, 7:50:08 AM
On Sun, Jul 09, 2017 at 10:08:26AM -0700, Joel Fernandes wrote:
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -53,7 +53,9 @@ struct sugov_cpu {
> struct update_util_data update_util;
> struct sugov_policy *sg_policy;
>
> + bool prev_iowait_boost;
> unsigned long iowait_boost;


So I absolutely detest 'bool' in composite types, but I see there's
already some of that in this file :/

Peter Zijlstra

Jul 11, 2017, 7:50:08 AM
I suspect this actually works for Joel to get rid of the transient
spikes he was seeing. Starting at the current freq, as you suggest,
appears to make sense, but would add immediate transients back.

Joel Fernandes

Jul 11, 2017, 10:20:08 AM
On Tue, Jul 11, 2017 at 7:14 AM, Joel Fernandes <joe...@google.com> wrote:
[..]
>>> + }
>>> } else if (sg_cpu->iowait_boost) {
>>> s64 delta_ns = time - sg_cpu->last_update;
>>>
>>> /* Clear iowait_boost if the CPU apprears to have been idle. */
>>> if (delta_ns > TICK_NSEC)
>>> sg_cpu->iowait_boost = 0;
>>> +
>>> + /*
>>> + * Since we don't decay iowait_boost when its consumed during
>>> + * the previous SCHED_CPUFREQ_IOWAIT update, decay it now.
>>> + */
>>> + if (sg_cpu->prev_iowait_boost) {
>>
>> SCHED_CPUFREQ_IOWAIT flag is set only by CFS from the enqueue_task() and in many
>> cases we call the util hook twice from the same enqueue_task() instance before
>> returning (2nd one after updating util). And in such cases we will set
>> iowait_boost as 0 on the second call.
>>
>> Have you ever seen two consecutive calls to sugov_set_iowait_boost() with IOWAIT
>> flag set ? Can we get the ratio of that against the other case where we have
>> IOWAIT flag set in first call, followed by one or more non-IOWAIT calls and then
>> IOWAIT again ?
>>
>> I am asking because if the calls with IOWAIT flag aren't consecutive then we
>> will make iowait_boost as 0 in the next non-IOWAIT call.
>
> Yes, I've seen that happen in my testing (consecutive iowait). I
> haven't seen the other case where you have IOWAIT followed by
> non-IOWAIT for a repeated set of IOWAIT requests. Would you be more
> comfortable if we moved sugov_set_iowait_boost() after
> sugov_should_update_freq()? That way if there are consecutive
> requests in the same path, then most likely rate-limiting will
> prevent such updates. I will also try to collect some stats as you
> suggested to see how often, if at all, this can happen.

Just to be more clear, I was saying that I've only seen repeated
IOWAIT requests in the update path, not IOWAIT followed by non-IOWAIT
cpufreq updates for a repeated sequence of IOWAIT wakeups.

thanks,

-Joel

Joel Fernandes

Jul 11, 2017, 10:20:08 AM
Hi Viresh,
I think the whole point of IOWAIT boost was to solve the issue with a
long sequence of repeated I/O requests as described in the commit
message. So IIUC there isn't a usecase for that (increase freq. on
first request). Also it's just for the first couple of requests in my
testing and doesn't hurt the performance at all for the intended
usecase while still not causing transient spikes.

Another approach than setting min in sugov_set_iowait_boost is, since
we have already retrieved the current util, we can check if flags ==
SCHED_CPUFREQ_IOWAIT, then set the initial iowait_boost such that
(iowait_boost / iowait_boost_max) is at least equal to (util / max), or
iowait_boost_min, whichever is lower. This still will not increase
frequency on the first request, but will ensure the next one will
benefit. Then again I fear running into slightly longer-term
transients where 2 iowait requests are enough to boost the frequency
to high values. I can try to measure how often this can hurt power for
common usecases if you agree it's worth exploring.

>
>> + }
>> } else if (sg_cpu->iowait_boost) {
>> s64 delta_ns = time - sg_cpu->last_update;
>>
>> /* Clear iowait_boost if the CPU apprears to have been idle. */
>> if (delta_ns > TICK_NSEC)
>> sg_cpu->iowait_boost = 0;
>> +
>> + /*
>> + * Since we don't decay iowait_boost when its consumed during
>> + * the previous SCHED_CPUFREQ_IOWAIT update, decay it now.
>> + */
>> + if (sg_cpu->prev_iowait_boost) {
>
> SCHED_CPUFREQ_IOWAIT flag is set only by CFS from the enqueue_task() and in many
> cases we call the util hook twice from the same enqueue_task() instance before
> returning (2nd one after updating util). And in such cases we will set
> iowait_boost as 0 on the second call.
>
> Have you ever seen two consecutive calls to sugov_set_iowait_boost() with IOWAIT
> flag set ? Can we get the ratio of that against the other case where we have
> IOWAIT flag set in first call, followed by one or more non-IOWAIT calls and then
> IOWAIT again ?
>
> I am asking because if the calls with IOWAIT flag aren't consecutive then we
> will make iowait_boost as 0 in the next non-IOWAIT call.

Yes, I've seen that happen in my testing (consecutive iowait). I
haven't seen the other case where you have IOWAIT followed by
non-IOWAIT for a repeated set of IOWAIT requests. Would you be more
comfortable if we moved sugov_set_iowait_boost() after
sugov_should_update_freq()? That way if there are consecutive
requests in the same path, then most likely rate-limiting will
prevent such updates. I will also try to collect some stats as you
suggested to see how often, if at all, this can happen.

thanks,

-Joel

Juri Lelli

Jul 11, 2017, 10:40:08 AM
On 11/07/17 07:33, Joel Fernandes wrote:
> Hi Juri,
>
> On Tue, Jul 11, 2017 at 12:14 AM, Juri Lelli <juri....@arm.com> wrote:
> [..]
> >> > Considering it a per-cpu thing, isn't it enough that it gets bumped up or
> >> > decayed only when a CPU does an update (by using the above from
> >> > sugov_update_shared)?
> >> >
> >> > If we go this way I think we will only need to reset prev_iowait_boost
> >> > if delta_ns > TICK_NSEC during the for_each_cpu() loop of
> >> > sugov_next_freq_shared().
> >> >
> >>
> >> Actually the "decay" was already being done before (without this
> >> patch), I am just preserving the same old behavior where we do decay.
> >> Perhaps your proposal can be a separate patch? Or did I miss something
> >> else subtle here?
> >>
> >
> > True, we are currently decaying anyway.
> >
> > Looking again at this path made me however think if we really need to. I
> > guess we need currently, as we bump frenquency to max and then decay it.
> > But, with your changes, I was wondering if we can simplify the thing and
> > decay only on the per-CPU update path.
>
> Yes that makes sense to me, but even if we're at max, we should still
> benefit from your idea right? (I didn't follow why being at min makes
> the idea better, I think its a good idea in both cases (whether we are
> boosting to max and ramping down or starting from the min) but let me
> know if I missed something)
>

Not better, but I thought that if we start from max and decay, we
probably need more chances to decay faster (so decaying also
when aggregating requests for a domain is probably needed)?

> If I understand correctly what you're proposing:
> 1. Remove the decay from sugov_iowait_boost
> 2. Add the iowait_boost decay unconditionally to
> sugov_set_iowait_boost for the !SCHED_CPUFREQ_IOWAIT case.
>
> That would also get rid of the prev_iowait_boost flag and simplify
> things, and also address Peter's concern of adding 'bool' in composite
> types :)
>

OK, just a suggestion however, didn't play with the idea myself, it
might not work. :)

Thanks,

- Juri

Joel Fernandes

Jul 11, 2017, 10:40:09 AM
Hi Juri,

On Tue, Jul 11, 2017 at 12:14 AM, Juri Lelli <juri....@arm.com> wrote:
[..]
>> > Considering it a per-cpu thing, isn't it enough that it gets bumped up or
>> > decayed only when a CPU does an update (by using the above from
>> > sugov_update_shared)?
>> >
>> > If we go this way I think we will only need to reset prev_iowait_boost
>> > if delta_ns > TICK_NSEC during the for_each_cpu() loop of
>> > sugov_next_freq_shared().
>> >
>>
>> Actually the "decay" was already being done before (without this
>> patch), I am just preserving the same old behavior where we do decay.
>> Perhaps your proposal can be a separate patch? Or did I miss something
>> else subtle here?
>>
>
> True, we are currently decaying anyway.
>
> Looking again at this path made me however think if we really need to. I
> guess we need currently, as we bump frequency to max and then decay it.
> But, with your changes, I was wondering if we can simplify the thing and
> decay only on the per-CPU update path.

Yes that makes sense to me, but even if we're at max, we should still
benefit from your idea right? (I didn't follow why being at min makes
the idea better, I think its a good idea in both cases (whether we are
boosting to max and ramping down or starting from the min) but let me
know if I missed something)

If I understand correctly what you're proposing:
1. Remove the decay from sugov_iowait_boost
2. Add the iowait_boost decay unconditionally to
sugov_set_iowait_boost for the !SCHED_CPUFREQ_IOWAIT case.

That would also get rid of the prev_iowait_boost flag and simplify
things, and also address Peter's concern of adding 'bool' in composite
types :)

thanks,

-Joel

Viresh Kumar

Jul 12, 2017, 1:10:08 AM
On 11-07-17, 07:14, Joel Fernandes wrote:
> I think the whole point of IOWAIT boost was to solve the issue with a
> long sequence of repeated I/O requests as described in the commit
> message. So IIUC there isn't a usecase for that (increase freq. on
> first request).

Right. So we can take the example that Peter gave earlier. The task runs .1 ms
and waits for IO for 1 ms (at max speed). But there is a high possibility that
the util update handler gets called within that 1 ms (from non-enqueue paths),
and because you chose to reduce the iowait boost from sugov_set_iowait_boost()
in your patch, we can easily end up ignoring iowait boosting.

> Also its just for the first couple of requests in my
> testing and doesn't hurt the performance at all for the intended
> usecase while still not causing transient spikes.

We can have bad enough timing where the util handler gets called right in that 1
ms of IOWAIT period and we will never boost.

> Another approach than setting min in sugov_set_iowait_boost, is, since
> we have already retrieved the current util, we can check if flags ==
> SCHED_CPUFREQ_IOWAIT, then set the initial iowait_boost such that
> (iowait_boost / iowait_boost_max) is at least equal to (util / max), or
> iowait_boost_min, whichever is lower.

So my concerns weren't only about the initial min value, but also that you
reduce the freq from sugov_set_iowait_boost(). We can discuss what the ideal
value to start with can be.

> This still will not increase
> frequency on the first request, but will ensure the next one will
> benefit.

Only if no non-enqueue-path request lands in between.

> Yes, I've seen that happen in my testing (consecutive iowait).

The CFS scheduler can send a util update request every 1 ms, and
I am not sure why that isn't happening in your case.

How long is the time between two consecutive IOWAIT requests in your case?
Maybe it is too short (Of course it isn't in your control :). But if we take
Peter's example, then it will surely have a non-enqueue path request between two
IOWAIT requests.

> I
> haven't seen the other case where you have IOWAIT followed by
> non-IOWAIT for a repeated set of IOWAIT requests. Would you be more
> comfortable if we moved sugov_set_iowait_boost() after
> sugov_should_update_freq()?

That may make us ignore all IOWAIT requests that happen between rate_limit_us
time. And that would be bad IMHO.

> That way if there are consecutive
> requests in the same path, then most likely rate-limiting will
> prevent such updates. I will also try to collect some stats as you
> suggested to see how often, if at all, this can happen.

Completely untested, but what about something like this ? This should get rid of
the spikes you were getting.

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 076a2e31951c..3459f327c94e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,6 +53,7 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;

+ bool iowait_boost_pending;
unsigned long iowait_boost;
unsigned long iowait_boost_max;
u64 last_update;
@@ -169,7 +170,12 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
if (flags & SCHED_CPUFREQ_IOWAIT) {
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ sg_cpu->iowait_boost_pending = true;
+
+ if (!sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->cur;
+ sg_cpu->iowait_boost >>= 1;
+ }
} else if (sg_cpu->iowait_boost) {
s64 delta_ns = time - sg_cpu->last_update;

@@ -182,17 +188,26 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
unsigned long *max)
{
- unsigned long boost_util = sg_cpu->iowait_boost;
- unsigned long boost_max = sg_cpu->iowait_boost_max;
+ unsigned long boost_util, boost_max;

- if (!boost_util)
+ if (!sg_cpu->iowait_boost)
return;

+ if (sg_cpu->iowait_boost_pending) {
+ sg_cpu->iowait_boost_pending = false;
+ sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1,
+ sg_cpu->iowait_boost_max);
+ } else {
+ sg_cpu->iowait_boost >>= 1;
+ }
+
+ boost_util = sg_cpu->iowait_boost;
+ boost_max = sg_cpu->iowait_boost_max;
+
if (*util * boost_max < *max * boost_util) {
*util = boost_util;
*max = boost_max;
}
- sg_cpu->iowait_boost >>= 1;
}

#ifdef CONFIG_NO_HZ_COMMON

--
viresh

Peter Zijlstra

Jul 12, 2017, 5:40:06 AM
On Wed, Jul 12, 2017 at 10:30:35AM +0530, Viresh Kumar wrote:
> On 11-07-17, 07:14, Joel Fernandes wrote:

> > Another approach than setting min in sugov_set_iowait_boost, is, since
> > we have already retrieved the current util, we can check if flags ==
> > SCHED_CPUFREQ_IOWAIT, then set the initial iowait_boost such that
> > (iowait_boost / iowait_boost_max) is at least equal to (util / max), or
> > iowait_boost_min, whichever is lower.
>
> So my concerns weren't only about the initial min value, but also that you
> reduce the freq from sugov_set_iowait_boost(). We can discuss what the ideal
> value to start with can be.

I'm not sure I see that. He only mucks with iowait_boost, not the actual
frequency afaict.

And sugov_iowait_boost() picks the highest of util vs iowait_boost,
which wasn't changed.

Or am I completely missing something? (that code is a bit hard to
follow)

Viresh Kumar

Jul 12, 2017, 5:50:08 AM
No, I wasn't clear enough. Sorry about that. Lemme try again:

Suppose min freq is 500 MHz and Max is 2 GHz. The iowait-boost is
set to 1 GHz right now (because of previous events with IOWAIT flag
set), and sugov_set_iowait_boost() gets called again with IOWAIT flag,
we boost the iowait-boost value to 2 GHz. We are in the rate_limit_us
window right now, we return without changing the frequency.

If the next call into the schedutil governor happens due to normal
util-update, flags will be passed as 0. With the current patch, we
will bring iowait-boost back to 1 GHz (before updating the real
frequency to 2 GHz) as the prev-iowait-boost boolean would be set.

And even if the task is periodically getting queued after IOWAIT,
actual boosting may not happen at all in some cases.

--
viresh

Peter Zijlstra

Jul 12, 2017, 9:30:09 AM
On Wed, Jul 12, 2017 at 03:16:23PM +0530, Viresh Kumar wrote:

> No, I wasn't clear enough. Sorry about that. Lemme try again:
>
> Suppose min freq is 500 MHz and Max is 2 GHz. The iowait-boost is
> set to 1 GHz right now (because of previous events with IOWAIT flag
> set), and sugov_set_iowait_boost() gets called again with IOWAIT flag,
> we boost the iowait-boost value to 2 GHz. We are in the rate_limit_us
> window right now, we return without changing the frequency.
>
> If the next call into the schedutil governor happens due to normal
> util-update, flags will be passed as 0. With the current patch, we
> will bring iowait-boost back to 1 GHz (before updating the real
> frequency to 2 GHz) as the prev-iowait-boost boolean would be set.
>
> And even if the task is periodically getting queued after IOWAIT,
> actual boosting may not happen at all in some cases.

Hmm, so you're worried about that ratelimit stuff? Shouldn't we fix that
independently -- IIRC Rafael proposed a max-filter over the window.

Joel Fernandes

Jul 12, 2017, 10:10:09 PM
Hi Viresh,

On Tue, Jul 11, 2017 at 10:00 PM, Viresh Kumar <viresh...@linaro.org> wrote:
[..]
>> Another approach than setting min in sugov_set_iowait_boost, is, since
>> we have already retrieved the current util, we can check if flags ==
>> SCHED_CPUFREQ_IOWAIT, then set the initial iowait_boost such that
>> (iowait_boost / iowait_boost_max) is at least equal to (util / max), or
>> iowait_boost_min, whichever is lower.
>
> So my concerns weren't only about the initial min value, but also that you
> reduce the freq from sugov_set_iowait_boost(). We can discuss what the ideal
> value to start with can be.
>
>> This still will not increase
>> frequency on the first request, but will ensure the next one will
>> benefit.
>
> Only if no non-enqueue-path request lands in between.
>
>> Yes, I've seen that happen in my testing (consecutive iowait).
>
> The CFS scheduler can send a util update request every 1 ms for util updates and
> I am not sure why that isn't happening in your case.
>
> How much is the time between two consecutive IOWAIT requests in your case ?
> Maybe it is too short (Of course it isn't in your control :). But if we take
> Peter's example, then it will surely have a non-enqueue path request between two
> IOWAIT requests.

Hmm, Ok. I can try to do some measurements about consecutive calls
soon and let you know how often it happens. I also noticed it's
possible to call twice in the enqueue path itself as well. It probably
affects my patch more because it starts from min rather than max.

>> I
>> haven't seen the other case where you have IOWAIT followed by
>> non-IOWAIT for a repeated set of IOWAIT requests. Would you be more
>> comfortable if we moved sugov_set_iowait_boost() after
>> sugov_should_update_freq()?
>
> That may make us ignore all IOWAIT requests that happen between rate_limit_us
> time. And that would be bad IMHO.

True..

>> That way if there are consecutive
>> requests in the same path, then most likely rate-limiting will
>> prevent such updates. I will also try to collect some stats as you
>> suggested to see how often, if at all, this can happen.
>
> Completely untested, but what about something like this ? This should get rid of
> the spikes you were getting.

I actually like your approach of consuming the flag first but
computing the boost later when it's used. It's also more immune to the
problem you described. Some comments below:

>
> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> index 076a2e31951c..3459f327c94e 100644
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -53,6 +53,7 @@ struct sugov_cpu {
> struct update_util_data update_util;
> struct sugov_policy *sg_policy;
>
> + bool iowait_boost_pending;
> unsigned long iowait_boost;
> unsigned long iowait_boost_max;
> u64 last_update;
> @@ -169,7 +170,12 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
> unsigned int flags)
> {
> if (flags & SCHED_CPUFREQ_IOWAIT) {
> - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
> + sg_cpu->iowait_boost_pending = true;
> +
> + if (!sg_cpu->iowait_boost) {
> + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->cur;
> + sg_cpu->iowait_boost >>= 1;
> + }

Hmm, this doesn't look right to me.. why are we decaying in this path?
I think nothing else other than setting the pending flag and the
initial iowait_boost value is needed here.

Also I feel this is more "spike" prone than setting it initially to
min. As Peter was saying, since we apply the boost only if it
increases the frequency and never to decrease it, starting from min
should at worst just result in the boost being ignored the first time.

> } else if (sg_cpu->iowait_boost) {
> s64 delta_ns = time - sg_cpu->last_update;
>
> @@ -182,17 +188,26 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
> static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
> unsigned long *max)
> {
> - unsigned long boost_util = sg_cpu->iowait_boost;
> - unsigned long boost_max = sg_cpu->iowait_boost_max;
> + unsigned long boost_util, boost_max;
>
> - if (!boost_util)
> + if (!sg_cpu->iowait_boost)
> return;
>
> + if (sg_cpu->iowait_boost_pending) {
> + sg_cpu->iowait_boost_pending = false;
> + sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1,
> + sg_cpu->iowait_boost_max);
> + } else {
> + sg_cpu->iowait_boost >>= 1;
> + }

And then this path will do the decay correctly when the boost is applied.

thanks,

-Joel

Viresh Kumar
Jul 12, 2017, 11:40:08 PM

On 12-07-17, 19:02, Joel Fernandes wrote:
> On Tue, Jul 11, 2017 at 10:00 PM, Viresh Kumar <viresh...@linaro.org> wrote:

> Hmm, Ok. I can try to do some measurements about consecutive calls
> soon and let you know how often it happens. I also noticed its
> possible to call twice in the enqueue path itself as well.

Yeah, I think I told you that in previous replies.

> It probably
> affect my patch more because of starting from min than max.

Yeah, it will.

> > diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> > index 076a2e31951c..3459f327c94e 100644
> > --- a/kernel/sched/cpufreq_schedutil.c
> > +++ b/kernel/sched/cpufreq_schedutil.c
> > @@ -53,6 +53,7 @@ struct sugov_cpu {
> > struct update_util_data update_util;
> > struct sugov_policy *sg_policy;
> >
> > + bool iowait_boost_pending;
> > unsigned long iowait_boost;
> > unsigned long iowait_boost_max;
> > u64 last_update;
> > @@ -169,7 +170,12 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
> > unsigned int flags)
> > {
> > if (flags & SCHED_CPUFREQ_IOWAIT) {
> > - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
> > + sg_cpu->iowait_boost_pending = true;
> > +
> > + if (!sg_cpu->iowait_boost) {
> > + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->cur;
> > + sg_cpu->iowait_boost >>= 1;
> > + }
>
> Hmm, this doesn't look right to me.. why are we decaying in this path?

I am convinced that we need a comment here as what I did wasn't
straightforward :)

The idea: We wouldn't increase the frequency for the first event with
IOWAIT flag set, but on every subsequent event (captured over
rate-limit-us window). You may have noticed that I am updating the
boost values in sugov_iowait_boost() a bit earlier now and they will
affect the current frequency update as well.

Because I wanted to do a 2X there unconditionally if
iowait_boost_pending is set, I had to halve it for the very first
event with the IOWAIT flag set.

End result:
- First event, we stay at current freq.
- Second and all consecutive events every rate_limit_us time, 2X
- If there is no IOWAIT event in last rate_limit_us, X/2

Makes sense?
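As a sanity check, the scheme described above can be sketched as a tiny simulation (plain Python, not kernel code; the frequency numbers are hypothetical, and each loop iteration stands in for one rate_limit_us window):

```python
def simulate(events, cur=1000, boost_max=8000):
    """events: one bool per rate-limited update window, True when the
    IOWAIT flag was seen. Returns the boost applied in each window."""
    boost = 0
    pending = False
    applied = []
    for iowait in events:
        # sugov_set_iowait_boost(): start at cur/2 so the unconditional
        # doubling below lands back on the current frequency
        if iowait:
            pending = True
            if not boost:
                boost = cur >> 1
        if not boost:
            applied.append(0)
            continue
        # sugov_iowait_boost(): 2x on a fresh IOWAIT window, halve otherwise
        if pending:
            pending = False
            boost = min(boost << 1, boost_max)
        else:
            boost >>= 1
        applied.append(boost)
    return applied

# First event: stay at cur; consecutive events: 2x; no event: x/2
print(simulate([True, True, True, False]))  # [1000, 2000, 4000, 2000]
```

This reproduces the "first event stays at current freq, then 2X, then X/2" end result stated above.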

> I think nothing else other than setting the pending flag and the
> initial iowait_boost value is needed here.
>
> Also I feel this is more "spike" prone than setting it initially to
> min. As Peter was saying, since we apply the boost only if it
> increases the frequency and not decreases, starting from min should at
> worst just result in ignoring of the boost the first time.

Yeah, we can discuss what the default value to start with should be,
and I would be fine with "min" if Rafael is, as he proposed the iowait
thing to begin with after seeing some real issues on hardware.

> > } else if (sg_cpu->iowait_boost) {
> > s64 delta_ns = time - sg_cpu->last_update;
> >
> > @@ -182,17 +188,26 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
> > static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
> > unsigned long *max)
> > {
> > - unsigned long boost_util = sg_cpu->iowait_boost;
> > - unsigned long boost_max = sg_cpu->iowait_boost_max;
> > + unsigned long boost_util, boost_max;
> >
> > - if (!boost_util)
> > + if (!sg_cpu->iowait_boost)
> > return;
> >
> > + if (sg_cpu->iowait_boost_pending) {
> > + sg_cpu->iowait_boost_pending = false;
> > + sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1,

I was talking about this unconditional 2X earlier.

> > + sg_cpu->iowait_boost_max);
> > + } else {
> > + sg_cpu->iowait_boost >>= 1;
> > + }
>
> And then this path will do the decay correctly when the boost is applied.

Yeah, the else part should do good.

--
viresh

Viresh Kumar
Jul 12, 2017, 11:50:08 PM

On 12-07-17, 15:28, Peter Zijlstra wrote:
> Hmm, so you're worried about that ratelimit stuff?

Yeah, somewhat.

> Shouldn't we fix that
> independently -- IIRC Rafael proposed a max-filter over the window.

I had some doubts about that idea in general and shared them earlier
with you:

https://marc.info/?l=linux-kernel&m=149683722822537&w=2

But anyway, I think Joel is somewhat convinced to adopt the solution
I gave, which wouldn't have any of the problems I shared.
All good now :)

--
viresh

Joel Fernandes
Jul 13, 2017, 12:00:07 AM

On Wed, Jul 12, 2017 at 8:35 PM, Viresh Kumar <viresh...@linaro.org> wrote:
[..]
Yes, that makes sense, it's a bit subtle but I get what you're doing
now and I agree with it. It's also cleaner than my original patch :-)
and yeah, it definitely needs a comment ;-)

thanks,

-Joel

Viresh Kumar
Jul 13, 2017, 12:20:07 AM

On 12-07-17, 20:52, Joel Fernandes wrote:
> Yes, that makes sense, its a bit subtle but I get what you're doing
> now and I agree with it. Its also cleaner than my original patch :-)
> and yeah definitely needs a comment ;-)

And I have full trust in you to include that comment in your V5 :)

--
viresh

Joel Fernandes
Jul 13, 2017, 12:20:07 AM

You bet :)


thanks,

-Joel

Joel Fernandes
Jul 16, 2017, 4:10:09 AM

Currently the iowait_boost feature in schedutil makes the frequency go to max
on iowait wakeups. This feature was added to handle a case that Peter
described where the throughput of operations involving continuous I/O requests
[1] is reduced due to running at a lower frequency; the lower
throughput itself keeps utilization low, which in turn keeps the
frequency low, so the system is "stuck".

Instead of going to max, it's also possible to achieve the same effect by
ramping up to max if there are repeated in_iowait wakeups happening. This patch
is an attempt to do that. We start from a lower frequency (policy->mind)
and double the boost for every consecutive iowait update until we reach the
maximum iowait boost frequency (iowait_boost_max).

I ran a synthetic test (continuous O_DIRECT writes in a loop) on an x86 machine
with intel_pstate in passive mode using schedutil. In this test the iowait_boost
value ramped from 800MHz to 4GHz in 60ms. The patch achieves the same improved
throughput as the existing behavior.

Also while at it, make iowait_boost and iowait_boost_max unsigned int since
their unit is kHz, which is consistent with struct cpufreq_policy.

[1] https://patchwork.kernel.org/patch/9735885/

Cc: Srinivas Pandruvada <srinivas....@linux.intel.com>
Cc: Len Brown <le...@kernel.org>
Cc: Rafael J. Wysocki <r...@rjwysocki.net>
Cc: Viresh Kumar <viresh...@linaro.org>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Suggested-by: Peter Zijlstra <pet...@infradead.org>
Signed-off-by: Joel Fernandes <joe...@google.com>
---
This version is based on some ideas from Viresh and Juri in v4. Viresh, one
difference from the idea we just discussed is that I am scaling the boost up
or down only after consuming it. This has the effect of slightly delaying the
"deboost" but achieves the same boost ramp time. It's cleaner in the code IMO
to avoid scaling up and then down on the initial boost. Note that I also
dropped iowait_boost_min and now I'm just starting the initial boost from
policy->min since, as I mentioned in the commit above, the ramp of the
iowait_boost value is very quick and for the usecase it's intended for, it
works fine. Hope this is acceptable. Thanks.

kernel/sched/cpufreq_schedutil.c | 31 +++++++++++++++++++++++--------
1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 622eed1b7658..4225bbada88d 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,8 +53,9 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;

- unsigned long iowait_boost;
- unsigned long iowait_boost_max;
+ bool iowait_boost_pending;
+ unsigned int iowait_boost;
+ unsigned int iowait_boost_max;
u64 last_update;

/* The fields below are only needed when sharing a policy. */
@@ -172,30 +173,43 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
if (flags & SCHED_CPUFREQ_IOWAIT) {
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ sg_cpu->iowait_boost_pending = true;
+ sg_cpu->iowait_boost = max(sg_cpu->iowait_boost,
+ sg_cpu->sg_policy->policy->min);
} else if (sg_cpu->iowait_boost) {
s64 delta_ns = time - sg_cpu->last_update;

/* Clear iowait_boost if the CPU apprears to have been idle. */
- if (delta_ns > TICK_NSEC)
+ if (delta_ns > TICK_NSEC) {
sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_pending = false;
+ }
}
}

static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
unsigned long *max)
{
- unsigned long boost_util = sg_cpu->iowait_boost;
- unsigned long boost_max = sg_cpu->iowait_boost_max;
+ unsigned long boost_util, boost_max;

- if (!boost_util)
+ if (!sg_cpu->iowait_boost)
return;

+ boost_util = sg_cpu->iowait_boost;
+ boost_max = sg_cpu->iowait_boost_max;
+
if (*util * boost_max < *max * boost_util) {
*util = boost_util;
*max = boost_max;
}
- sg_cpu->iowait_boost >>= 1;
+
+ if (sg_cpu->iowait_boost_pending) {
+ sg_cpu->iowait_boost_pending = false;
+ sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1,
+ sg_cpu->iowait_boost_max);
+ } else {
+ sg_cpu->iowait_boost >>= 1;
+ }
}

#ifdef CONFIG_NO_HZ_COMMON
@@ -267,6 +281,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
+ j_sg_cpu->iowait_boost_pending = false;
continue;
}
if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
--
2.13.2.932.g7449e964c-goog

Viresh Kumar
Jul 17, 2017, 4:10:07 AM

On 16-07-17, 01:04, Joel Fernandes wrote:
> Currently the iowait_boost feature in schedutil makes the frequency go to max
> on iowait wakeups. This feature was added to handle a case that Peter
> described where the throughput of operations involving continuous I/O requests
> [1] is reduced due to running at a lower frequency, however the lower
> throughput itself causes utilization to be low and hence causing frequency to
> be low hence its "stuck".
>
> Instead of going to max, its also possible to achieve the same effect by
> ramping up to max if there are repeated in_iowait wakeups happening. This patch
> is an attempt to do that. We start from a lower frequency (policy->mind)

s/mind/min/
We don't really need to clear this flag here as we are already
setting iowait_boost to 0, and that's what we check when using the boost.

> }
> }
>
> static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
> unsigned long *max)
> {
> - unsigned long boost_util = sg_cpu->iowait_boost;
> - unsigned long boost_max = sg_cpu->iowait_boost_max;
> + unsigned long boost_util, boost_max;
>
> - if (!boost_util)
> + if (!sg_cpu->iowait_boost)
> return;
>
> + boost_util = sg_cpu->iowait_boost;
> + boost_max = sg_cpu->iowait_boost_max;
> +

The above changes are not required anymore (and were required only
with my patch).

> if (*util * boost_max < *max * boost_util) {
> *util = boost_util;
> *max = boost_max;
> }
> - sg_cpu->iowait_boost >>= 1;
> +
> + if (sg_cpu->iowait_boost_pending) {
> + sg_cpu->iowait_boost_pending = false;
> + sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1,
> + sg_cpu->iowait_boost_max);

Now this has a problem. We will also boost after waiting for
rate_limit_us. And that's why I had proposed the tricky solution in
the first place. I thought we wanted to avoid instant boost only for
the first iteration, but after that we wanted to do it ASAP. Isn't it?

Now that you are using policy->min instead of policy->cur, we can
simplify the solution I proposed and always do 2 * iowait_boost before
getting current util/max in above if loop. i.e. we will start iowait
boost with min * 2 instead of min and that should be fine.

> + } else {
> + sg_cpu->iowait_boost >>= 1;
> + }
> }
>
> #ifdef CONFIG_NO_HZ_COMMON
> @@ -267,6 +281,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
> delta_ns = time - j_sg_cpu->last_update;
> if (delta_ns > TICK_NSEC) {
> j_sg_cpu->iowait_boost = 0;
> + j_sg_cpu->iowait_boost_pending = false;

Not required here as well.

> continue;
> }
> if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
> --
> 2.13.2.932.g7449e964c-goog

--
viresh

Joel Fernandes
Jul 17, 2017, 1:40:08 PM

Hi Viresh,
Hmm, I would rather clear this flag here and in the loop in
sugov_next_freq_shared since it keeps the flag in sync with what's
happening and is less confusing IMHO.

>
>> }
>> }
>>
>> static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
>> unsigned long *max)
>> {
>> - unsigned long boost_util = sg_cpu->iowait_boost;
>> - unsigned long boost_max = sg_cpu->iowait_boost_max;
>> + unsigned long boost_util, boost_max;
>>
>> - if (!boost_util)
>> + if (!sg_cpu->iowait_boost)
>> return;
>>
>> + boost_util = sg_cpu->iowait_boost;
>> + boost_max = sg_cpu->iowait_boost_max;
>> +
>
> The above changes are not required anymore (and were required only
> with my patch).

Yep, I'll drop it.

>> if (*util * boost_max < *max * boost_util) {
>> *util = boost_util;
>> *max = boost_max;
>> }
>> - sg_cpu->iowait_boost >>= 1;
>> +
>> + if (sg_cpu->iowait_boost_pending) {
>> + sg_cpu->iowait_boost_pending = false;
>> + sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1,
>> + sg_cpu->iowait_boost_max);
>
> Now this has a problem. We will also boost after waiting for
> rate_limit_us. And that's why I had proposed the tricky solution in

Not really unless rate_limit_us is < TICK_NSEC? Once TICK_NSEC
elapses, we would clear the boost in sugov_set_iowait_boost and in
sugov_next_freq_shared.

> the first place. I thought we wanted to avoid instant boost only for
> the first iteration, but after that we wanted to do it ASAP. Isn't it?
>
> Now that you are using policy->min instead of policy->cur, we can
> simplify the solution I proposed and always do 2 * iowait_boost before

No, doubling on the first boost was never discussed or intended in my
earlier patches. I thought even your patch never did that: you were
dividing by 2 and then scaling back up by 2 before consuming it, to
preserve the initial boost.

> getting current util/max in above if loop. i.e. we will start iowait
> boost with min * 2 instead of min and that should be fine.

Hmm, but why start from double of min? Why not just min? It doesn't
make any difference to the intended behavior itself and is also
consistent with my proposal in RFC v4. Also I feel what you're
suggesting is more spike prone; the idea was to start from the
minimum and double it as we go, not to double the min on the first go.
That was never intended.

Also I would rather keep the "set and use and set and use" pattern to
keep the logic less confusing and clean, IMO.
So we set the initial boost in sugov_set_iowait_boost, then in
sugov_iowait_boost we use it, and then set the boost for the next time
around at the end of sugov_iowait_boost (that is, we double it). Next
time, sugov_set_iowait_boost wouldn't touch the boost whether the iowait
flag is set or not, and we would continue into sugov_iowait_boost to
consume the boost. This would have a small delay in reducing the
boost, but that's OK since it's only one cycle of delay, and it keeps
the code clean. I assume the last part is not an issue considering
you're proposing double of the initial boost anyway ;-)

thanks,

-Joel

Viresh Kumar
Jul 18, 2017, 1:50:09 AM

On 17-07-17, 10:35, Joel Fernandes wrote:
> On Mon, Jul 17, 2017 at 1:04 AM, Viresh Kumar <viresh...@linaro.org> wrote:
> > On 16-07-17, 01:04, Joel Fernandes wrote:

> >> + if (sg_cpu->iowait_boost_pending) {
> >> + sg_cpu->iowait_boost_pending = false;
> >> + sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1,
> >> + sg_cpu->iowait_boost_max);
> >
> > Now this has a problem. We will also boost after waiting for

s/also/always/

> > rate_limit_us. And that's why I had proposed the tricky solution in
>
> Not really unless rate_limit_us is < TICK_NSEC? Once TICK_NSEC
> elapses, we would clear the boost in sugov_set_iowait_boost and in
> sugov_next_freq_shared.

You misread it and I know why it happened. And so I have sent a small
patch to make it a bit more readable.

rate_limit_us is associated with "last_freq_update_time", while
iowait-boost is associated with "last_update".

And last_update gets updated way too often.
Okay, let me try to explain the problem first and then you can propose
a solution if required.

Expected Behavior:

(Window refers to a time window of rate_limit_us here)

A. The first window where IOWAIT flag is set, we set boost to min-freq
and that shall be used for next freq update in
sugov_iowait_boost(). Any more calls to sugov_set_iowait_boost()
within this window shouldn't change the behavior.

B. If the next window also has IOWAIT flag set, then
sugov_iowait_boost() should use iowait*2 for freq update.

C. If a window doesn't have IOWAIT flag set, then sugov_iowait_boost()
should use iowait/2 in it.


Do they look fine to you?

Now coming to how the system will behave with your patch:

A. would be fine. We will follow things properly.

But B. and C. aren't true anymore.

This happens because after the first window we update iowait_boost
to 2*min unconditionally, and the next window will *always* use that,
even if the flag isn't set. And we may end up increasing the frequency
unnecessarily, i.e. the spike where this discussion started.

And so in my initial solution I reversed the order in
sugov_iowait_boost().

--
viresh

Juri Lelli
Jul 18, 2017, 10:10:09 AM

Hi,
Mmm, seems to make sense to me. :/

Would the following work (on top of Joel's v5)? The rationale being that
only in sugov_set_iowait_boost do we bump the boost up (if iowait_boost
was already set) or start from policy->min. In sugov_iowait_boost (the
consumer) we instead do the decay (if no boosting was pending).

---
kernel/sched/cpufreq_schedutil.c | 29 +++++++++++++++++++----------
1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 46b2479641cc..b270563c15a5 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -171,8 +171,14 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
{
if (flags & SCHED_CPUFREQ_IOWAIT) {
sg_cpu->iowait_boost_pending = true;
- sg_cpu->iowait_boost = max(sg_cpu->iowait_boost,
- sg_cpu->sg_policy->policy->min);
+ if (sg_cpu->iowait_boost) {
+ /* Bump up 2*current_boost until hitting max */
+ sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1,
+ sg_cpu->iowait_boost_max);
+ } else {
+ /* Start from policy->min */
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+ }
} else if (sg_cpu->iowait_boost) {
s64 delta_ns = time - sg_cpu->last_update;

@@ -192,6 +198,17 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
if (!sg_cpu->iowait_boost)
return;

+ if (sg_cpu->iowait_boost_pending) {
+ /*
+ * Record consumption of current boost value
+ * (set by sugov_set_iowait_boost).
+ */
+ sg_cpu->iowait_boost_pending = false;
+ } else {
+ /* Decay boost */
+ sg_cpu->iowait_boost >>= 1;
+ }
+
boost_util = sg_cpu->iowait_boost;
boost_max = sg_cpu->iowait_boost_max;

@@ -199,14 +216,6 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
*util = boost_util;
*max = boost_max;
}
-
- if (sg_cpu->iowait_boost_pending) {
- sg_cpu->iowait_boost_pending = false;
- sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1,
- sg_cpu->iowait_boost_max);
- } else {
- sg_cpu->iowait_boost >>= 1;
- }
}

#ifdef CONFIG_NO_HZ_COMMON
--
2.11.0

Joel Fernandes
Jul 19, 2017, 12:40:06 AM

Hi Viresh,

I appreciate the discussion.
Not really, to me B will still work because in the case the flag is
set, we are correctly double boosting in the next cycle.

Taking an example, with B = flag is set and D = flag is not set

F = Fmin (minimum)

iowait flag:       B     B     B     D     D     D
resulting boost:   F    2*F   4*F   4*F   2*F    F
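This sequence can be checked with a small simulation of the v5 semantics (consume the boost first, then scale it), assuming for illustration F = 800 and iowait_boost_max = 4*F (plain Python, not kernel code):

```python
def simulate(flags, fmin=800, fmax=3200):
    """flags: one bool per update window, True when IOWAIT is set.
    Returns the boost applied in each window under the v5 ordering:
    apply the current boost, then double/halve it for the next window."""
    boost, pending, applied = 0, False, []
    for iowait in flags:
        if iowait:                        # sugov_set_iowait_boost()
            pending = True
            boost = max(boost, fmin)
        if not boost:
            applied.append(0)
            continue
        applied.append(boost)             # boost consumed here
        if pending:                       # sugov_iowait_boost() tail
            pending = False
            boost = min(boost << 1, fmax)
        else:
            boost >>= 1
    return applied

print(simulate([True] * 3 + [False] * 3))
# [800, 1600, 3200, 3200, 1600, 800], i.e. F, 2*F, 4*F, 4*F, 2*F, F
```

The one-cycle delay in decaying (the second 4*F) is exactly the case C trade-off discussed below.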

What will not work is C, but as I mentioned in my last email, that
would cause us to delay the iowait boost halving by at most one cycle.
Is that really an issue considering we are starting from min rather
than max? Note that cases A. and B. are still working.

Considering the following cases:
(1) min freq is 800MHz, it takes up to 4 cycles to reach 4GHz where the
flag is set. At this point I think its likely we will run for many
more cycles which means keeping the boost active for 1 extra cycle
isn't that big a deal. Even if run for just 5 cycles with boost, that
means only the last cycle will suffer from C not decaying as soon as
possible. Comparing that to the case where in current code we
currently run at max from the first cycle, its not that bad.

(2) we have a transient type of thing, in this case we're probably not
reaching the full max immediately so even if we delay the decaying,
its still not as bad as what it is currently.

I think, considering that the code is way cleaner than any other
approach, it's a small price to pay. Also keeping in mind that this
patch is still an improvement over the current spike: even though, as
you said, it's still a bit spiky, it's still better, right?

Functionally the code is working and I think is also clean, but if you
feel that its still confusing, then I'm open to rewriting it.

>
> And so in my initial solution I reversed the order in
> sugov_iowait_boost().

Yes, but to fix A. you had to divide by 2 in sugov_set_iowait_boost,
and then multiply by 2 later in sugov_iowait_boost to keep the first
boost at min. That IMO was confusing so my modified patch did it
differently.

thanks,

-Joel

Viresh Kumar
Jul 19, 2017, 1:40:05 AM

On 18-07-17, 15:02, Juri Lelli wrote:
> Mmm, seems to make sense to me. :/
>
> Would the following work (on top of Joel's v5)? Rationale being that
> only in sugov_set_iowait_boost we might bump freq up (if no iowait_boost
> was set) or start from policy->min. In sugov_iowait_boost (consumer)
> instead we do the decay (if no boosting was pending).
>
> ---
> kernel/sched/cpufreq_schedutil.c | 29 +++++++++++++++++++----------
> 1 file changed, 19 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> index 46b2479641cc..b270563c15a5 100644
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -171,8 +171,14 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
> {
> if (flags & SCHED_CPUFREQ_IOWAIT) {
> sg_cpu->iowait_boost_pending = true;
> - sg_cpu->iowait_boost = max(sg_cpu->iowait_boost,
> - sg_cpu->sg_policy->policy->min);
> + if (sg_cpu->iowait_boost) {
> + /* Bump up 2*current_boost until hitting max */
> + sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1,
> + sg_cpu->iowait_boost_max);

And we are back to where we started :)

This wouldn't work because sugov_set_iowait_boost() gets called a lot,
maybe 10 times within a rate_limit_us period.

The rule of thumb is: never double/halve the boost from
sugov_set_iowait_boost() :)

--
viresh

Joel Fernandes
Jul 19, 2017, 2:20:08 AM

Ok, so tomorrow I'll post something slightly different. In
sugov_iowait_boost, if the flag is set and the current iowait_boost < min,
then set it to min, otherwise double it. If the flag is not set, halve
it. And both of these would be done *before* consuming the boost (as in
Viresh's last patch). I think that's reasonable and handles all cases.
Hopefully that will put it to rest; let me know of any objections.
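That plan (adjust the boost before consuming it) could look roughly like this, with fmin/fmax standing in for policy->min and iowait_boost_max (an illustrative sketch, not the actual patch):

```python
def next_boost(boost, iowait, fmin, fmax):
    """Adjust the boost *before* it is consumed: bump to fmin on the
    first IOWAIT event, double on repeats, halve when the flag is absent."""
    if iowait:
        return fmin if boost < fmin else min(boost * 2, fmax)
    return boost >> 1

print(next_boost(0, True, 800, 3200))      # 800  (first event: start at min)
print(next_boost(800, True, 800, 3200))    # 1600 (repeat: double)
print(next_boost(1600, False, 800, 3200))  # 800  (no flag: halve)
```

The `boost < fmin` test also covers the case where decay has left the boost below min, so the next ramp restarts cleanly from min.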

thanks,

-Joel

Viresh Kumar
Jul 19, 2017, 2:20:08 AM

On 18-07-17, 21:39, Joel Fernandes wrote:
> Not really, to me B will still work because in the case the flag is
> set, we are correctly double boosting in the next cycle.
>
> Taking an example, with B = flag is set and D = flag is not set
>
> F = Fmin (minimum)
>
> iowait flag B B B D D D
> resulting boost F 2*F 4*F 4*F 2*F F

What about this ?

iowait flag:       B     D     B     D     B     D
resulting boost:   F    2*F    F    2*F    F    2*F

Isn't this the worst behavior we could end up with?
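This oscillation falls out of the v5 ordering (consume first, then scale) when the flag alternates; a quick simulation with hypothetical numbers (F = 800, max = 4*F) shows it:

```python
def simulate(flags, fmin=800, fmax=3200):
    """v5 ordering: apply the current boost, then double (flag seen)
    or halve (flag absent) it for the next window."""
    boost, pending, applied = 0, False, []
    for iowait in flags:
        if iowait:
            pending = True
            boost = max(boost, fmin)
        if not boost:
            applied.append(0)
            continue
        applied.append(boost)
        if pending:
            pending = False
            boost = min(boost << 1, fmax)
        else:
            boost >>= 1
    return applied

print(simulate([True, False] * 3))
# [800, 1600, 800, 1600, 800, 1600], i.e. F, 2*F, F, 2*F, F, 2*F
```

Every no-flag window consumes the doubled boost left behind by the previous flag window, which is the spike being objected to.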

> What will not work is C but as I mentioned in my last email, that
> would cause us to delay the iowait boost halving by at most 1 cycle,
> is that really an issue considering we are starting from min compared
> to max? Note that cases A. and B. are still working.
>
> Considering the following cases:
> (1) min freq is 800MHz, it takes upto 4 cycles to reach 4GHz where the
> flag is set. At this point I think its likely we will run for many
> more cycles which means keeping the boost active for 1 extra cycle
> isn't that big a deal. Even if run for just 5 cycles with boost, that
> means only the last cycle will suffer from C not decaying as soon as
> possible. Comparing that to the case where in current code we
> currently run at max from the first cycle, its not that bad.
>
> (2) we have a transient type of thing, in this case we're probably not
> reaching the full max immediately so even if we delay the decaying,
> its still not as bad as what it is currently.
>
> I think considering that the code is way cleaner than any other
> approach - its a small price to pay. Also keeping in mind that this
> patch is still an improvement over the current spike, even though as
> you said its still a bit spikey, but its still better right?
>
> Functionally the code is working and I think is also clean, but if you
> feel that its still confusing, then I'm open to rewriting it.

I am not worried about being boosted for a bit more time, but about the
spikes even when we do not want a freq change.

> > And so in my initial solution I reversed the order in
> > sugov_iowait_boost().
>
> Yes, but to fix A. you had to divide by 2 in sugov_set_iowait_boost,
> and then multiply by 2 later in sugov_iowait_boost to keep the first
> boost at min. That IMO was confusing so my modified patch did it
> differently.

Yeah, it wasn't great for sure.

I hope the following one will work for everyone ?

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 45fcf21ad685..ceac5f72d8da 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,6 +53,7 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;

+ bool iowait_boost_pending;
unsigned long iowait_boost;
unsigned long iowait_boost_max;
u64 last_update;
@@ -169,7 +170,17 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
if (flags & SCHED_CPUFREQ_IOWAIT) {
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ if (sg_cpu->iowait_boost_pending)
+ return;
+
+ sg_cpu->iowait_boost_pending = true;
+
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1,
+ sg_cpu->iowait_boost_max);
+ } else {
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+ }
} else if (sg_cpu->iowait_boost) {
s64 delta_ns = time - sg_cpu->last_update;

@@ -182,17 +193,23 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
unsigned long *max)
{
- unsigned long boost_util = sg_cpu->iowait_boost;
- unsigned long boost_max = sg_cpu->iowait_boost_max;
+ unsigned long boost_util, boost_max;

- if (!boost_util)
+ if (!sg_cpu->iowait_boost)
return;

+ if (sg_cpu->iowait_boost_pending)
+ sg_cpu->iowait_boost_pending = false;
+ else
+ sg_cpu->iowait_boost >>= 1;
+
+ boost_util = sg_cpu->iowait_boost;
+ boost_max = sg_cpu->iowait_boost_max;
+
if (*util * boost_max < *max * boost_util) {
*util = boost_util;
*max = boost_max;
}
- sg_cpu->iowait_boost >>= 1;
}

#ifdef CONFIG_NO_HZ_COMMON


--
viresh

Joel Fernandes
Jul 19, 2017, 10:40:06 PM

Hi Viresh,

On Tue, Jul 18, 2017 at 11:19 PM, Viresh Kumar <viresh...@linaro.org> wrote:
> On 18-07-17, 21:39, Joel Fernandes wrote:
>> Not really, to me B will still work because in the case the flag is
>> set, we are correctly double boosting in the next cycle.
>>
>> Taking an example, with B = flag is set and D = flag is not set
>>
>> F = Fmin (minimum)
>>
>> iowait flag B B B D D D
>> resulting boost F 2*F 4*F 4*F 2*F F
>
> What about this ?
>
> iowait flag B D B D B D
> resulting boost F 2*F F 2*F F 2*F

Yes I guess so but this oscillation can still happen in current code I think.
I would prefer this to be:

if (sg_cpu->iowait_boost >= policy->min) {
// double it
} else {
// set it to min
}

This is for the case where the boost ramps all the way up and gets capped
at max; when it decays back down, it doesn't land exactly on Fmin but
below it, so in that case the next time we boost we start from Fmin.

> } else if (sg_cpu->iowait_boost) {
> s64 delta_ns = time - sg_cpu->last_update;
>
> @@ -182,17 +193,23 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
> static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
> unsigned long *max)
> {
> - unsigned long boost_util = sg_cpu->iowait_boost;
> - unsigned long boost_max = sg_cpu->iowait_boost_max;
> + unsigned long boost_util, boost_max;
>
> - if (!boost_util)
> + if (!sg_cpu->iowait_boost)
> return;
>
> + if (sg_cpu->iowait_boost_pending)
> + sg_cpu->iowait_boost_pending = false;
> + else
> + sg_cpu->iowait_boost >>= 1;
> +
> + boost_util = sg_cpu->iowait_boost;
> + boost_max = sg_cpu->iowait_boost_max;
> +
> if (*util * boost_max < *max * boost_util) {
> *util = boost_util;
> *max = boost_max;

This looks good to me and is kind of what I had in mind. I can spend
some time testing it soon. Just to be clear, if I were to repost this
patch after testing, should it have your authorship and my Tested-by,
or do you prefer something else?

thanks,

-Joel

Viresh Kumar
Jul 19, 2017, 11:50:08 PM

On 19-07-17, 19:38, Joel Fernandes wrote:
> On Tue, Jul 18, 2017 at 11:19 PM, Viresh Kumar <viresh...@linaro.org> wrote:
> > On 18-07-17, 21:39, Joel Fernandes wrote:
> >> Not really, to me B will still work because in the case the flag is
> >> set, we are correctly double boosting in the next cycle.
> >>
> >> Taking an example, with B = flag is set and D = flag is not set
> >>
> >> F = Fmin (minimum)
> >>
> >> iowait flag B B B D D D
> >> resulting boost F 2*F 4*F 4*F 2*F F
> >
> > What about this ?
> >
> > iowait flag B D B D B D
> > resulting boost F 2*F F 2*F F 2*F
>
> Yes I guess so but this oscillation can still happen in current code I think.

How?
Actually, you can add another patch first which sets iowait_boost to 0
when it goes below min, as that problem exists today as well.

And this patch would be fine then as-is?
You can keep your authorship, I wouldn't mind. Maybe a Suggested-by at
most would be fine.

--
viresh

Joel Fernandes
Jul 20, 2017, 3:50:06 PM
Hi Viresh,

On Wed, Jul 19, 2017 at 8:41 PM, Viresh Kumar <viresh...@linaro.org> wrote:
> On 19-07-17, 19:38, Joel Fernandes wrote:
>> On Tue, Jul 18, 2017 at 11:19 PM, Viresh Kumar <viresh...@linaro.org> wrote:
>> > On 18-07-17, 21:39, Joel Fernandes wrote:
>> >> Not really, to me B will still work because in the case the flag is
>> >> set, we are correctly double boosting in the next cycle.
>> >>
>> >> Taking an example, with B = flag is set and D = flag is not set
>> >>
>> >> F = Fmin (minimum)
>> >>
>> >> iowait flag B B B D D D
>> >> resulting boost F 2*F 4*F 4*F 2*F F
>> >
>> > What about this ?
>> >
>> > iowait flag B D B D B D
>> > resulting boost F 2*F F 2*F F 2*F
>>
>> Yes I guess so but this oscillation can still happen in current code I think.
>
> How ?

Yes, you're right, it's not an issue with the current code.
Yes, I think that's fine. I thought about it some more, and I think this
can be an issue in a scenario where

iowait_boost_max < policy->min but:

(iowait_boost / iowait_boost_max) > (rq->cfs.avg.util_avg /
arch_scale_cpu_capacity)

This is probably not common in current real-world cases, but if
iowait_boost_max were, say, much less than arch_scale_cpu_capacity for
some reason in the future, then it could be an issue, I think. I'll post a
patch for it.
Cool, will do. Thanks a lot Viresh.


thanks,

-Joel

Viresh Kumar
Jul 21, 2017, 12:20:07 AM
On 20-07-17, 12:49, Joel Fernandes wrote:
> Yes I think that's fine, I thought about it some more and I think this
> can be an issue in a scenario where
>
> iowait_boost_max < policy->min but:

We will never have this case as boost-max is set to cpuinfo.max_freq.

--
viresh

Joel Fernandes
Jul 21, 2017, 2:10:08 AM
On Thu, Jul 20, 2017 at 9:09 PM, Viresh Kumar <viresh...@linaro.org> wrote:
> On 20-07-17, 12:49, Joel Fernandes wrote:
>> Yes I think that's fine, I thought about it some more and I think this
>> can be an issue in a scenario where
>>
>> iowait_boost_max < policy->min but:

Uhh I meant to say here iowait_boost < policy->min. Sorry.

> We will never have this case as boost-max is set to cpuinfo.max_freq.

But you're right, it can't be an issue in the current code. I was just
thinking of future-proofing it in case someone decides to lower
boost-max for whatever reason and forgets to handle this.

thanks,

-Joel

Joel Fernandes
Jul 22, 2017, 3:50:08 AM
Currently the iowait_boost feature in schedutil makes the frequency go to max
on iowait wakeups. This feature was added to handle a case that Peter
described where the throughput of operations involving continuous I/O requests
[1] is reduced due to running at a lower frequency; however, the lower
throughput itself keeps utilization low, which in turn keeps the frequency
low, so it is "stuck".

Instead of going to max, it's also possible to achieve the same effect by
ramping up to max if there are repeated in_iowait wakeups happening. This patch
is an attempt to do that. We start from a lower frequency (policy->min)
and double the boost for every consecutive iowait update until we reach the
maximum iowait boost frequency (iowait_boost_max).

I ran a synthetic test (continuous O_DIRECT writes in a loop) on an x86 machine
with intel_pstate in passive mode using schedutil. In this test the iowait_boost
value ramped from 800MHz to 4GHz in 60ms. The patch achieves the same improved
throughput as the existing behavior.

Also, while at it, make iowait_boost and iowait_boost_max unsigned int, since
their unit is kHz; this is consistent with struct cpufreq_policy.

[1] https://patchwork.kernel.org/patch/9735885/

Cc: Srinivas Pandruvada <srinivas....@linux.intel.com>
Cc: Len Brown <le...@kernel.org>
Cc: Rafael J. Wysocki <r...@rjwysocki.net>
Cc: Viresh Kumar <viresh...@linaro.org>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Suggested-by: Peter Zijlstra <pet...@infradead.org>
Suggested-by: Viresh Kumar <viresh...@linaro.org>
Signed-off-by: Joel Fernandes <joe...@google.com>
---
Viresh, I made slight modifications to the approach we last agreed on, but
nothing we didn't already discuss. I also dropped the RFC tag since I think
this is now close to final (or final, if no one else objects).

kernel/sched/cpufreq_schedutil.c | 37 +++++++++++++++++++++++++++++++------
1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 622eed1b7658..0c0b6c8c15fc 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,6 +53,7 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;

+ bool iowait_boost_pending;
unsigned long iowait_boost;
unsigned long iowait_boost_max;
u64 last_update;
@@ -172,30 +173,53 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
if (flags & SCHED_CPUFREQ_IOWAIT) {
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ if (sg_cpu->iowait_boost_pending)
+ return;
+
+ sg_cpu->iowait_boost_pending = true;
+
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1,
+ sg_cpu->iowait_boost_max);
+ } else {
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+ }
} else if (sg_cpu->iowait_boost) {
s64 delta_ns = time - sg_cpu->last_update;

/* Clear iowait_boost if the CPU apprears to have been idle. */
- if (delta_ns > TICK_NSEC)
+ if (delta_ns > TICK_NSEC) {
sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_pending = false;
+ }
}
}

static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
unsigned long *max)
{
- unsigned long boost_util = sg_cpu->iowait_boost;
- unsigned long boost_max = sg_cpu->iowait_boost_max;
+ unsigned long boost_util, boost_max;

- if (!boost_util)
+ if (!sg_cpu->iowait_boost)
return;

+ if (sg_cpu->iowait_boost_pending) {
+ sg_cpu->iowait_boost_pending = false;
+ } else {
+ sg_cpu->iowait_boost >>= 1;
+ if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+ sg_cpu->iowait_boost = 0;
+ return;
+ }
+ }
+
+ boost_util = sg_cpu->iowait_boost;
+ boost_max = sg_cpu->iowait_boost_max;
+
if (*util * boost_max < *max * boost_util) {
*util = boost_util;
*max = boost_max;
}
- sg_cpu->iowait_boost >>= 1;
}

#ifdef CONFIG_NO_HZ_COMMON
@@ -267,6 +291,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
+ j_sg_cpu->iowait_boost_pending = false;
continue;
}
if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
--
2.14.0.rc0.284.gd933b75aa4-goog

Rafael J. Wysocki
Jul 22, 2017, 6:00:09 PM
I would do

sg_cpu->iowait_boost <<= 1;
if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;

as that's easier to read.

The rest of the patch is fine by me.

Thanks,
Rafael

Joel Fernandes
Jul 23, 2017, 12:10:09 AM
Currently the iowait_boost feature in schedutil makes the frequency go to max
on iowait wakeups. This feature was added to handle a case that Peter
described where the throughput of operations involving continuous I/O requests
[1] is reduced due to running at a lower frequency; however, the lower
throughput itself keeps utilization low, which in turn keeps the frequency
low, so it is "stuck".

Instead of going to max, it's also possible to achieve the same effect by
ramping up to max if there are repeated in_iowait wakeups happening. This patch
is an attempt to do that. We start from a lower frequency (policy->min)
and double the boost for every consecutive iowait update until we reach the
maximum iowait boost frequency (iowait_boost_max).

I ran a synthetic test (continuous O_DIRECT writes in a loop) on an x86 machine
with intel_pstate in passive mode using schedutil. In this test the iowait_boost
value ramped from 800MHz to 4GHz in 60ms. The patch achieves the same improved
throughput as the existing behavior.

[1] https://patchwork.kernel.org/patch/9735885/

Cc: Srinivas Pandruvada <srinivas....@linux.intel.com>
Cc: Len Brown <le...@kernel.org>
Cc: Rafael J. Wysocki <r...@rjwysocki.net>
Cc: Viresh Kumar <viresh...@linaro.org>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Suggested-by: Peter Zijlstra <pet...@infradead.org>
Suggested-by: Viresh Kumar <viresh...@linaro.org>
Signed-off-by: Joel Fernandes <joe...@google.com>
---
kernel/sched/cpufreq_schedutil.c | 38 ++++++++++++++++++++++++++++++++------
1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 622eed1b7658..570ab6e779e6 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,6 +53,7 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;

+ bool iowait_boost_pending;
unsigned long iowait_boost;
unsigned long iowait_boost_max;
u64 last_update;
@@ -172,30 +173,54 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
if (flags & SCHED_CPUFREQ_IOWAIT) {
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ if (sg_cpu->iowait_boost_pending)
+ return;
+
+ sg_cpu->iowait_boost_pending = true;
+
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost <<= 1;
+ if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ } else {
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+ }
} else if (sg_cpu->iowait_boost) {
s64 delta_ns = time - sg_cpu->last_update;

/* Clear iowait_boost if the CPU apprears to have been idle. */
- if (delta_ns > TICK_NSEC)
+ if (delta_ns > TICK_NSEC) {
sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_pending = false;
+ }
}
}

static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
unsigned long *max)
{
- unsigned long boost_util = sg_cpu->iowait_boost;
- unsigned long boost_max = sg_cpu->iowait_boost_max;
+ unsigned long boost_util, boost_max;

- if (!boost_util)
+ if (!sg_cpu->iowait_boost)
return;

+ if (sg_cpu->iowait_boost_pending) {
+ sg_cpu->iowait_boost_pending = false;
+ } else {
+ sg_cpu->iowait_boost >>= 1;
+ if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+ sg_cpu->iowait_boost = 0;
+ return;
+ }
+ }
+
+ boost_util = sg_cpu->iowait_boost;
+ boost_max = sg_cpu->iowait_boost_max;
+
if (*util * boost_max < *max * boost_util) {
*util = boost_util;
*max = boost_max;
}
- sg_cpu->iowait_boost >>= 1;
}

#ifdef CONFIG_NO_HZ_COMMON
@@ -267,6 +292,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)

Joel Fernandes
Jul 23, 2017, 12:10:09 AM
Done, and resent the patches. Also added one more patch to change
iowait_boost and iowait_boost_max to unsigned int.

thanks,

-Joel

Joel Fernandes
Jul 23, 2017, 12:10:09 AM
Make iowait_boost and iowait_boost_max unsigned int, since their unit is kHz
and this is consistent with struct cpufreq_policy. Also change the local
variables in sugov_iowait_boost to match.

Cc: Srinivas Pandruvada <srinivas....@linux.intel.com>
Cc: Len Brown <le...@kernel.org>
Cc: Rafael J. Wysocki <r...@rjwysocki.net>
Cc: Viresh Kumar <viresh...@linaro.org>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Signed-off-by: Joel Fernandes <joe...@google.com>
---
kernel/sched/cpufreq_schedutil.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 570ab6e779e6..7650784eb857 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -54,8 +54,8 @@ struct sugov_cpu {
struct sugov_policy *sg_policy;

bool iowait_boost_pending;
- unsigned long iowait_boost;
- unsigned long iowait_boost_max;
+ unsigned int iowait_boost;
+ unsigned int iowait_boost_max;
u64 last_update;

/* The fields below are only needed when sharing a policy. */
@@ -199,7 +199,7 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
unsigned long *max)
{
- unsigned long boost_util, boost_max;
+ unsigned int boost_util, boost_max;

if (!sg_cpu->iowait_boost)
return;
--
2.14.0.rc0.284.gd933b75aa4-goog

Joel Fernandes
Jul 23, 2017, 12:00:09 PM
Currently the iowait_boost feature in schedutil makes the frequency go to max
on iowait wakeups. This feature was added to handle a case that Peter
described where the throughput of operations involving continuous I/O requests
[1] is reduced due to running at a lower frequency; however, the lower
throughput itself keeps utilization low, which in turn keeps the frequency
low, so it is "stuck".

Instead of going to max, it's also possible to achieve the same effect by
ramping up to max if there are repeated in_iowait wakeups happening. This patch
is an attempt to do that. We start from a lower frequency (policy->min)
and double the boost for every consecutive iowait update until we reach the
maximum iowait boost frequency (iowait_boost_max).

I ran a synthetic test (continuous O_DIRECT writes in a loop) on an x86 machine
with intel_pstate in passive mode using schedutil. In this test the iowait_boost
value ramped from 800MHz to 4GHz in 60ms. The patch achieves the same improved
throughput as the existing behavior.

[1] https://patchwork.kernel.org/patch/9735885/

Cc: Srinivas Pandruvada <srinivas....@linux.intel.com>
Cc: Len Brown <le...@kernel.org>
Cc: Rafael J. Wysocki <r...@rjwysocki.net>
Cc: Viresh Kumar <viresh...@linaro.org>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Suggested-by: Peter Zijlstra <pet...@infradead.org>
Suggested-by: Viresh Kumar <viresh...@linaro.org>
Signed-off-by: Joel Fernandes <joe...@google.com>
---
kernel/sched/cpufreq_schedutil.c | 38 ++++++++++++++++++++++++++++++++------
1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 622eed1b7658..570ab6e779e6 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,6 +53,7 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;

+ bool iowait_boost_pending;
unsigned long iowait_boost;
unsigned long iowait_boost_max;
u64 last_update;
@@ -172,30 +173,54 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
if (flags & SCHED_CPUFREQ_IOWAIT) {
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ if (sg_cpu->iowait_boost_pending)
+ return;
+
+ sg_cpu->iowait_boost_pending = true;
+
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost <<= 1;
+ if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ } else {
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+ }
} else if (sg_cpu->iowait_boost) {
s64 delta_ns = time - sg_cpu->last_update;

/* Clear iowait_boost if the CPU apprears to have been idle. */
- if (delta_ns > TICK_NSEC)
+ if (delta_ns > TICK_NSEC) {
sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_pending = false;
+ }
}
}

static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
unsigned long *max)
{
- unsigned long boost_util = sg_cpu->iowait_boost;
- unsigned long boost_max = sg_cpu->iowait_boost_max;
+ unsigned long boost_util, boost_max;

- if (!boost_util)
+ if (!sg_cpu->iowait_boost)
return;

+ if (sg_cpu->iowait_boost_pending) {
+ sg_cpu->iowait_boost_pending = false;
+ } else {
+ sg_cpu->iowait_boost >>= 1;
+ if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+ sg_cpu->iowait_boost = 0;
+ return;
+ }
+ }
+
+ boost_util = sg_cpu->iowait_boost;
+ boost_max = sg_cpu->iowait_boost_max;
+

Joel Fernandes
Jul 23, 2017, 12:00:09 PM
Make iowait_boost and iowait_boost_max unsigned int, since their unit is kHz
and this is consistent with struct cpufreq_policy. Also change the local
variables in sugov_iowait_boost to match.

Cc: Srinivas Pandruvada <srinivas....@linux.intel.com>
Cc: Len Brown <le...@kernel.org>
Cc: Rafael J. Wysocki <r...@rjwysocki.net>
Cc: Viresh Kumar <viresh...@linaro.org>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Signed-off-by: Joel Fernandes <joe...@google.com>
---
kernel/sched/cpufreq_schedutil.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 570ab6e779e6..7650784eb857 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -54,8 +54,8 @@ struct sugov_cpu {
struct sugov_policy *sg_policy;

bool iowait_boost_pending;
- unsigned long iowait_boost;
- unsigned long iowait_boost_max;
+ unsigned int iowait_boost;
+ unsigned int iowait_boost_max;
u64 last_update;

/* The fields below are only needed when sharing a policy. */
@@ -199,7 +199,7 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
unsigned long *max)
{

Viresh Kumar
Jul 24, 2017, 5:00:09 AM
You sent v7 [1-2]/2 twice. Are they different?

For both the patches:

Acked-by: Viresh Kumar <viresh...@linaro.org>

--
viresh

Joel Fernandes
Jul 24, 2017, 2:40:07 PM
No, they are the same. Rafael suggested reposting with linux-pm in CC,
so I just resent it.

>
> For both the patches:
>
> Acked-by: Viresh Kumar <viresh...@linaro.org>

Thanks!

-Joel

Rafael J. Wysocki
Jul 27, 2017, 8:10:06 PM
Applied, thanks!