The fundamental issue is that ondemand will go to a (too) low CPU
frequency for workloads that are alternately disk- and CPU-bound...
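To make that concrete: the fix needs a per-CPU iowait number so that a
governor can (optionally) treat iowait as busy time when computing load.
A minimal sketch of such a load calculation, with hypothetical names
(estimate_load, io_is_busy) that are not part of this series:

/*
 * Sketch only: estimate load over a sampling window, optionally
 * counting iowait as busy so that a workload alternating between
 * disk and CPU does not look idle to the governor.
 * div64_u64() is from <linux/math64.h>.
 */
static unsigned int estimate_load(u64 wall_delta_us, u64 idle_delta_us,
				  u64 iowait_delta_us, int io_is_busy)
{
	/* Treat time spent waiting on IO as busy time. */
	if (io_is_busy && idle_delta_us >= iowait_delta_us)
		idle_delta_us -= iowait_delta_us;

	if (!wall_delta_us || wall_delta_us < idle_delta_us)
		return 0;

	return (unsigned int)div64_u64(100 * (wall_delta_us - idle_delta_us),
				       wall_delta_us);
}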
--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
For the ondemand cpufreq governor, it is desirable that iowait
time be micro-accounted in the same way that idle time is.
This patch introduces the infrastructure to account and expose
this information via the get_cpu_iowait_time_us() function.
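For reference, a governor-side caller could sample the counter like
this (a sketch; the helper name and the caller-maintained bookkeeping
are assumptions, not part of this patch):

/* Sketch: iowait time spent on @cpu since the previous sample.
 * Returns 0 when NOHZ is disabled (the counter reads as -1). */
static u64 sample_iowait_delta_us(int cpu, u64 *prev_iowait_us)
{
	u64 wall_us, iowait_us, delta;

	iowait_us = get_cpu_iowait_time_us(cpu, &wall_us);
	if (iowait_us == (u64)-1)
		return 0;

	delta = iowait_us - *prev_iowait_us;
	*prev_iowait_us = iowait_us;
	return delta;
}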
Signed-off-by: Arjan van de Ven <ar...@linux.intel.com>
---
include/linux/tick.h | 4 ++++
kernel/time/tick-sched.c | 28 ++++++++++++++++++++++++++++
kernel/time/timer_list.c | 1 +
3 files changed, 33 insertions(+), 0 deletions(-)
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 0343eed..4aa3703 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -42,6 +42,7 @@ enum tick_nohz_mode {
* @idle_waketime: Time when the idle was interrupted
* @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
+ * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
* @sleep_length: Duration of the current idle sleep
* @do_timer_lst: CPU was the last one doing do_timer before going idle
*/
@@ -60,6 +61,7 @@ struct tick_sched {
ktime_t idle_waketime;
ktime_t idle_exittime;
ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
ktime_t sleep_length;
unsigned long last_jiffies;
unsigned long next_jiffies;
@@ -123,6 +125,7 @@ extern void tick_nohz_stop_sched_tick(int inidle);
extern void tick_nohz_restart_sched_tick(void);
extern ktime_t tick_nohz_get_sleep_length(void);
extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
+extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
# else
static inline void tick_nohz_stop_sched_tick(int inidle) { }
static inline void tick_nohz_restart_sched_tick(void) { }
@@ -133,6 +136,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void)
return len;
}
static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
+static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
# endif /* !NO_HZ */
#endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 326f5f8..a6104a8 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -161,6 +161,8 @@ update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
if (ts->idle_active) {
delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ if (nr_iowait_cpu() > 0)
+ ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
ts->idle_entrytime = now;
}
@@ -220,6 +222,32 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
+/**
+ * get_cpu_iowait_time_us - get the total iowait time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in
+ *
+ * Return the cumulative iowait time (since boot) for a given
+ * CPU, in microseconds.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
+u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+ if (!tick_nohz_enabled)
+ return -1;
+
+ update_ts_time_stats(ts, ktime_get(), last_update_time);
+
+ return ktime_to_us(ts->iowait_sleeptime);
+}
+EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+
/**
* tick_nohz_stop_sched_tick - stop the idle tick from the idle task
*
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1a4a7dd..ab8f5e3 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P_ns(idle_waketime);
P_ns(idle_exittime);
P_ns(idle_sleeptime);
+ P_ns(iowait_sleeptime);
P(last_jiffies);
P(next_jiffies);
P_ns(idle_expires);
--
1.6.2.5
The exported function get_cpu_idle_time_us() has no comment
describing it; add a kerneldoc comment
Signed-off-by: Arjan van de Ven <ar...@linux.intel.com>
---
kernel/time/tick-sched.c | 14 ++++++++++++++
1 files changed, 14 insertions(+), 0 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762..54dc155 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -179,6 +179,20 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
return now;
}
+/**
+ * get_cpu_idle_time_us - get the total idle time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in
+ *
+ * Return the cumulative idle time (since boot) for a given
+ * CPU, in microseconds. The idle time returned includes
+ * the iowait time (unlike what "top" and co report).
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
--
1.6.2.5
Currently, two places update the idle statistics (and more will be
added later in this series).
This patch creates a helper function for updating these statistics.
Signed-off-by: Arjan van de Ven <ar...@linux.intel.com>
---
kernel/time/tick-sched.c | 29 +++++++++++++++++++----------
1 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 54dc155..ca2211d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,14 +150,25 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog();
}
-static void tick_nohz_stop_idle(int cpu, ktime_t now)
+/*
+ * Updates the per-CPU idle time statistics counters
+ */
+static void update_ts_time_stats(struct tick_sched *ts, ktime_t now)
{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t delta;
- delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_lastupdate = now;
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ if (ts->idle_active) {
+ delta = ktime_sub(now, ts->idle_entrytime);
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ }
+}
+
+static void tick_nohz_stop_idle(int cpu, ktime_t now)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+ update_ts_time_stats(ts, now);
ts->idle_active = 0;
sched_clock_idle_wakeup_event(0);
@@ -165,14 +176,12 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
{
- ktime_t now, delta;
+ ktime_t now;
now = ktime_get();
- if (ts->idle_active) {
- delta = ktime_sub(now, ts->idle_entrytime);
- ts->idle_lastupdate = now;
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
- }
+
+ update_ts_time_stats(ts, now);
+
ts->idle_entrytime = now;
ts->idle_active = 1;
sched_clock_idle_sleep_event();
--
1.6.2.5
This patch folds the updating of the last_update_time into the
update_ts_time_stats() function, and updates the callers.
This allows for further cleanups that are done in the next patch.
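After this patch the two kinds of callers, as seen in the diff below,
look like this: internal bookkeeping passes NULL, while the exported
getter asks for the timestamp back:

	/* stop/start paths: no timestamp wanted */
	update_ts_time_stats(ts, now, NULL);

	/* exported getter: report the update time to the caller */
	update_ts_time_stats(ts, ktime_get(), last_update_time);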
Signed-off-by: Arjan van de Ven <ar...@linux.intel.com>
---
kernel/time/tick-sched.c | 24 +++++++++++++-----------
1 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7dbad2f..ac54543 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -153,7 +153,8 @@ static void tick_nohz_update_jiffies(ktime_t now)
/*
* Updates the per-CPU idle time statistics counters
*/
-static void update_ts_time_stats(struct tick_sched *ts, ktime_t now)
+static void
+update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
{
ktime_t delta;
@@ -163,13 +164,21 @@ static void update_ts_time_stats(struct tick_sched *ts, ktime_t now)
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
ts->idle_entrytime = now;
}
+
+	if (last_update_time) {
+		if (ts->idle_active)
+			*last_update_time = ktime_to_us(ts->idle_lastupdate);
+		else
+			*last_update_time = ktime_to_us(now);
+	}
+
}
static void tick_nohz_stop_idle(int cpu, ktime_t now)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- update_ts_time_stats(ts, now);
+ update_ts_time_stats(ts, now, NULL);
ts->idle_active = 0;
sched_clock_idle_wakeup_event(0);
@@ -181,7 +188,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
now = ktime_get();
- update_ts_time_stats(ts, now);
+ update_ts_time_stats(ts, now, NULL);
ts->idle_entrytime = now;
ts->idle_active = 1;
@@ -206,18 +213,11 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now;
if (!tick_nohz_enabled)
return -1;
- now = ktime_get();
- update_ts_time_stats(ts, now);
-
- if (ts->idle_active)
- *last_update_time = ktime_to_us(ts->idle_lastupdate);
- else
- *last_update_time = ktime_to_us(now);
+ update_ts_time_stats(ts, ktime_get(), last_update_time);
return ktime_to_us(ts->idle_sleeptime);
}
--
1.6.2.5
Right now, get_cpu_idle_time_us() only reports the idle statistics
up to the point the CPU last entered idle, not what is valid right now.
This patch adds an update of the idle statistics to get_cpu_idle_time_us(),
so that calling this function always returns statistics that are accurate
at the point of the call.
This includes resetting the start-of-idle timestamp for accounting
purposes, to avoid double accounting.
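A worked timeline showing why the reset is needed (illustrative
numbers only):

/*
 *   t=100  CPU enters idle            idle_entrytime = 100
 *   t=150  get_cpu_idle_time_us()     idle_sleeptime += 50,
 *                                     idle_entrytime reset to 150
 *   t=180  CPU leaves idle            idle_sleeptime += 30
 *
 * Without the reset at t=150, the wakeup at t=180 would add 80
 * instead of 30, counting the 100..150 interval twice.
 */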
Signed-off-by: Arjan van de Ven <ar...@linux.intel.com>
---
kernel/time/tick-sched.c | 7 ++++++-
1 files changed, 6 insertions(+), 1 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ca2211d..7dbad2f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -161,6 +161,7 @@ static void update_ts_time_stats(struct tick_sched *ts, ktime_t now)
if (ts->idle_active) {
delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ ts->idle_entrytime = now;
}
}
@@ -205,14 +206,18 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t now;
if (!tick_nohz_enabled)
return -1;
+ now = ktime_get();
+ update_ts_time_stats(ts, now);
+
if (ts->idle_active)
*last_update_time = ktime_to_us(ts->idle_lastupdate);
else
- *last_update_time = ktime_to_us(ktime_get());
+ *last_update_time = ktime_to_us(now);
return ktime_to_us(ts->idle_sleeptime);
}
--
1.6.2.5
Now that the only user of ts->idle_lastupdate is update_ts_time_stats(),
the entire field can be eliminated.
In update_ts_time_stats(), idle_lastupdate is first set to "now",
and a few lines later, the only user is an if() statement that
assigns a variable either to "now" or to ts->idle_lastupdate,
which has the value of "now" at that point.
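Spelled out, the collapse looks like this (condensed from
update_ts_time_stats() before and after this patch):

	ts->idle_lastupdate = now;
	...
	if (last_update_time) {
		if (ts->idle_active)
			*last_update_time = ktime_to_us(ts->idle_lastupdate); /* == now */
		else
			*last_update_time = ktime_to_us(now);
	}

becomes, since both branches store ktime_to_us(now):

	if (last_update_time)
		*last_update_time = ktime_to_us(now);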
Signed-off-by: Arjan van de Ven <ar...@linux.intel.com>
---
include/linux/tick.h | 1 -
kernel/time/tick-sched.c | 9 ++-------
2 files changed, 2 insertions(+), 8 deletions(-)
diff --git a/include/linux/tick.h b/include/linux/tick.h
index d2ae79e..0343eed 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -60,7 +60,6 @@ struct tick_sched {
ktime_t idle_waketime;
ktime_t idle_exittime;
ktime_t idle_sleeptime;
- ktime_t idle_lastupdate;
ktime_t sleep_length;
unsigned long last_jiffies;
unsigned long next_jiffies;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ac54543..326f5f8 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -158,16 +158,11 @@ update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
{
ktime_t delta;
- ts->idle_lastupdate = now;
if (ts->idle_active) {
delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
ts->idle_entrytime = now;
}
-	if (last_update_time) {
-		if (ts->idle_active)
-			*last_update_time = ktime_to_us(ts->idle_lastupdate);
-		else
-			*last_update_time = ktime_to_us(now);
-	}
+	if (last_update_time)
+		*last_update_time = ktime_to_us(now);
}
--
1.6.2.5
Reviewed-by: Rik van Riel <ri...@redhat.com>
--
All rights reversed
What about the issue Pavel reported and Willy confirmed (regression
on older hardware)?
Best,
Dominik
Given the scheduler changes, these probably make more sense to go
via one of Ingo's trees?
If everyone agrees, feel free to add my Signed-off-by:
Dave
>> Given the scheduler changes, these probably make more sense to go
>> via one of Ingo's trees?
>> If everyone agrees, feel free to add my Signed-off-by:
>
> What about the issue Pavel reported and Willy confirmed (regression
> on older hardware)?
Having a /sys tunable makes sense, especially when running
on battery power. I guess that can be introduced as a patch
8/7 :)
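For that hypothetical 8/7, something along these lines might do (a
sketch only: the io_is_busy name and the dbs_tuners_ins/dbs_mutex
plumbing are assumptions about ondemand's internals, and it would
default to 0 to keep the old behaviour):

/* Hypothetical tunable: 0 = iowait counts as idle (the old behaviour,
 * friendlier on battery), 1 = iowait counts as busy. */
static ssize_t store_io_is_busy(struct kobject *a, struct attribute *b,
				const char *buf, size_t count)
{
	unsigned int input;

	if (sscanf(buf, "%u", &input) != 1)
		return -EINVAL;

	mutex_lock(&dbs_mutex);
	dbs_tuners_ins.io_is_busy = !!input;
	mutex_unlock(&dbs_mutex);

	return count;
}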
--
All rights reversed
... only if combined with sane default values.
Best,
Dominik