From: Nadav Har'El <
n...@scylladb.com>
This patch adds "real-time priority" to OSv threads, which can be used
to implement Posix's SCHED_FIFO and SCHED_RR scheduling policies.
Refs #386. It doesn't yet "Fixes" it because this patch does not yet add
the Posix front-end to this implementation (sched_setscheduler() et al.) -
this will be added in a separate patch.
We add to each thread a _realtime._priority, an unsigned integer.
Normal (non-real-time) threads have _realtime._priority of 0, and are
time-shared fairly as before, while realtime threads have
_realtime._priority > 0, and preempt any thread with a lower realtime
priority, including all normal threads.
We add a new API, t->set_realtime_priority(priority), to make a thread
real-time scheduled. The "priority" is an unsigned int as explained above
(0 means normal thread). The resulting scheduling policy is fully
compatible with POSIX's SCHED_FIFO. This includes following the intricate
rules on what happens to the running thread's place in the queue when there
are several real-time threads with the same priority and the running
thread yields, waits, or is preempted by a higher-priority thread.
This patch also adds an exercise for these cases in tests/misc-scheduler.cc.
This patch does not yet implement time slices as needed for supporting
POSIX's SCHED_RR policy. This patch already includes a time-slice setting
API, t->set_realtime_time_slice(duration), but if used it produces a
warning message, and the scheduler currently ignores this setting.
Adding support for real-time time slices is left for a future patch:
To implement time slices, we will need to keep in _realtime the total
time this thread ran since starting this slice (only zeroed when the
thread is removed from the run queue), and each time we switch to a
realtime thread with timeslice >= 0, we'll set an appropriate timer
(for timeslice minus already used time).
I tested that although this patch adds code into the scheduler it does
not noticably slow down context switches of normal threads. E.g.,
tests/misc-ctxsw.so measured colocated context switch time of 240ns
before this patch, and exactly the same after it.
While developing this patch I tested it using added workloads in
tests/misc-scheduler.cc, included in this patch, but unfortunately
this is not a real regression test - it has no assertions and cannot
automatically detect failure (and, like all the "misc-*" tests, it
doesn't run automatically in "make check").
Signed-off-by: Nadav Har'El <
n...@scylladb.com>
---
core/sched.cc | 82 ++++++++++++++++++++++++++++++++-----
include/osv/sched.hh | 75 +++++++++++++++++++++++++++++++++-
tests/misc-scheduler.cc | 89 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 235 insertions(+), 11 deletions(-)
diff --git a/core/sched.cc b/core/sched.cc
index 65842ff3..68f49bcd 100644
--- a/core/sched.cc
+++ b/core/sched.cc
@@ -26,6 +26,7 @@
#include <osv/preempt-lock.hh>
#include <osv/app.hh>
#include <osv/symbols.hh>
+#include <osv/stubbing.hh>
MAKE_SYMBOL(sched::thread::current);
MAKE_SYMBOL(sched::cpu::current);
@@ -290,7 +291,16 @@ void cpu::reschedule_from_interrupt(bool called_from_yield,
#endif
} else if (!called_from_yield) {
auto &t = *runqueue.begin();
- if (p->_runtime.get_local() < t._runtime.get_local()) {
+ if (p->_realtime._priority > 0) {
+ // Only switch to a higher-priority realtime thread
+ if (t._realtime._priority <= p->_realtime._priority) {
+#ifdef __aarch64__
+ return switch_data;
+#else
+ return;
+#endif
+ }
+ } else if (t._realtime._priority == 0 && p->_runtime.get_local() < t._runtime.get_local()) {
preemption_timer.cancel();
auto delta = p->_runtime.time_until(t._runtime.get_local());
if (delta > 0) {
@@ -302,6 +312,20 @@ void cpu::reschedule_from_interrupt(bool called_from_yield,
return;
#endif
}
+ } else { /* called_from_yield */
+ if (p->_realtime._priority > 0) {
+ auto &t = *runqueue.begin();
+ // When yielding, only switch if the next thread has a higher
+ // or same priority as p (i.e., don't switch if t it has a
+ // lesser priority than p).
+ if (t._realtime._priority < p->_realtime._priority) {
+#ifdef __aarch64__
+ return switch_data;
+#else
+ return;
+#endif
+ }
+ }
}
// If we're here, p no longer has the lowest runtime. Before queuing
// p, return the runtime it borrowed for hysteresis.
@@ -309,7 +333,10 @@ void cpu::reschedule_from_interrupt(bool called_from_yield,
p->_detached_state->st.store(thread::status::queued);
if (!called_from_yield) {
- enqueue(*p);
+ // POSIX requires that if a real-time thread doesn't yield but
+ // rather is preempted by a higher-priority thread, it be
+ // reinserted into the runqueue first, not last, among its equals.
+ enqueue_first_equal(*p);
}
trace_sched_preempt();
@@ -350,16 +377,18 @@ void cpu::reschedule_from_interrupt(bool called_from_yield,
n->_runtime.add_context_switch_penalty();
}
preemption_timer.cancel();
- if (!called_from_yield) {
- if (!runqueue.empty()) {
- auto& t = *runqueue.begin();
- auto delta = n->_runtime.time_until(t._runtime.get_local());
- if (delta > 0) {
- preemption_timer.set_with_irq_disabled(now + delta);
+ if (n->_realtime._priority == 0) {
+ if (!called_from_yield) {
+ if (!runqueue.empty()) {
+ auto& t = *runqueue.begin();
+ auto delta = n->_runtime.time_until(t._runtime.get_local());
+ if (delta > 0) {
+ preemption_timer.set_with_irq_disabled(now + delta);
+ }
}
+ } else {
+ preemption_timer.set_with_irq_disabled(now + preempt_after);
}
- } else {
- preemption_timer.set_with_irq_disabled(now + preempt_after);
}
if (app_thread.load(std::memory_order_relaxed) != n->_app) { // don't write into a cache line if it can be avoided
@@ -530,6 +559,16 @@ void cpu::enqueue(thread& t)
runqueue.insert_equal(t);
}
+// When the run queue has several threads with equal thread_runtime_compare,
+// enqueue() puts a thread after its equals, while enqueue_first_equal()
+// puts it before its equals. The distinction is mostly interesting for real-
+// time priority threads.
+void cpu::enqueue_first_equal(thread& t)
+{
+ trace_sched_queue(&t);
+ runqueue.insert_before(runqueue.lower_bound(t), t);
+}
+
void cpu::init_on_cpu()
{
arch.init_on_cpu();
@@ -866,6 +905,29 @@ float thread::priority() const
return _runtime.priority();
}
+void thread::set_realtime_priority(unsigned priority)
+{
+ _realtime._priority = priority;
+}
+
+unsigned thread::realtime_priority() const
+{
+ return _realtime._priority;
+}
+
+void thread::set_realtime_time_slice(thread_realtime::duration time_slice)
+{
+ if (time_slice > 0) {
+ WARN_ONCE("set_realtime_time_slice() used but real-time time slices"
+ " not yet supported");
+ }
+ _realtime._time_slice = time_slice;
+}
+
+thread_realtime::duration thread::realtime_time_slice() const {
+ return _realtime._time_slice;
+}
+
sched::thread::status thread::get_status() const
{
return _detached_state->st.load(std::memory_order_relaxed);
diff --git a/include/osv/sched.hh b/include/osv/sched.hh
index 8a2694cb..c1f414ba 100644
--- a/include/osv/sched.hh
+++ b/include/osv/sched.hh
@@ -313,6 +313,24 @@ private:
int _renormalize_count;
};
+// "Normal" threads are fairly time-shared according to the thread_runtime
+// data above. Such normal threads all have _realtime._priority == 0.
+// A "real-time" thread is one where _realtime._priority > 0. The scheduler
+// always picks the thread with the highest _realtime_priority to run next;
+// In particular normal threads run only when no real-time thread wants to
+// run. When several real-time threads with equal _realtime._priority want to
+// run, each one is run for _realtime._time_slice before switching to the
+// next one; If _realtime_time_slice is == 0, there is no limit on the amount
+// of time the thread may run - it will only be preempted when it waits,
+// yields, or a higher-priority thread comes along.
+// The realtime scheduling policy matches the POSIX SCHED_RR and SCHED_FIFO.
+class thread_realtime {
+public:
+ using duration = thread_runtime::duration;
+ unsigned _priority = 0;
+ duration _time_slice = duration::zero();
+};
+
// "tau" controls the length of the history we consider for scheduling,
// or more accurately the rate of decay of an exponential moving average.
// In particular, it can be seen that if a thread has been monopolizing the
@@ -615,6 +633,54 @@ public:
* explained in set_priority().
*/
float priority() const;
+ /**
+ * Set thread's real-time priority
+ *
+ * By default new threads have a "real-time priority" of 0 and participate
+ * in fair time-sharing of the CPUs (the share is determined by
+ * set_priority()).
+ *
+ * A thread becomes a "real-time" thread by setting a positive integer as
+ * real-time priority. The scheduler always picks the thread with the
+ * highest real-time priority to run next; In particular normal threads run
+ * only when no real-time thread wants to run. When several real-time
+ * threads with equal real-time priority want to run, each one is run
+ * until it waits or yields, before switching to the next one.
+ *
+ * The real-time scheduling policy matches POSIX's "SCHED_FIFO" policy,
+ * or "SCHED_RR" if set_realtime_time_slice() was also used.
+ */
+ void set_realtime_priority(unsigned priority);
+ /**
+ * Get thread's real-time priority
+ *
+ * Returns the thread's real-time priority, an unsigned integer whose
+ * meaning is explained in set_realtime_priority().
+ */
+ unsigned realtime_priority() const;
+ /**
+ * Set thread's real-time scheduling time slice.
+ *
+ * By default, real-time threads (see set_realtime_priority()) continue to
+ * run until they yield, wait, or are preempted by a higher-priority
+ * real-time thread. When a time_slice > 0 is set for a thread with this
+ * function, a thread will be limited to that time-slice before it
+ * automatically yields to the next thread of equal real-time priority (if
+ * any). Setting time_slice == 0 reverts to the default behavior, disabling
+ * the time-slice limit for the thread.
+ *
+ * With time_slice == 0, the real-time scheduling policy matches POSIX's
+ * "SCHED_FIFO" policy. With time_slice > 0, it matches POSIX's "SCHED_RR"
+ * policy.
+ */
+ void set_realtime_time_slice(thread_realtime::duration time_slice);
+ /**
+ * Get thread's real-time scheduling time slice
+ *
+ * Returns the thread's real-time scheduling time slice, whose meaning is
+ * explained in set_realtime_time_slice().
+ */
+ thread_realtime::duration realtime_time_slice() const;
/**
* Prevent a waiting thread from ever waking (returns false if the thread
* was not in waiting state). This capability is not safe: If the thread
@@ -708,6 +774,7 @@ private:
//
// wake() on any state except waiting is discarded.
thread_runtime _runtime;
+ thread_realtime _realtime;
// part of the thread state is detached from the thread structure,
// and freed by rcu, so that waking a thread and destroying it can
// occur in parallel without synchronization via thread_handle
@@ -862,7 +929,12 @@ osv::clock::uptime::duration process_cputime();
class thread_runtime_compare {
public:
bool operator()(const thread& t1, const thread& t2) const {
- return t1._runtime.get_local() < t2._runtime.get_local();
+ if (t1._realtime._priority > t2._realtime._priority)
+ return true;
+ else if (t2._realtime._priority > 0)
+ return false;
+ else
+ return t1._runtime.get_local() < t2._runtime.get_local();
}
};
@@ -931,6 +1003,7 @@ struct cpu : private timer_base::client {
thread_runtime::duration preempt_after = thyst);
#endif
void enqueue(thread& t);
+ void enqueue_first_equal(thread& t);
void init_idle_thread();
virtual void timer_fired() override;
class notifier;
diff --git a/tests/misc-scheduler.cc b/tests/misc-scheduler.cc
index f4de24d1..578fabe1 100644
--- a/tests/misc-scheduler.cc
+++ b/tests/misc-scheduler.cc
@@ -139,6 +139,66 @@ void priority_test(std::vector<float> ps)
}
#endif
+#ifdef __OSV__
+void realtime_test(std::vector<int> ps)
+{
+ std::cerr << "Starting realtime test\n";
+ std::vector<std::thread> threads;
+ mutex mtx;
+ for (auto p : ps) {
+ threads.push_back(std::thread([p]() {
+ sched::thread::current()->set_realtime_priority(p);
+ std::cout << "Starting thread with realtime priority " << p << "\n";
+ // Sleep a bit, to let all test threads get started. The thread
+ // starting the test threads is not realtime, so it can be preempted
+ // by the test threads.
+ sleep(1);
+ for (int i=0; i<10; i++) {
+ _loop(100000);
+ std::cout << p << std::flush;
+ }
+ }));
+ }
+ for (auto &t : threads) {
+ t.join();
+ }
+ std::cerr << "\nRealtime test done\n";
+}
+
+void realtime_test2(bool yield)
+{
+ std::cerr << "Starting realtime test #2 - FIFO order, yield=" << yield << "\n";
+ std::vector<std::thread> threads;
+ mutex mtx;
+ std::atomic<int> last_seen(-1);
+ for (int p = 0; p < 10; p++) {
+ threads.push_back(std::thread([p,yield,&last_seen]() {
+ sched::thread::current()->set_realtime_priority(1);
+ // Sleep a bit, to let all test threads get started. The thread
+ // starting the test threads is not realtime, so it can be preempted
+ // by the test threads.
+ sleep(1);
+ for (int i = 0 ; i < 3; i++) {
+ for(int j = 0; j < 100000; j++) {
+ if (last_seen.exchange(p) != p) {
+ std::cout << p << std::flush; // context-switched to p
+ }
+ _loop(1);
+ }
+ if (yield)
+ sched::thread::yield();
+ else
+ sched::thread::sleep(std::chrono::milliseconds(1));
+ }
+ }));
+ }
+ for (auto &t : threads) {
+ t.join();
+ }
+ std::cerr << "\nRealtime test #2 done\n";
+}
+#endif
+
int main()
{
@@ -148,6 +208,35 @@ int main()
return 0;
}
+ #ifdef __OSV__
+ // Tests for thread::set_realtime() support for POSIX-like realtime
+ // scheduling.
+ // TODO: Move this code into a real test, and in addition to just
+ // printing progress, also save it into a string and check this string.
+ // (Need to check the first 10 characters of this string repeat 2 more
+ // times and that's it).
+ realtime_test({0, 1, 2});
+ realtime_test2(false);
+ realtime_test2(true);
+ // Check that intermittent thread with priority 2 doesn't force
+ // realtime_test2 to context-switch more often than it normally
+ // should (each time we the priority-2 thread sleeps, we need to
+ // go back to the same priority-1 thread that previously ran - not
+ // to the next one). We expect the output from the test below to
+ // be identical to that from the test above.
+ std::cout << "Additional intermittent thread with priority 2\n";
+ std::atomic<bool> stop(false);
+ std::thread ti([&stop]() {
+ sched::thread::current()->set_realtime_priority(2);
+ while (!stop.load()) {
+ sched::thread::sleep(std::chrono::milliseconds(5));
+ }
+ });
+ realtime_test2(true);
+ stop.store(true);
+ ti.join();
+#endif
+
#ifdef __OSV__
auto p = sched::thread::priority_default;
priority_test({p, p*4});
--
2.30.2