[PATCH seastar v1 0/3] Add kernel stack trace reporting for stalls

79 views
Skip to first unread message

Avi Kivity

<avi@scylladb.com>
unread,
Sep 1, 2021, 12:00:38 PM9/1/21
to seastar-dev@googlegroups.com
We've observed CPU stalls in TCP processing[1] and when paging in
shard memory[2]. To allow investigation of exactly what is happening
in this and similar situations, extend the stall detector to report
kernel stacks as well as user-space stacks.

This works by using the Linux perf_event subsystem (when available) and
asking it to capture a kernel stack. When we do, we print it out.

Example output:

Reactor stalled for 1176 ms on shard 0. Backtrace: 0x4969a4 0x463a88 0x463f7b 0x4640f7 0x7fa06b8aaa1f 0xfafb5 0x4282dd 0x42a5f3 0x426e7c
kernel callstack: 0xffffffffffffff80 0xffffffff9569a327 0xffffffff952ccf56 0xffffffff952cf882 0xffffffff952d21e9 0xffffffff952d3fc6 0xffffffff952f1e80 0xffffffff952b2261 0xffffffff952b2bf5 0xffffffff952a780c 0xffffffff952aa0c6 0xffffffff952aa181 0xffffffff952930ad 0xffffffff95c05c10 0xffffffff95e0007c

This was generated by a mmap() call that was intentionally made to
stall, and the kernel trace (when decoded) indeed points at mmap
internals.

https://github.com/avikivity/seastar stall-kernel-stack-signal

Avi Kivity (3):
cpu_stall_detector: make abstract
cpu_stall_detector: add perf_event stall detector
stall_detector: record and print kernel stack trace

src/core/stall_detector.hh | 44 +++++++++-
src/core/reactor.cc | 175 ++++++++++++++++++++++++++++++++++---
2 files changed, 203 insertions(+), 16 deletions(-)

--
2.31.1

Avi Kivity

<avi@scylladb.com>
unread,
Sep 1, 2021, 12:00:39 PM9/1/21
to seastar-dev@googlegroups.com
Split cpu_stall_detector into a base class with the common logic and a
derived class with the mechanism to detect the stall (i.e. the timer).

This prepares the was for a stall detector that uses Linux perf_event,
when available, which can also collect kernel traces.
---
src/core/stall_detector.hh | 24 ++++++++++++++++++++----
src/core/reactor.cc | 32 ++++++++++++++++++++------------
2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/src/core/stall_detector.hh b/src/core/stall_detector.hh
index f0b066ed9c..c2f555e8ff 100644
--- a/src/core/stall_detector.hh
+++ b/src/core/stall_detector.hh
@@ -24,10 +24,11 @@

#include <signal.h>
#include <limits>
#include <chrono>
#include <functional>
+#include <memory>
#include <seastar/core/posix.hh>
#include <seastar/core/metrics_registration.hh>
#include <seastar/core/scheduling.hh>

namespace seastar {
@@ -45,11 +46,11 @@ struct cpu_stall_detector_config {
std::function<void ()> report; // alternative reporting function for tests
};

// Detects stalls in continuations that run for too long
class cpu_stall_detector {
- timer_t _timer;
+protected:
std::atomic<uint64_t> _last_tasks_processed_seen{};
unsigned _stall_detector_reports_per_minute;
std::atomic<uint64_t> _stall_detector_missed_ticks = { 0 };
unsigned _reported = 0;
unsigned _total_reported = 0;
@@ -63,27 +64,42 @@ class cpu_stall_detector {
sched_clock::duration _threshold;
sched_clock::duration _slack;
cpu_stall_detector_config _config;
seastar::metrics::metric_groups _metrics;
friend reactor;
+ virtual bool reap_event_and_check_spuriousness() {
+ return false;
+ }
private:
void maybe_report();
- void arm_timer();
+ virtual void arm_timer() = 0;
void report_suppressions(sched_clock::time_point now);
public:
using clock_type = thread_cputime_clock;
public:
explicit cpu_stall_detector(cpu_stall_detector_config cfg = {});
- ~cpu_stall_detector();
+ virtual ~cpu_stall_detector() = default;
static int signal_number() { return SIGRTMIN + 1; }
void start_task_run(sched_clock::time_point now);
void end_task_run(sched_clock::time_point now);
void generate_trace();
void update_config(cpu_stall_detector_config cfg);
cpu_stall_detector_config get_config() const;
void on_signal();
- void start_sleep();
+ virtual void start_sleep() = 0;
void end_sleep();
};

+class cpu_stall_detector_posix_timer : public cpu_stall_detector {
+ timer_t _timer;
+public:
+ explicit cpu_stall_detector_posix_timer(cpu_stall_detector_config cfg = {});
+ virtual ~cpu_stall_detector_posix_timer() override;
+private:
+ virtual void arm_timer() override;
+ virtual void start_sleep() override;
+};
+
+std::unique_ptr<cpu_stall_detector> make_cpu_stall_detector(cpu_stall_detector_config cfg = {});
+
}
}
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
index 2b6bd206b6..ad967b7ac9 100644
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -923,11 +923,11 @@ reactor::reactor(std::shared_ptr<smp> smp, alien::instance& alien, unsigned id,
, _timer_thread(
[&] { timer_thread_func(); }, sched::thread::attr().stack(4096).name("timer_thread").pin(sched::cpu::current()))
, _engine_thread(sched::thread::current())
#endif
, _cpu_started(0)
- , _cpu_stall_detector(std::make_unique<cpu_stall_detector>())
+ , _cpu_stall_detector(make_cpu_stall_detector())
, _reuseport(posix_reuseport_detect())
, _thread_pool(std::make_unique<thread_pool>(this, seastar::format("syscall-{}", id))) {
/*
* The _backend assignment is here, not on the initialization list as
* the chosen backend constructor may want to handle signals and thus
@@ -1059,29 +1059,31 @@ cpu_stall_detector::cpu_stall_detector(cpu_stall_detector_config cfg)
// If first stall detector invocation happens during another dlopen() call the calling thread
// will deadlock. The dummy call here makes sure that backtrace's initialization happens in
// a safe place.
backtrace([] (frame) {});
update_config(cfg);
+
+ namespace sm = seastar::metrics;
+
+ _metrics.add_group("stall_detector", {
+ sm::make_derive("reported", _total_reported, sm::description("Total number of reported stalls, look in the traces for the exact reason"))});
+
+ // note: if something is added here that can, it should take care to destroy _timer.
+}
+
+cpu_stall_detector_posix_timer::cpu_stall_detector_posix_timer(cpu_stall_detector_config cfg) : cpu_stall_detector(cfg) {
struct sigevent sev = {};
sev.sigev_notify = SIGEV_THREAD_ID;
sev.sigev_signo = signal_number();
sev._sigev_un._tid = syscall(SYS_gettid);
int err = timer_create(CLOCK_THREAD_CPUTIME_ID, &sev, &_timer);
if (err) {
throw std::system_error(std::error_code(err, std::system_category()));
}
-
- namespace sm = seastar::metrics;
-
- _metrics.add_group("stall_detector", {
- sm::make_derive("reported", _total_reported, sm::description("Total number of reported stalls, look in the traces for the exact reason"))});
-
-
- // note: if something is added here that can, it should take care to destroy _timer.
}

-cpu_stall_detector::~cpu_stall_detector() {
+cpu_stall_detector_posix_timer::~cpu_stall_detector_posix_timer() {
timer_delete(_timer);
}

cpu_stall_detector_config
cpu_stall_detector::get_config() const {
@@ -1138,11 +1140,11 @@ void cpu_stall_detector::report_suppressions(sched_clock::time_point now) {
_reported = 0;
_minute_mark = now;
}
}

-void cpu_stall_detector::arm_timer() {
+void cpu_stall_detector_posix_timer::arm_timer() {
auto its = posix::to_relative_itimerspec(_threshold * _report_at + _slack, 0s);
timer_settime(_timer, 0, &its, nullptr);
}

void cpu_stall_detector::start_task_run(sched_clock::time_point now) {
@@ -1160,19 +1162,25 @@ void cpu_stall_detector::start_task_run(sched_clock::time_point now) {
void cpu_stall_detector::end_task_run(sched_clock::time_point now) {
std::atomic_signal_fence(std::memory_order_acquire); // Don't hoist this write, so the signal handler can see it
_last_tasks_processed_seen.store(0, std::memory_order_relaxed);
}

-void cpu_stall_detector::start_sleep() {
+void cpu_stall_detector_posix_timer::start_sleep() {
auto its = posix::to_relative_itimerspec(0s, 0s);
timer_settime(_timer, 0, &its, nullptr);
_rearm_timer_at = reactor::now();
}

void cpu_stall_detector::end_sleep() {
}

+
+std::unique_ptr<cpu_stall_detector>
+internal::make_cpu_stall_detector(cpu_stall_detector_config cfg) {
+ return std::make_unique<cpu_stall_detector_posix_timer>(cfg);
+}
+
void
reactor::update_blocked_reactor_notify_ms(std::chrono::milliseconds ms) {
auto cfg = _cpu_stall_detector->get_config();
if (ms != cfg.threshold) {
cfg.threshold = ms;
--
2.31.1

Avi Kivity

<avi@scylladb.com>
unread,
Sep 1, 2021, 12:00:40 PM9/1/21
to seastar-dev@googlegroups.com
The Linux perf_event subsystem can collect kernel traces, for when
the stall happens in the kernel, so add a new stall detector and
use it preferentially when available (it might not be available due
to settings of perf_event_paranoid.

The ioctl that resets the count does not appear to work 100%
reliably, so we detect false positives and ignore them.

Note we don't yet report kernel traces; this will come in the next
patch.
---
src/core/stall_detector.hh | 12 +++++
src/core/reactor.cc | 93 ++++++++++++++++++++++++++++++++++++++
2 files changed, 105 insertions(+)

diff --git a/src/core/stall_detector.hh b/src/core/stall_detector.hh
index c2f555e8ff..7642f19d35 100644
--- a/src/core/stall_detector.hh
+++ b/src/core/stall_detector.hh
@@ -97,9 +97,21 @@ class cpu_stall_detector_posix_timer : public cpu_stall_detector {
private:
virtual void arm_timer() override;
virtual void start_sleep() override;
};

+class cpu_stall_detector_linux_perf_event : public cpu_stall_detector {
+ file_desc _fd;
+ bool _enabled = false;
+ uint64_t _current_period = 0;
+public:
+ static std::unique_ptr<cpu_stall_detector_linux_perf_event> try_make(cpu_stall_detector_config cfg = {});
+ explicit cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg = {});
+ virtual void arm_timer() override;
+ virtual void start_sleep() override;
+ virtual bool reap_event_and_check_spuriousness() override;
+};
+
std::unique_ptr<cpu_stall_detector> make_cpu_stall_detector(cpu_stall_detector_config cfg = {});

}
}
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
index ad967b7ac9..0e7d3bc519 100644
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -81,10 +81,11 @@
#include <boost/version.hpp>
#include <atomic>
#include <dirent.h>
#include <linux/types.h> // for xfs, below
#include <sys/ioctl.h>
+#include <linux/perf_event.h>
#include <xfs/linux.h>
#define min min /* prevent xfs.h from defining min() as a macro */
#include <xfs/xfs.h>
#undef min
#ifdef SEASTAR_HAVE_DPDK
@@ -1108,10 +1109,13 @@ void cpu_stall_detector::maybe_report() {
// Best case it's a correctly predicted branch. If a backtrace had happened in
// the near past it's an increment and two branches.
//
// We can do it a cheaper if we don't report suppressed backtraces.
void cpu_stall_detector::on_signal() {
+ if (reap_event_and_check_spuriousness()) {
+ return;
+ }
auto tasks_processed = engine().tasks_processed();
auto last_seen = _last_tasks_processed_seen.load(std::memory_order_relaxed);
if (!last_seen) {
return; // stall detector in not active
} else if (last_seen == tasks_processed) { // no task was processed - report
@@ -1171,13 +1175,102 @@ void cpu_stall_detector_posix_timer::start_sleep() {
}

void cpu_stall_detector::end_sleep() {
}

+static long
+perf_event_open(struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) {
+ return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+}
+
+cpu_stall_detector_linux_perf_event::cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg)
+ : cpu_stall_detector(cfg), _fd(std::move(fd)) {
+}
+
+void
+cpu_stall_detector_linux_perf_event::arm_timer() {
+ uint64_t ns = (_threshold * _report_at + _slack) / 1ns;
+ if (_enabled && _current_period == ns) [[likely]] {
+ // Common case - we're re-arming with the same period, the counter
+ // is already enabled.
+ _fd.ioctl(PERF_EVENT_IOC_RESET, 0);
+ } else {
+ // Uncommon case - we're moving from disabled to enabled, or changing
+ // the period. Issue more calls and be careful.
+ _fd.ioctl(PERF_EVENT_IOC_DISABLE, 0); // avoid false alarms while we modify stuff
+ _fd.ioctl(PERF_EVENT_IOC_PERIOD, ns);
+ _fd.ioctl(PERF_EVENT_IOC_RESET, 0);
+ _fd.ioctl(PERF_EVENT_IOC_ENABLE, 0);
+ _enabled = true;
+ _current_period = ns;
+ }
+}
+
+void
+cpu_stall_detector_linux_perf_event::start_sleep() {
+ _fd.ioctl(PERF_EVENT_IOC_DISABLE, 0);
+ _enabled = false;
+}
+
+bool
+cpu_stall_detector_linux_perf_event::reap_event_and_check_spuriousness() {
+ struct read_format {
+ uint64_t value;
+ } buf;
+ _fd.read(&buf, sizeof(read_format));
+ return buf.value < _current_period;
+}
+
+std::unique_ptr<cpu_stall_detector_linux_perf_event>
+cpu_stall_detector_linux_perf_event::try_make(cpu_stall_detector_config cfg) {
+ ::perf_event_attr pea = {
+ .type = PERF_TYPE_SOFTWARE,
+ .size = sizeof(pea),
+ .config = PERF_COUNT_SW_TASK_CLOCK, // more likely to work on virtual machines than hardware events
+ .sample_period = 1'000'000'000, // Needs non-zero value or PERF_IOC_PERIOD gets confused
+ .disabled = 1,
+ .wakeup_events = 1,
+ };
+ unsigned long flags = 0;
+ if (kernel_uname().whitelisted({"3.14"})) {
+ flags |= PERF_FLAG_FD_CLOEXEC;
+ }
+ int fd = perf_event_open(&pea, 0, -1, -1, flags);
+ if (fd == -1) {
+ return {};
+ }
+ auto desc = file_desc::from_fd(fd);
+ struct f_owner_ex sig_owner = {
+ .type = F_OWNER_TID,
+ .pid = gettid(),
+ };
+ auto ret1 = ::fcntl(fd, F_SETOWN_EX, &sig_owner);
+ if (ret1 == -1) {
+ abort();
+ }
+ auto ret2 = ::fcntl(fd, F_SETSIG, signal_number());
+ if (ret2 == -1) {
+ abort();
+ }
+ auto fd_flags = ::fcntl(fd, F_GETFL);
+ if (fd_flags == -1) {
+ abort();
+ }
+ auto ret3 = ::fcntl(fd, F_SETFL, fd_flags | O_ASYNC);
+ if (ret3 == -1) {
+ abort();
+ }
+ return std::make_unique<cpu_stall_detector_linux_perf_event>(std::move(desc));
+}
+

std::unique_ptr<cpu_stall_detector>
internal::make_cpu_stall_detector(cpu_stall_detector_config cfg) {
+ auto csd = cpu_stall_detector_linux_perf_event::try_make(cfg);
+ if (csd) {
+ return csd;
+ }
return std::make_unique<cpu_stall_detector_posix_timer>(cfg);
}

void
reactor::update_blocked_reactor_notify_ms(std::chrono::milliseconds ms) {
--
2.31.1

Avi Kivity

<avi@scylladb.com>
unread,
Sep 1, 2021, 12:00:42 PM9/1/21
to seastar-dev@googlegroups.com
Enable kernel stack collection. Note this can only be gathered via
the mmap interface, so mmap the ring buffer and use it.

I tested this against a large mmap(MAP_POPULATE) to force a kernel
stall, and it correctly reported the kernel trace. Note that it will
only report time when the thread is active (spin loops or real work).
If the task is preempted or asleep (say due to contention on a mutex)
this will not be reported here.
---
src/core/stall_detector.hh | 8 ++++++
src/core/reactor.cc | 50 ++++++++++++++++++++++++++++++++++++++
2 files changed, 58 insertions(+)

diff --git a/src/core/stall_detector.hh b/src/core/stall_detector.hh
index 7642f19d35..429992a37f 100644
--- a/src/core/stall_detector.hh
+++ b/src/core/stall_detector.hh
@@ -29,10 +29,12 @@
#include <memory>
#include <seastar/core/posix.hh>
#include <seastar/core/metrics_registration.hh>
#include <seastar/core/scheduling.hh>

+struct perf_event_mmap_page;
+
namespace seastar {

class reactor;
class thread_cputime_clock;

@@ -67,10 +69,11 @@ class cpu_stall_detector {
seastar::metrics::metric_groups _metrics;
friend reactor;
virtual bool reap_event_and_check_spuriousness() {
return false;
}
+ virtual void maybe_report_kernel_trace() {}
private:
void maybe_report();
virtual void arm_timer() = 0;
void report_suppressions(sched_clock::time_point now);
public:
@@ -101,16 +104,21 @@ class cpu_stall_detector_posix_timer : public cpu_stall_detector {

class cpu_stall_detector_linux_perf_event : public cpu_stall_detector {
file_desc _fd;
bool _enabled = false;
uint64_t _current_period = 0;
+ struct ::perf_event_mmap_page* _mmap;
+ char* _data_area;
+ size_t _data_area_mask;
public:
static std::unique_ptr<cpu_stall_detector_linux_perf_event> try_make(cpu_stall_detector_config cfg = {});
explicit cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg = {});
+ ~cpu_stall_detector_linux_perf_event();
virtual void arm_timer() override;
virtual void start_sleep() override;
virtual bool reap_event_and_check_spuriousness() override;
+ virtual void maybe_report_kernel_trace() override;
};

std::unique_ptr<cpu_stall_detector> make_cpu_stall_detector(cpu_stall_detector_config cfg = {});

}
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
index 0e7d3bc519..e5252bd181 100644
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -1182,10 +1182,21 @@ perf_event_open(struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_
return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}

cpu_stall_detector_linux_perf_event::cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg)
: cpu_stall_detector(cfg), _fd(std::move(fd)) {
+ void* ret = ::mmap(nullptr, 2*getpagesize(), PROT_READ|PROT_WRITE, MAP_SHARED, _fd.get(), 0);
+ if (ret == MAP_FAILED) {
+ abort();
+ }
+ _mmap = static_cast<struct ::perf_event_mmap_page*>(ret);
+ _data_area = reinterpret_cast<char*>(_mmap) + getpagesize();
+ _data_area_mask = getpagesize() - 1;
+}
+
+cpu_stall_detector_linux_perf_event::~cpu_stall_detector_linux_perf_event() {
+ ::munmap(_mmap, 2*getpagesize());
}

void
cpu_stall_detector_linux_perf_event::arm_timer() {
uint64_t ns = (_threshold * _report_at + _slack) / 1ns;
@@ -1218,18 +1229,56 @@ cpu_stall_detector_linux_perf_event::reap_event_and_check_spuriousness() {
} buf;
_fd.read(&buf, sizeof(read_format));
return buf.value < _current_period;
}

+void
+cpu_stall_detector_linux_perf_event::maybe_report_kernel_trace() {
+ auto tail = _mmap->data_tail;
+ auto head = _mmap->data_head;
+ std::atomic_thread_fence(std::memory_order_acquire); // required after reading data_head
+ auto current_record = [&] () -> ::perf_event_header* {
+ return reinterpret_cast<::perf_event_header*>(_data_area + (tail & _data_area_mask));
+ };
+
+ // Skip all but last record
+ while (tail + current_record()->size != head) {
+ tail += current_record()->size;
+ }
+
+ auto last_record = current_record();
+
+ if (last_record->type == PERF_RECORD_SAMPLE) {
+ struct sample_record : perf_event_header {
+ uint64_t nr;
+ uint64_t ips[];
+ };
+ auto sample = static_cast<sample_record*>(last_record);
+ backtrace_buffer buf;
+ buf.append("kernel callstack:");
+ for (uint64_t i = 0; i < sample->nr; ++i) {
+ buf.append(" 0x");
+ buf.append_hex(uintptr_t(sample->ips[i]));
+ }
+ buf.append("\n");
+ buf.flush();
+ };
+
+ std::atomic_thread_fence(std::memory_order_release); // not documented, but probably required before reading data_tail
+ _mmap->data_tail = tail;
+}
+
std::unique_ptr<cpu_stall_detector_linux_perf_event>
cpu_stall_detector_linux_perf_event::try_make(cpu_stall_detector_config cfg) {
::perf_event_attr pea = {
.type = PERF_TYPE_SOFTWARE,
.size = sizeof(pea),
.config = PERF_COUNT_SW_TASK_CLOCK, // more likely to work on virtual machines than hardware events
.sample_period = 1'000'000'000, // Needs non-zero value or PERF_IOC_PERIOD gets confused
+ .sample_type = PERF_SAMPLE_CALLCHAIN,
.disabled = 1,
+ .exclude_callchain_user = 1, // we're using backtrace() to capture the user callchain
.wakeup_events = 1,
};
unsigned long flags = 0;
if (kernel_uname().whitelisted({"3.14"})) {
flags |= PERF_FLAG_FD_CLOEXEC;
@@ -1318,10 +1367,11 @@ cpu_stall_detector::generate_trace() {
backtrace_buffer buf;
buf.append("Reactor stalled for ");
buf.append_decimal(uint64_t(delta / 1ms));
buf.append(" ms");
print_with_backtrace(buf, _config.oneline);
+ maybe_report_kernel_trace();
}

template <typename T, typename E, typename EnableFunc>
void reactor::complete_timers(T& timers, E& expired_timers, EnableFunc&& enable_fn) noexcept(noexcept(enable_fn())) {
expired_timers = timers.expire(timers.now());
--
2.31.1

Avi Kivity

<avi@scylladb.com>
unread,
Sep 1, 2021, 12:05:10 PM9/1/21
to seastar-dev@googlegroups.com

On 01/09/2021 19.00, Avi Kivity wrote:
> We've observed CPU stalls in TCP processing[1] and when paging in
> shard memory[2]. To allow investigation of exactly what is happening
> in this and similar situations, extend the stall detector to report
> kernel stacks as well as user-space stacks.
>
> This works by using the Linux perf_event subsystem (when available) and
> asking it to capture a kernel stack. When we do, we print it out.
>
> Example output:
>
> Reactor stalled for 1176 ms on shard 0. Backtrace: 0x4969a4 0x463a88 0x463f7b 0x4640f7 0x7fa06b8aaa1f 0xfafb5 0x4282dd 0x42a5f3 0x426e7c
> kernel callstack: 0xffffffffffffff80 0xffffffff9569a327 0xffffffff952ccf56 0xffffffff952cf882 0xffffffff952d21e9 0xffffffff952d3fc6 0xffffffff952f1e80 0xffffffff952b2261 0xffffffff952b2bf5 0xffffffff952a780c 0xffffffff952aa0c6 0xffffffff952aa181 0xffffffff952930ad 0xffffffff95c05c10 0xffffffff95e0007c
>
> This was generated by a mmap() call that was intentionally made to
> stall, and the kernel trace (when decoded) indeed points at mmap
> internals.
>

Missing references:


[1] https://github.com/scylladb/scylla/issues/7882

[2] https://github.com/scylladb/scylla/issues/8828


Avi Kivity

<avi@scylladb.com>
unread,
Sep 14, 2021, 3:00:17 AM9/14/21
to seastar-dev@googlegroups.com
Review bump

On 01/09/2021 19.00, Avi Kivity wrote:

Pavel Emelyanov

<xemul@scylladb.com>
unread,
Sep 14, 2021, 5:53:59 AM9/14/21
to Avi Kivity, seastar-dev@googlegroups.com
Since you don't declare in this patch the __NR_perf_event_open, then it should
work with glibcs fresh enough to have it. But in this case, why not expect that
it should already contain the perf_event_open(2) wrapper itself?
I think it's worth printing something about why perf-event stall detector cannot be enabled.

> + return {};
> + }
> + auto desc = file_desc::from_fd(fd);
> + struct f_owner_ex sig_owner = {
> + .type = F_OWNER_TID,
> + .pid = gettid(),
> + };
> + auto ret1 = ::fcntl(fd, F_SETOWN_EX, &sig_owner);
> + if (ret1 == -1) {
> + abort();
> + }
> + auto ret2 = ::fcntl(fd, F_SETSIG, signal_number());
> + if (ret2 == -1) {
> + abort();
> + }
> + auto fd_flags = ::fcntl(fd, F_GETFL);
> + if (fd_flags == -1) {
> + abort();
> + }
> + auto ret3 = ::fcntl(fd, F_SETFL, fd_flags | O_ASYNC);
> + if (ret3 == -1) {
> + abort();
> + }
> + return std::make_unique<cpu_stall_detector_linux_perf_event>(std::move(desc));

Is the cpu_stall_detector_config missing deliberately?

Avi Kivity

<avi@scylladb.com>
unread,
Sep 14, 2021, 7:52:12 AM9/14/21
to Pavel Emelyanov, seastar-dev@googlegroups.com
There isn't one:


$ grep -r perf_event_open /usr/include
/usr/include/asm/unistd_x32.h:#define __NR_perf_event_open
(__X32_SYSCALL_BIT + 298)
/usr/include/asm/unistd_32.h:#define __NR_perf_event_open 336
/usr/include/asm/unistd_64.h:#define __NR_perf_event_open 298
/usr/include/asm-generic/unistd.h:#define __NR_perf_event_open 241
/usr/include/asm-generic/unistd.h:__SYSCALL(__NR_perf_event_open,
sys_perf_event_open)
/usr/include/linux/bpf.h: *         call **perf_event_open**\ () on the
perf event (either for
/usr/include/linux/perf_event.h: * attr.event_id parameter of the
sys_perf_event_open()
/usr/include/bits/syscall.h:#ifdef __NR_perf_event_open
/usr/include/bits/syscall.h:# define SYS_perf_event_open
__NR_perf_event_open
/usr/include/sanitizer/linux_syscall_hooks.h:#define
__sanitizer_syscall_pre_perf_event_open(attr_uptr, pid, cpu, group_fd, \
/usr/include/sanitizer/linux_syscall_hooks.h:
__sanitizer_syscall_pre_impl_perf_event_open((long)(attr_uptr),
(long)(pid), \
/usr/include/sanitizer/linux_syscall_hooks.h:#define
__sanitizer_syscall_post_perf_event_open(res, attr_uptr, pid, cpu, \
/usr/include/sanitizer/linux_syscall_hooks.h:
__sanitizer_syscall_post_impl_perf_event_open( \
/usr/include/sanitizer/linux_syscall_hooks.h:void
__sanitizer_syscall_pre_impl_perf_event_open(long attr_uptr, long pid,
/usr/include/sanitizer/linux_syscall_hooks.h:void
__sanitizer_syscall_post_impl_perf_event_open(long res, long attr_uptr,


The manual page recommends syscall().
syscall errors don't contain much information, but it's better than
nothing, so I'll add something.
No, will fix.

Avi Kivity

<avi@scylladb.com>
unread,
Sep 15, 2021, 3:50:24 AM9/15/21
to seastar-dev@googlegroups.com
We've observed CPU stalls in TCP processing[1] and when paging in
shard memory[2]. To allow investigation of exactly what is happening
in this and similar situations, extend the stall detector to report
kernel stacks as well as user-space stacks.

This works by using the Linux perf_event subsystem (when available) and
asking it to capture a kernel stack. When we do, we print it out.

Example output:

Reactor stalled for 1176 ms on shard 0. Backtrace: 0x4969a4 0x463a88 0x463f7b 0x4640f7 0x7fa06b8aaa1f 0xfafb5 0x4282dd 0x42a5f3 0x426e7c
kernel callstack: 0xffffffffffffff80 0xffffffff9569a327 0xffffffff952ccf56 0xffffffff952cf882 0xffffffff952d21e9 0xffffffff952d3fc6 0xffffffff952f1e80 0xffffffff952b2261 0xffffffff952b2bf5 0xffffffff952a780c 0xffffffff952aa0c6 0xffffffff952aa181 0xffffffff952930ad 0xffffffff95c05c10 0xffffffff95e0007c

This was generated by a mmap() call that was intentionally made to
stall, and the kernel trace (when decoded) indeed points at mmap
internals.


https://github.com/avikivity/seastar stall-kernel-stack-signal

v2:
- pass stall detector configuration to the perf_event stall detector
- report error message if perf_event_open fails, and explain what
we're falling back to

Avi Kivity (3):
cpu_stall_detector: make abstract
cpu_stall_detector: add perf_event stall detector
stall_detector: record and print kernel stack trace

src/core/stall_detector.hh | 44 +++++++++-
src/core/reactor.cc | 176 ++++++++++++++++++++++++++++++++++---
2 files changed, 204 insertions(+), 16 deletions(-)

--
2.31.1

Avi Kivity

<avi@scylladb.com>
unread,
Sep 15, 2021, 3:50:25 AM9/15/21
to seastar-dev@googlegroups.com
Split cpu_stall_detector into a base class with the common logic and a
derived class with the mechanism to detect the stall (i.e. the timer).

This prepares the was for a stall detector that uses Linux perf_event,
when available, which can also collect kernel traces.
---
src/core/stall_detector.hh | 24 ++++++++++++++++++++----
src/core/reactor.cc | 32 ++++++++++++++++++++------------
2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/src/core/stall_detector.hh b/src/core/stall_detector.hh
index f0b066ed9c..c2f555e8ff 100644
--- a/src/core/stall_detector.hh
+++ b/src/core/stall_detector.hh
@@ -24,10 +24,11 @@

#include <signal.h>
#include <limits>
#include <chrono>
#include <functional>
+#include <memory>
#include <seastar/core/posix.hh>
#include <seastar/core/metrics_registration.hh>
#include <seastar/core/scheduling.hh>

namespace seastar {
@@ -45,11 +46,11 @@ struct cpu_stall_detector_config {
std::function<void ()> report; // alternative reporting function for tests
};

// Detects stalls in continuations that run for too long
class cpu_stall_detector {
- timer_t _timer;
+protected:
std::atomic<uint64_t> _last_tasks_processed_seen{};
unsigned _stall_detector_reports_per_minute;
std::atomic<uint64_t> _stall_detector_missed_ticks = { 0 };
unsigned _reported = 0;
unsigned _total_reported = 0;
@@ -63,27 +64,42 @@ class cpu_stall_detector {
sched_clock::duration _threshold;
sched_clock::duration _slack;
cpu_stall_detector_config _config;
seastar::metrics::metric_groups _metrics;
friend reactor;
+ virtual bool reap_event_and_check_spuriousness() {
+ return false;
+ }
private:
void maybe_report();
- void arm_timer();
+ virtual void arm_timer() = 0;
void report_suppressions(sched_clock::time_point now);
public:
using clock_type = thread_cputime_clock;
public:
explicit cpu_stall_detector(cpu_stall_detector_config cfg = {});
- ~cpu_stall_detector();
+ virtual ~cpu_stall_detector() = default;
static int signal_number() { return SIGRTMIN + 1; }
void start_task_run(sched_clock::time_point now);
void end_task_run(sched_clock::time_point now);
void generate_trace();
void update_config(cpu_stall_detector_config cfg);
cpu_stall_detector_config get_config() const;
void on_signal();
- void start_sleep();
+ virtual void start_sleep() = 0;
void end_sleep();
};

+class cpu_stall_detector_posix_timer : public cpu_stall_detector {
+ timer_t _timer;
+public:
+ explicit cpu_stall_detector_posix_timer(cpu_stall_detector_config cfg = {});
+ virtual ~cpu_stall_detector_posix_timer() override;
+private:
+ virtual void arm_timer() override;
+ virtual void start_sleep() override;
+};
+
+std::unique_ptr<cpu_stall_detector> make_cpu_stall_detector(cpu_stall_detector_config cfg = {});
+
}
}
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
index 2b6bd206b6..ad967b7ac9 100644
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
+ return std::make_unique<cpu_stall_detector_posix_timer>(cfg);
+}
+
void
reactor::update_blocked_reactor_notify_ms(std::chrono::milliseconds ms) {

Avi Kivity

<avi@scylladb.com>
unread,
Sep 15, 2021, 3:50:26 AM9/15/21
to seastar-dev@googlegroups.com
The Linux perf_event subsystem can collect kernel traces, for when
the stall happens in the kernel, so add a new stall detector and
use it preferentially when available (it might not be available due
to settings of perf_event_paranoid.

The ioctl that resets the count does not appear to work 100%
reliably, so we detect false positives and ignore them.

Note we don't yet report kernel traces; this will come in the next
patch.
---
src/core/stall_detector.hh | 12 +++++
src/core/reactor.cc | 96 +++++++++++++++++++++++++++++++++++++-
2 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/src/core/stall_detector.hh b/src/core/stall_detector.hh
index c2f555e8ff..7642f19d35 100644
--- a/src/core/stall_detector.hh
+++ b/src/core/stall_detector.hh
@@ -97,9 +97,21 @@ class cpu_stall_detector_posix_timer : public cpu_stall_detector {
private:
virtual void arm_timer() override;
virtual void start_sleep() override;
};

+class cpu_stall_detector_linux_perf_event : public cpu_stall_detector {
+ file_desc _fd;
+ bool _enabled = false;
+ uint64_t _current_period = 0;
+public:
+ static std::unique_ptr<cpu_stall_detector_linux_perf_event> try_make(cpu_stall_detector_config cfg = {});
+ explicit cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg = {});
+ virtual void arm_timer() override;
+ virtual void start_sleep() override;
+ virtual bool reap_event_and_check_spuriousness() override;
+};
+
std::unique_ptr<cpu_stall_detector> make_cpu_stall_detector(cpu_stall_detector_config cfg = {});

}
}
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
index ad967b7ac9..afc17bb036 100644
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -1171,14 +1175,104 @@ void cpu_stall_detector_posix_timer::start_sleep() {
}

void cpu_stall_detector::end_sleep() {
}

+static long
+perf_event_open(struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) {
+ return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+ throw std::system_error(errno, std::system_category(), "perf_event_open() failed");
+ }
+ auto desc = file_desc::from_fd(fd);
+ struct f_owner_ex sig_owner = {
+ .type = F_OWNER_TID,
+ .pid = gettid(),
+ };
+ auto ret1 = ::fcntl(fd, F_SETOWN_EX, &sig_owner);
+ if (ret1 == -1) {
+ abort();
+ }
+ auto ret2 = ::fcntl(fd, F_SETSIG, signal_number());
+ if (ret2 == -1) {
+ abort();
+ }
+ auto fd_flags = ::fcntl(fd, F_GETFL);
+ if (fd_flags == -1) {
+ abort();
+ }
+ auto ret3 = ::fcntl(fd, F_SETFL, fd_flags | O_ASYNC);
+ if (ret3 == -1) {
+ abort();
+ }
+ return std::make_unique<cpu_stall_detector_linux_perf_event>(std::move(desc), std::move(cfg));
+}
+

std::unique_ptr<cpu_stall_detector>
internal::make_cpu_stall_detector(cpu_stall_detector_config cfg) {
- return std::make_unique<cpu_stall_detector_posix_timer>(cfg);
+ try {
+ return cpu_stall_detector_linux_perf_event::try_make(cfg);
+ } catch (...) {
+ seastar_logger.warn("Creation of perf_event based stall detector failed, falling back to posix timer: {}", std::current_exception());
+ return std::make_unique<cpu_stall_detector_posix_timer>(cfg);
+ }
}

void
reactor::update_blocked_reactor_notify_ms(std::chrono::milliseconds ms) {
auto cfg = _cpu_stall_detector->get_config();
--
2.31.1

Avi Kivity

<avi@scylladb.com>
unread,
Sep 15, 2021, 3:50:27 AM9/15/21
to seastar-dev@googlegroups.com
Enable kernel stack collection. Note this can only be gathered via
the mmap interface, so mmap the ring buffer and use it.

I tested this against a large mmap(MAP_POPULATE) to force a kernel
stall, and it correctly reported the kernel trace. Note that it will
only report time when the thread is active (spin loops or real work).
If the task is preempted or asleep (say due to contention on a mutex)
this will not be reported here.
---
src/core/stall_detector.hh | 8 ++++++
src/core/reactor.cc | 50 ++++++++++++++++++++++++++++++++++++++
2 files changed, 58 insertions(+)

diff --git a/src/core/stall_detector.hh b/src/core/stall_detector.hh
index 7642f19d35..429992a37f 100644
--- a/src/core/stall_detector.hh
+++ b/src/core/stall_detector.hh
@@ -29,10 +29,12 @@
#include <memory>
#include <seastar/core/posix.hh>
#include <seastar/core/metrics_registration.hh>
#include <seastar/core/scheduling.hh>

+struct perf_event_mmap_page;
+
namespace seastar {

class reactor;
class thread_cputime_clock;

@@ -67,10 +69,11 @@ class cpu_stall_detector {
seastar::metrics::metric_groups _metrics;
friend reactor;
virtual bool reap_event_and_check_spuriousness() {
return false;
}
+ virtual void maybe_report_kernel_trace() {}
private:
void maybe_report();
virtual void arm_timer() = 0;
void report_suppressions(sched_clock::time_point now);
public:
@@ -101,16 +104,21 @@ class cpu_stall_detector_posix_timer : public cpu_stall_detector {

class cpu_stall_detector_linux_perf_event : public cpu_stall_detector {
file_desc _fd;
bool _enabled = false;
uint64_t _current_period = 0;
+ struct ::perf_event_mmap_page* _mmap;
+ char* _data_area;
+ size_t _data_area_mask;
public:
static std::unique_ptr<cpu_stall_detector_linux_perf_event> try_make(cpu_stall_detector_config cfg = {});
explicit cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg = {});
+ ~cpu_stall_detector_linux_perf_event();
virtual void arm_timer() override;
virtual void start_sleep() override;
virtual bool reap_event_and_check_spuriousness() override;
+ virtual void maybe_report_kernel_trace() override;
};

std::unique_ptr<cpu_stall_detector> make_cpu_stall_detector(cpu_stall_detector_config cfg = {});

}
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
index afc17bb036..ab5e941bce 100644
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -1182,10 +1182,21 @@ perf_event_open(struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_
return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}

cpu_stall_detector_linux_perf_event::cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg)
: cpu_stall_detector(cfg), _fd(std::move(fd)) {
+ void* ret = ::mmap(nullptr, 2*getpagesize(), PROT_READ|PROT_WRITE, MAP_SHARED, _fd.get(), 0);
+ if (ret == MAP_FAILED) {
+ abort();
+ }
+ _mmap = static_cast<struct ::perf_event_mmap_page*>(ret);
+ _data_area = reinterpret_cast<char*>(_mmap) + getpagesize();
+ _data_area_mask = getpagesize() - 1;
+}
+
+cpu_stall_detector_linux_perf_event::~cpu_stall_detector_linux_perf_event() {
+ ::munmap(_mmap, 2*getpagesize());
}

void
cpu_stall_detector_linux_perf_event::arm_timer() {
uint64_t ns = (_threshold * _report_at + _slack) / 1ns;
.config = PERF_COUNT_SW_TASK_CLOCK, // more likely to work on virtual machines than hardware events
.sample_period = 1'000'000'000, // Needs non-zero value or PERF_IOC_PERIOD gets confused
+ .sample_type = PERF_SAMPLE_CALLCHAIN,
.disabled = 1,
+ .exclude_callchain_user = 1, // we're using backtrace() to capture the user callchain
.wakeup_events = 1,
};
unsigned long flags = 0;
if (kernel_uname().whitelisted({"3.14"})) {
flags |= PERF_FLAG_FD_CLOEXEC;
@@ -1319,10 +1368,11 @@ cpu_stall_detector::generate_trace() {

Tomasz Grabiec

<tgrabiec@scylladb.com>
unread,
Sep 15, 2021, 9:13:25 AM9/15/21
to Avi Kivity, seastar-dev
Did you mean "before writing data_fail" ?
 
--
You received this message because you are subscribed to the Google Groups "seastar-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email to seastar-dev...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/seastar-dev/20210915075020.1295227-4-avi%40scylladb.com.

Avi Kivity

<avi@scylladb.com>
unread,
Sep 19, 2021, 1:15:43 PM9/19/21
to Tomasz Grabiec, seastar-dev

Yes. Force-pushed to same branch with this fixed (I kept "data_tail", just change the direction).


Avi Kivity

<avi@scylladb.com>
unread,
Sep 22, 2021, 8:41:31 AM9/22/21
to Tomasz Grabiec, seastar-dev

force-push ping


Commit Bot

<bot@cloudius-systems.com>
unread,
Sep 22, 2021, 9:41:51 AM9/22/21
to seastar-dev@googlegroups.com, Avi Kivity
From: Avi Kivity <a...@scylladb.com>
Committer: Avi Kivity <a...@scylladb.com>
Branch: master

cpu_stall_detector: make abstract

Split cpu_stall_detector into a base class with the common logic and a
derived class with the mechanism to detect the stall (i.e. the timer).

This prepares the was for a stall detector that uses Linux perf_event,
when available, which can also collect kernel traces.

---
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -925,7 +925,7 @@ reactor::reactor(std::shared_ptr<smp> smp, alien::instance& alien, unsigned id,
, _engine_thread(sched::thread::current())
#endif
, _cpu_started(0)
- , _cpu_stall_detector(std::make_unique<cpu_stall_detector>())
+ , _cpu_stall_detector(make_cpu_stall_detector())
, _reuseport(posix_reuseport_detect())
, _thread_pool(std::make_unique<thread_pool>(this, seastar::format("syscall-{}", id))) {
/*
@@ -1061,6 +1061,16 @@ cpu_stall_detector::cpu_stall_detector(cpu_stall_detector_config cfg)
// a safe place.
backtrace([] (frame) {});
update_config(cfg);
+
+ namespace sm = seastar::metrics;
+
+ _metrics.add_group("stall_detector", {
+ sm::make_derive("reported", _total_reported, sm::description("Total number of reported stalls, look in the traces for the exact reason"))});
+
+ // note: if something is added here that can, it should take care to destroy _timer.
+}
+
+cpu_stall_detector_posix_timer::cpu_stall_detector_posix_timer(cpu_stall_detector_config cfg) : cpu_stall_detector(cfg) {
struct sigevent sev = {};
sev.sigev_notify = SIGEV_THREAD_ID;
sev.sigev_signo = signal_number();
@@ -1069,17 +1079,9 @@ cpu_stall_detector::cpu_stall_detector(cpu_stall_detector_config cfg)
if (err) {
throw std::system_error(std::error_code(err, std::system_category()));
}
-
- namespace sm = seastar::metrics;
-
- _metrics.add_group("stall_detector", {
- sm::make_derive("reported", _total_reported, sm::description("Total number of reported stalls, look in the traces for the exact reason"))});
-
-
- // note: if something is added here that can, it should take care to destroy _timer.
}

-cpu_stall_detector::~cpu_stall_detector() {
+cpu_stall_detector_posix_timer::~cpu_stall_detector_posix_timer() {
timer_delete(_timer);
}

@@ -1140,7 +1142,7 @@ void cpu_stall_detector::report_suppressions(sched_clock::time_point now) {
}
}

-void cpu_stall_detector::arm_timer() {
+void cpu_stall_detector_posix_timer::arm_timer() {
auto its = posix::to_relative_itimerspec(_threshold * _report_at + _slack, 0s);
timer_settime(_timer, 0, &its, nullptr);
}
@@ -1162,7 +1164,7 @@ void cpu_stall_detector::end_task_run(sched_clock::time_point now) {
_last_tasks_processed_seen.store(0, std::memory_order_relaxed);
}

-void cpu_stall_detector::start_sleep() {
+void cpu_stall_detector_posix_timer::start_sleep() {
auto its = posix::to_relative_itimerspec(0s, 0s);
timer_settime(_timer, 0, &its, nullptr);
_rearm_timer_at = reactor::now();
@@ -1171,6 +1173,12 @@ void cpu_stall_detector::start_sleep() {
void cpu_stall_detector::end_sleep() {
}

+
+std::unique_ptr<cpu_stall_detector>
+internal::make_cpu_stall_detector(cpu_stall_detector_config cfg) {
+ return std::make_unique<cpu_stall_detector_posix_timer>(cfg);
+}
+
void
reactor::update_blocked_reactor_notify_ms(std::chrono::milliseconds ms) {
auto cfg = _cpu_stall_detector->get_config();
diff --git a/src/core/stall_detector.hh b/src/core/stall_detector.hh
--- a/src/core/stall_detector.hh
+++ b/src/core/stall_detector.hh
@@ -26,6 +26,7 @@
#include <limits>
#include <chrono>
#include <functional>
+#include <memory>
#include <seastar/core/posix.hh>
#include <seastar/core/metrics_registration.hh>
#include <seastar/core/scheduling.hh>
@@ -47,7 +48,7 @@ struct cpu_stall_detector_config {

// Detects stalls in continuations that run for too long
class cpu_stall_detector {
- timer_t _timer;
+protected:
std::atomic<uint64_t> _last_tasks_processed_seen{};
unsigned _stall_detector_reports_per_minute;
std::atomic<uint64_t> _stall_detector_missed_ticks = { 0 };
@@ -65,25 +66,40 @@ class cpu_stall_detector {
cpu_stall_detector_config _config;
seastar::metrics::metric_groups _metrics;
friend reactor;
+ virtual bool reap_event_and_check_spuriousness() {
+ return false;
+ }
private:
void maybe_report();
- void arm_timer();
+ virtual void arm_timer() = 0;
void report_suppressions(sched_clock::time_point now);
public:
using clock_type = thread_cputime_clock;
public:
explicit cpu_stall_detector(cpu_stall_detector_config cfg = {});
- ~cpu_stall_detector();
+ virtual ~cpu_stall_detector() = default;
static int signal_number() { return SIGRTMIN + 1; }
void start_task_run(sched_clock::time_point now);
void end_task_run(sched_clock::time_point now);
void generate_trace();
void update_config(cpu_stall_detector_config cfg);
cpu_stall_detector_config get_config() const;
void on_signal();
- void start_sleep();
+ virtual void start_sleep() = 0;
void end_sleep();
};

+class cpu_stall_detector_posix_timer : public cpu_stall_detector {
+ timer_t _timer;
+public:
+ explicit cpu_stall_detector_posix_timer(cpu_stall_detector_config cfg = {});
+ virtual ~cpu_stall_detector_posix_timer() override;
+private:
+ virtual void arm_timer() override;
+ virtual void start_sleep() override;
+};
+
+std::unique_ptr<cpu_stall_detector> make_cpu_stall_detector(cpu_stall_detector_config cfg = {});
+
}
}

Commit Bot

<bot@cloudius-systems.com>
unread,
Sep 22, 2021, 9:41:53 AM9/22/21
to seastar-dev@googlegroups.com, Avi Kivity
From: Avi Kivity <a...@scylladb.com>
Committer: Avi Kivity <a...@scylladb.com>
Branch: master

cpu_stall_detector: add perf_event stall detector

The Linux perf_event subsystem can collect kernel traces, for when
the stall happens in the kernel, so add a new stall detector and
use it preferentially when available (it might not be available due
to settings of perf_event_paranoid.

The ioctl that resets the count does not appear to work 100%
reliably, so we detect false positives and ignore them.

Note we don't yet report kernel traces; this will come in the next
patch.

---
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -83,6 +83,7 @@
#include <dirent.h>
#include <linux/types.h> // for xfs, below
#include <sys/ioctl.h>
+#include <linux/perf_event.h>
#include <xfs/linux.h>
#define min min /* prevent xfs.h from defining min() as a macro */
#include <xfs/xfs.h>
@@ -1110,6 +1111,9 @@ void cpu_stall_detector::maybe_report() {
//
// We can do it a cheaper if we don't report suppressed backtraces.
void cpu_stall_detector::on_signal() {
+ if (reap_event_and_check_spuriousness()) {
+ return;
+ }
auto tasks_processed = engine().tasks_processed();
auto last_seen = _last_tasks_processed_seen.load(std::memory_order_relaxed);
if (!last_seen) {
@@ -1173,10 +1177,100 @@ void cpu_stall_detector_posix_timer::start_sleep() {
+ return std::make_unique<cpu_stall_detector_posix_timer>(cfg);
+ }
}

void
diff --git a/src/core/stall_detector.hh b/src/core/stall_detector.hh
--- a/src/core/stall_detector.hh
+++ b/src/core/stall_detector.hh
@@ -99,6 +99,18 @@ private:
virtual void start_sleep() override;
};

+class cpu_stall_detector_linux_perf_event : public cpu_stall_detector {
+ file_desc _fd;
+ bool _enabled = false;
+ uint64_t _current_period = 0;
+public:
+ static std::unique_ptr<cpu_stall_detector_linux_perf_event> try_make(cpu_stall_detector_config cfg = {});
+ explicit cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg = {});
+ virtual void arm_timer() override;
+ virtual void start_sleep() override;
+ virtual bool reap_event_and_check_spuriousness() override;
+};
+

Commit Bot

<bot@cloudius-systems.com>
unread,
Sep 22, 2021, 9:41:53 AM9/22/21
to seastar-dev@googlegroups.com, Avi Kivity
From: Avi Kivity <a...@scylladb.com>
Committer: Avi Kivity <a...@scylladb.com>
Branch: master

stall_detector: record and print kernel stack trace

Enable kernel stack collection. Note this can only be gathered via
the mmap interface, so mmap the ring buffer and use it.

I tested this against a large mmap(MAP_POPULATE) to force a kernel
stall, and it correctly reported the kernel trace. Note that it will
only report time when the thread is active (spin loops or real work).
If the task is preempted or asleep (say due to contention on a mutex)
this will not be reported here.

---
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -1184,6 +1184,17 @@ perf_event_open(struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_

cpu_stall_detector_linux_perf_event::cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg)
: cpu_stall_detector(cfg), _fd(std::move(fd)) {
+ void* ret = ::mmap(nullptr, 2*getpagesize(), PROT_READ|PROT_WRITE, MAP_SHARED, _fd.get(), 0);
+ if (ret == MAP_FAILED) {
+ abort();
+ }
+ _mmap = static_cast<struct ::perf_event_mmap_page*>(ret);
+ _data_area = reinterpret_cast<char*>(_mmap) + getpagesize();
+ _data_area_mask = getpagesize() - 1;
+}
+
+cpu_stall_detector_linux_perf_event::~cpu_stall_detector_linux_perf_event() {
+ ::munmap(_mmap, 2*getpagesize());
}

void
@@ -1220,14 +1231,52 @@ cpu_stall_detector_linux_perf_event::reap_event_and_check_spuriousness() {
+ std::atomic_thread_fence(std::memory_order_release); // not documented, but probably required before writing data_tail
+ _mmap->data_tail = tail;
+}
+
std::unique_ptr<cpu_stall_detector_linux_perf_event>
cpu_stall_detector_linux_perf_event::try_make(cpu_stall_detector_config cfg) {
::perf_event_attr pea = {
.type = PERF_TYPE_SOFTWARE,
.size = sizeof(pea),
.config = PERF_COUNT_SW_TASK_CLOCK, // more likely to work on virtual machines than hardware events
.sample_period = 1'000'000'000, // Needs non-zero value or PERF_IOC_PERIOD gets confused
+ .sample_type = PERF_SAMPLE_CALLCHAIN,
.disabled = 1,
+ .exclude_callchain_user = 1, // we're using backtrace() to capture the user callchain
.wakeup_events = 1,
};
unsigned long flags = 0;
@@ -1321,6 +1370,7 @@ cpu_stall_detector::generate_trace() {
buf.append_decimal(uint64_t(delta / 1ms));
buf.append(" ms");
print_with_backtrace(buf, _config.oneline);
+ maybe_report_kernel_trace();
}

template <typename T, typename E, typename EnableFunc>
diff --git a/src/core/stall_detector.hh b/src/core/stall_detector.hh
--- a/src/core/stall_detector.hh
+++ b/src/core/stall_detector.hh
@@ -31,6 +31,8 @@
#include <seastar/core/metrics_registration.hh>
#include <seastar/core/scheduling.hh>

+struct perf_event_mmap_page;
+
namespace seastar {

class reactor;
@@ -69,6 +71,7 @@ protected:
virtual bool reap_event_and_check_spuriousness() {
return false;
}
+ virtual void maybe_report_kernel_trace() {}
private:
void maybe_report();
virtual void arm_timer() = 0;
@@ -103,12 +106,17 @@ class cpu_stall_detector_linux_perf_event : public cpu_stall_detector {
file_desc _fd;
bool _enabled = false;
uint64_t _current_period = 0;
+ struct ::perf_event_mmap_page* _mmap;
+ char* _data_area;
+ size_t _data_area_mask;
public:
static std::unique_ptr<cpu_stall_detector_linux_perf_event> try_make(cpu_stall_detector_config cfg = {});
explicit cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg = {});
+ ~cpu_stall_detector_linux_perf_event();
virtual void arm_timer() override;
virtual void start_sleep() override;
virtual bool reap_event_and_check_spuriousness() override;
+ virtual void maybe_report_kernel_trace() override;
};
Reply all
Reply to author
Forward
0 new messages