Enable kernel stack collection. Note this can only be gathered via
the mmap interface, so mmap the ring buffer and use it.
I tested this against a large mmap(MAP_POPULATE) to force a kernel
stall, and it correctly reported the kernel trace. Note that it will
only report time when the thread is active (spin loops or real work).
If the task is preempted or asleep (say due to contention on a mutex)
this will not be reported here.
---
src/core/stall_detector.hh | 8 ++++++
src/core/reactor.cc | 50 ++++++++++++++++++++++++++++++++++++++
2 files changed, 58 insertions(+)
diff --git a/src/core/stall_detector.hh b/src/core/stall_detector.hh
index 7642f19d35..429992a37f 100644
--- a/src/core/stall_detector.hh
+++ b/src/core/stall_detector.hh
@@ -29,10 +29,12 @@
#include <memory>
#include <seastar/core/posix.hh>
#include <seastar/core/metrics_registration.hh>
#include <seastar/core/scheduling.hh>
+struct perf_event_mmap_page;
+
namespace seastar {
class reactor;
class thread_cputime_clock;
@@ -67,10 +69,11 @@ class cpu_stall_detector {
seastar::metrics::metric_groups _metrics;
friend reactor;
virtual bool reap_event_and_check_spuriousness() {
return false;
}
+ virtual void maybe_report_kernel_trace() {}
private:
void maybe_report();
virtual void arm_timer() = 0;
void report_suppressions(sched_clock::time_point now);
public:
@@ -101,16 +104,21 @@ class cpu_stall_detector_posix_timer : public cpu_stall_detector {
class cpu_stall_detector_linux_perf_event : public cpu_stall_detector {
file_desc _fd;
bool _enabled = false;
uint64_t _current_period = 0;
+ struct ::perf_event_mmap_page* _mmap;
+ char* _data_area;
+ size_t _data_area_mask;
public:
static std::unique_ptr<cpu_stall_detector_linux_perf_event> try_make(cpu_stall_detector_config cfg = {});
explicit cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg = {});
+ ~cpu_stall_detector_linux_perf_event();
virtual void arm_timer() override;
virtual void start_sleep() override;
virtual bool reap_event_and_check_spuriousness() override;
+ virtual void maybe_report_kernel_trace() override;
};
std::unique_ptr<cpu_stall_detector> make_cpu_stall_detector(cpu_stall_detector_config cfg = {});
}
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
index 0e7d3bc519..e5252bd181 100644
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -1182,10 +1182,21 @@ perf_event_open(struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_
return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}
cpu_stall_detector_linux_perf_event::cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg)
: cpu_stall_detector(cfg), _fd(std::move(fd)) {
+ void* ret = ::mmap(nullptr, 2*getpagesize(), PROT_READ|PROT_WRITE, MAP_SHARED, _fd.get(), 0);
+ if (ret == MAP_FAILED) {
+ abort();
+ }
+ _mmap = static_cast<struct ::perf_event_mmap_page*>(ret);
+ _data_area = reinterpret_cast<char*>(_mmap) + getpagesize();
+ _data_area_mask = getpagesize() - 1;
+}
+
+cpu_stall_detector_linux_perf_event::~cpu_stall_detector_linux_perf_event() {
+ ::munmap(_mmap, 2*getpagesize());
}
void
cpu_stall_detector_linux_perf_event::arm_timer() {
uint64_t ns = (_threshold * _report_at + _slack) / 1ns;
@@ -1218,18 +1229,56 @@ cpu_stall_detector_linux_perf_event::reap_event_and_check_spuriousness() {
} buf;
_fd.read(&buf, sizeof(read_format));
return buf.value < _current_period;
}
+void
+cpu_stall_detector_linux_perf_event::maybe_report_kernel_trace() {
+ auto tail = _mmap->data_tail;
+ auto head = _mmap->data_head;
+ std::atomic_thread_fence(std::memory_order_acquire); // required after reading data_head
+ auto current_record = [&] () -> ::perf_event_header* {
+ return reinterpret_cast<::perf_event_header*>(_data_area + (tail & _data_area_mask));
+ };
+
+ // Skip all but last record
+ while (tail + current_record()->size != head) {
+ tail += current_record()->size;
+ }
+
+ auto last_record = current_record();
+
+ if (last_record->type == PERF_RECORD_SAMPLE) {
+ struct sample_record : perf_event_header {
+ uint64_t nr;
+ uint64_t ips[];
+ };
+ auto sample = static_cast<sample_record*>(last_record);
+ backtrace_buffer buf;
+ buf.append("kernel callstack:");
+ for (uint64_t i = 0; i < sample->nr; ++i) {
+ buf.append(" 0x");
+ buf.append_hex(uintptr_t(sample->ips[i]));
+ }
+ buf.append("\n");
+ buf.flush();
+ };
+
+ std::atomic_thread_fence(std::memory_order_release); // not documented, but probably required before reading data_tail
+ _mmap->data_tail = tail;
+}
+
std::unique_ptr<cpu_stall_detector_linux_perf_event>
cpu_stall_detector_linux_perf_event::try_make(cpu_stall_detector_config cfg) {
::perf_event_attr pea = {
.type = PERF_TYPE_SOFTWARE,
.size = sizeof(pea),
.config = PERF_COUNT_SW_TASK_CLOCK, // more likely to work on virtual machines than hardware events
.sample_period = 1'000'000'000, // Needs non-zero value or PERF_IOC_PERIOD gets confused
+ .sample_type = PERF_SAMPLE_CALLCHAIN,
.disabled = 1,
+ .exclude_callchain_user = 1, // we're using backtrace() to capture the user callchain
.wakeup_events = 1,
};
unsigned long flags = 0;
if (kernel_uname().whitelisted({"3.14"})) {
flags |= PERF_FLAG_FD_CLOEXEC;
@@ -1318,10 +1367,11 @@ cpu_stall_detector::generate_trace() {
backtrace_buffer buf;
buf.append("Reactor stalled for ");
buf.append_decimal(uint64_t(delta / 1ms));
buf.append(" ms");
print_with_backtrace(buf, _config.oneline);
+ maybe_report_kernel_trace();
}
template <typename T, typename E, typename EnableFunc>
void reactor::complete_timers(T& timers, E& expired_timers, EnableFunc&& enable_fn) noexcept(noexcept(enable_fn())) {
expired_timers = timers.expire(timers.now());
--
2.31.1