[PATCH seastar v1] core/memory: Introduce heap profiler

Tomasz Grabiec

<tgrabiec@scylladb.com>

unread,

Sep 30, 2016, 9:06:11 AM (9/30/16)

to seastar-dev@googlegroups.com

Records allocation sites and sizes so that it's later possible to see
what kind of objects are taking up space on the heap.

Adds 8 bytes of memory overhead per live object when compiled-in.

Not compiled in by default. To compile it in, configure with -DSEASTAR_HEAPPROF:

./configure.py --cflags=-DSEASTAR_HEAPPROF

To enable recording of allocation sites, start the seastar application with
the --heapprof flag:

build/release/memcached --heapprof

It is possible to enable recording at run time by calling
memory::set_heap_profiling_enabled() on each thread, for instance from a
RESTful API handler.

scylla-gdb.py support will follow, with text-mode tree presentation
and dumping to flamegraphs.

Flamegraph example:

https://cloud.githubusercontent.com/assets/283695/18888026/f69fc4e6-84f6-11e6-9b7b-305667d30f52.png

Text example:

(gdb) scylla heapprof -r
All (275954028, #12783)
|-- void* memory::cpu_pages::allocate_large_and_trim<memory::cpu_pages::allocate_large_aligned(unsigned int, unsigned int)::{lambda(unsigned int, unsigned int)#1}>(unsigned int, memory::cpu_pages::allocate_large_aligned(unsigned int, unsigned int)::{lambda(unsigned int, unsigned int)#1}) + 169 (268959744, #5)
| memory::allocate_large_aligned(unsigned long, unsigned long) + 87
| |-- logalloc::segment_zone::segment_zone() + 291 (268435456, #1)
| | logalloc::segment_pool::allocate_segment() + 413
| | logalloc::segment_pool::segment_pool() + 296
| | __tls_init.part.787 + 72
| | logalloc::region_group::release_requests() + 1333
| | logalloc::region_group::add(logalloc::region_group*) + 514
| | database::database(db::config const&) + 4246
| | (...)
| |
| \-- memory::allocate_aligned(unsigned long, unsigned long) + 13 (524288, #4)
| |-- memalign + 9 (262144, #2)
| | db::commitlog::segment_manager::acquire_buffer(unsigned long) + 90
| | db::commitlog::segment::new_buffer(unsigned long) + 113
| | db::commitlog::segment::allocate(utils::UUID const&, shared_ptr<db::commitlog::entry_writer>) + 2347
| | |-- db::commitlog::add_entry(utils::UUID const&, commitlog_entry_writer const&) + 1620 (131072, #1)
| | | database::do_apply(lw_shared_ptr<schema const>, frozen_mutation const&) + 182
| | | database::apply(lw_shared_ptr<schema const>, frozen_mutation const&) + 235
| | | service::storage_proxy::mutate_locally(lw_shared_ptr<schema const> const&, frozen_mutation const&) + 434
---
core/memory.hh | 2 +
core/memory.cc | 228 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
core/reactor.cc | 9 ++-
3 files changed, 234 insertions(+), 5 deletions(-)

diff --git a/core/memory.hh b/core/memory.hh
index e4796ee..f505e19 100644
--- a/core/memory.hh
+++ b/core/memory.hh
@@ -62,6 +62,8 @@ void configure(std::vector<resource::memory> m,

void enable_abort_on_allocation_failure();

+void set_heap_profiling_enabled(bool);
+
void* allocate_reclaimable(size_t size);

enum class reclaiming_result {
diff --git a/core/memory.cc b/core/memory.cc
index 8260ff8..25b9ab6 100644
--- a/core/memory.cc
+++ b/core/memory.cc
@@ -71,12 +71,49 @@
#include <cstring>
#include <boost/intrusive/list.hpp>
#include <sys/mman.h>
+#include "util/defer.hh"
+#include "util/backtrace.hh"
+
#ifdef HAVE_NUMA
#include <numaif.h>
#endif

+struct allocation_site {
+ size_t count = 0; // number of live objects allocated at backtrace.
+ size_t size = 0; // amount of bytes in live objects allocated at backtrace.
+ allocation_site* next = nullptr;
+ std::vector<uintptr_t> backtrace;
+
+ bool operator==(const allocation_site& o) const {
+ return backtrace == o.backtrace;
+ }
+
+ bool operator!=(const allocation_site& o) const {
+ return !(*this == o);
+ }
+};
+
+namespace std {
+
+template<>
+struct hash<::allocation_site> {
+ size_t operator()(const ::allocation_site& bi) const {
+ size_t h = 0;
+ for (auto addr : bi.backtrace) {
+ h = ((h << 5) - h) ^ addr;
+ }
+ return h;
+ }
+};
+
+}
+
+using allocation_site_ptr = allocation_site*;
+
namespace memory {

+static allocation_site_ptr get_allocation_site() __attribute__((unused));
+
static std::atomic<bool> abort_on_allocation_failure{false};

void enable_abort_on_allocation_failure() {
@@ -145,6 +182,14 @@ static char* mem_base() {
return known;
}

+constexpr bool is_page_aligned(size_t size) {
+ return (size & (page_size - 1)) == 0;
+}
+
+constexpr size_t next_page_aligned(size_t size) {
+ return (size + (page_size - 1)) & ~(page_size - 1);
+}
+
class small_pool;

struct free_object {
@@ -159,6 +204,9 @@ struct page {
page_list_link link;
small_pool* pool; // if used in a small_pool
free_object* freelist;
+#ifdef SEASTAR_HEAPPROF
+ allocation_site_ptr alloc_site; // for objects whose size is multiple of page size, valid for head only
+#endif
};

class page_list {
@@ -229,8 +277,10 @@ class small_pool {
void* allocate();
void deallocate(void* object);
unsigned object_size() const { return _object_size; }
+ bool objects_page_aligned() const { return is_page_aligned(_object_size); }
static constexpr unsigned size_to_idx(unsigned size);
static constexpr unsigned idx_to_size(unsigned idx);
+ allocation_site_ptr& alloc_site_holder(void* ptr);
private:
void add_more_objects();
void trim_free_list();
@@ -283,6 +333,31 @@ class small_pool_array {
static constexpr size_t max_small_allocation
= small_pool::idx_to_size(small_pool_array::nr_small_pools - 1);

+constexpr size_t object_size_with_alloc_site(size_t size) {
+#ifdef SEASTAR_HEAPPROF
+ // For page-aligned sizes, allocation_site* lives in page::alloc_site, not with the object.
+ static_assert(is_page_aligned(max_small_allocation), "assuming that max_small_allocation is page aligned so that we"
+ " don't need to add allocation_site_ptr to objects of size close to it");
+ size_t next_page_aligned_size = next_page_aligned(size);
+ if (next_page_aligned_size - size > sizeof(allocation_site_ptr)) {
+ size += sizeof(allocation_site_ptr);
+ } else {
+ return next_page_aligned_size;
+ }
+#endif
+ return size;
+}
+
+#ifdef SEASTAR_HEAPPROF
+// Ensure that object_size_with_alloc_site() does not exceed max_small_allocation
+static_assert(object_size_with_alloc_site(max_small_allocation) == max_small_allocation, "");
+static_assert(object_size_with_alloc_site(max_small_allocation - 1) == max_small_allocation, "");
+static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) + 1) == max_small_allocation, "");
+static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr)) == max_small_allocation, "");
+static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) - 1) == max_small_allocation - 1, "");
+static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) - 2) == max_small_allocation - 2, "");
+#endif
+
struct cross_cpu_free_item {
cross_cpu_free_item* next;
};
@@ -314,6 +389,18 @@ struct cpu_pages {
alignas(cache_line_size) std::vector<physical_address> virt_to_phys_map;
static std::atomic<unsigned> cpu_id_gen;
static cpu_pages* all_cpus[max_cpus];
+ union asu {
+ using alloc_sites_type = std::unordered_set<std::unique_ptr<allocation_site>,
+ indirect_hash<std::unique_ptr<allocation_site>>,
+ indirect_equal_to<std::unique_ptr<allocation_site>>>;
+ asu() {
+ new (&alloc_sites) alloc_sites_type();
+ }
+ ~asu() {} // alloc_sites live forever
+ alloc_sites_type alloc_sites;
+ } asu;
+ allocation_site* alloc_site_list_head = nullptr; // For easy traversal of asu.alloc_sites from scylla-gdb.py
+ bool collect_backtrace = false;
char* mem() { return memory; }

void link(page_list& list, page* span);
@@ -362,6 +449,20 @@ static thread_local cpu_pages cpu_mem;
std::atomic<unsigned> cpu_pages::cpu_id_gen;
cpu_pages* cpu_pages::all_cpus[max_cpus];

+void set_heap_profiling_enabled(bool enable) {
+ bool is_enabled = cpu_mem.collect_backtrace;
+ if (enable) {
+ if (!is_enabled) {
+ seastar_logger.info("Enabling heap profiler");
+ }
+ } else {
+ if (is_enabled) {
+ seastar_logger.info("Disabling heap profiler");
+ }
+ }
+ cpu_mem.collect_backtrace = enable;
+}
+
// Free spans are store in the largest index i such that nr_pages >= 1 << i.
static inline
unsigned index_of(unsigned pages) {
@@ -492,6 +593,14 @@ cpu_pages::allocate_large_and_trim(unsigned n_pages, Trimmer trimmer) {
span->free = span_end->free = false;
span->span_size = span_end->span_size = t.nr_pages;
span->pool = nullptr;
+#ifdef SEASTAR_HEAPPROF
+ auto alloc_site = get_allocation_site();
+ span->alloc_site = alloc_site;
+ if (alloc_site) {
+ ++alloc_site->count;
+ alloc_site->size += span->span_size * page_size;
+ }
+#endif
if (nr_free_pages < current_min_free_pages) {
drain_cross_cpu_freelist();
run_reclaimers(reclaimer_scope::sync);
@@ -516,17 +625,92 @@ cpu_pages::allocate_large_aligned(unsigned align_pages, unsigned n_pages) {
});
}

+#ifdef SEASTAR_HEAPPROF
+
+class disable_backtrace_temporarily {
+ bool _old;
+public:
+ disable_backtrace_temporarily() {
+ _old = cpu_mem.collect_backtrace;
+ cpu_mem.collect_backtrace = false;
+ }
+ ~disable_backtrace_temporarily() {
+ cpu_mem.collect_backtrace = _old;
+ }
+};
+
+#else
+
+struct disable_backtrace_temporarily {};
+
+#endif
+
+static
+std::vector<uintptr_t> get_backtrace() noexcept {
+ disable_backtrace_temporarily dbt;
+ std::vector<uintptr_t> result;
+ backtrace([&result] (uintptr_t addr) {
+ result.push_back(addr);
+ });
+ return result;
+}
+
+static
+allocation_site_ptr get_allocation_site() {
+ if (!cpu_mem.is_initialized() || !cpu_mem.collect_backtrace) {
+ return nullptr;
+ }
+ disable_backtrace_temporarily dbt;
+ auto new_alloc_site = std::make_unique<allocation_site>();
+ new_alloc_site->backtrace = get_backtrace();
+ auto insert_result = cpu_mem.asu.alloc_sites.insert(std::move(new_alloc_site));
+ allocation_site_ptr alloc_site = insert_result.first->get();
+ if (insert_result.second) {
+ alloc_site->next = cpu_mem.alloc_site_list_head;
+ cpu_mem.alloc_site_list_head = alloc_site;
+ }
+ return alloc_site;
+}
+
+allocation_site_ptr&
+small_pool::alloc_site_holder(void* ptr) {
+ if (objects_page_aligned()) {
+ return cpu_mem.to_page(ptr)->alloc_site;
+ } else {
+ return *reinterpret_cast<allocation_site_ptr*>(reinterpret_cast<char*>(ptr) + _object_size - sizeof(allocation_site_ptr));
+ }
+}
+
void*
cpu_pages::allocate_small(unsigned size) {
auto idx = small_pool::size_to_idx(size);
auto& pool = small_pools[idx];
assert(size <= pool.object_size());
- return pool.allocate();
+ auto ptr = pool.allocate();
+#ifdef SEASTAR_HEAPPROF
+ if (!ptr) {
+ return nullptr;
+ }
+ allocation_site* alloc_site = get_allocation_site();
+ if (alloc_site) {
+ ++alloc_site->count;
+ alloc_site->size += pool.object_size();
+ }
+ pool.alloc_site_holder(ptr) = alloc_site;
+#endif
+ return ptr;
}

void cpu_pages::free_large(void* ptr) {
pageidx idx = (reinterpret_cast<char*>(ptr) - mem()) / page_size;
page* span = &pages[idx];
+#ifdef SEASTAR_HEAPPROF
+ auto alloc_site = span->alloc_site;
+ if (alloc_site) {
+ --alloc_site->count;
+ alloc_site->size -= span->span_size * page_size;
+ }
+#endif
free_span(idx, span->span_size);
}

@@ -534,7 +718,14 @@ size_t cpu_pages::object_size(void* ptr) {
pageidx idx = (reinterpret_cast<char*>(ptr) - mem()) / page_size;
page* span = &pages[idx];
if (span->pool) {
- return span->pool->object_size();
+ auto s = span->pool->object_size();
+#ifdef SEASTAR_HEAPPROF
+ // We must not allow the object to be extended onto the allocation_site_ptr field.
+ if (!span->pool->objects_page_aligned()) {
+ s -= sizeof(allocation_site_ptr);
+ }
+#endif
+ return s;
} else {
return size_t(span->span_size) * page_size;
}
@@ -572,7 +763,15 @@ bool cpu_pages::drain_cross_cpu_freelist() {
void cpu_pages::free(void* ptr) {
page* span = to_page(ptr);
if (span->pool) {
- span->pool->deallocate(ptr);
+ small_pool& pool = *span->pool;
+#ifdef SEASTAR_HEAPPROF
+ allocation_site* alloc_site = pool.alloc_site_holder(ptr);
+ if (alloc_site) {
+ --alloc_site->count;
+ alloc_site->size -= pool.object_size();
+ }
+#endif
+ pool.deallocate(ptr);
} else {
free_large(ptr);
}
@@ -584,7 +783,15 @@ void cpu_pages::free(void* ptr, size_t size) {
size = sizeof(free_object);
}
if (size <= max_small_allocation) {
+ size = object_size_with_alloc_site(size);
auto pool = &small_pools[small_pool::size_to_idx(size)];
+#ifdef SEASTAR_HEAPPROF
+ allocation_site* alloc_site = pool->alloc_site_holder(ptr);
+ if (alloc_site) {
+ --alloc_site->count;
+ alloc_site->size -= pool->object_size();
+ }
+#endif
pool->deallocate(ptr);
} else {
free_large(ptr);
@@ -614,6 +821,13 @@ void cpu_pages::shrink(void* ptr, size_t new_size) {
if (new_size_pages == old_size_pages) {
return;
}
+#ifdef SEASTAR_HEAPPROF
+ auto alloc_site = span->alloc_site;
+ if (alloc_site) {
+ alloc_site->size -= span->span_size * page_size;
+ alloc_site->size += new_size_pages * page_size;
+ }
+#endif
span->span_size = new_size_pages;
span[new_size_pages - 1].free = false;
span[new_size_pages - 1].span_size = new_size_pages;
@@ -881,6 +1095,7 @@ small_pool::add_more_objects() {
}
}
while (_free_count < goal) {
+ disable_backtrace_temporarily dbt;
auto data = reinterpret_cast<char*>(cpu_mem.allocate_large(_span_size));
if (!data) {
return;
@@ -967,6 +1182,7 @@ void* allocate(size_t size) {
}
void* ptr;
if (size <= max_small_allocation) {
+ size = object_size_with_alloc_site(size);
ptr = cpu_mem.allocate_small(size);
} else {
ptr = allocate_large(size);
@@ -987,7 +1203,7 @@ void* allocate_aligned(size_t align, size_t size) {
if (size <= max_small_allocation && align <= page_size) {
// Our small allocator only guarantees alignment for power-of-two
// allocations which are not larger than a page.
- size = 1 << log2ceil(size);
+ size = 1 << log2ceil(object_size_with_alloc_site(size));
ptr = cpu_mem.allocate_small(size);
} else {
ptr = allocate_large_aligned(align, size);
@@ -1384,6 +1600,10 @@ void operator delete[](void* ptr, with_alignment wa) {

namespace memory {

+void set_heap_profiling_enabled(bool enabled) {
+ seastar_logger.warn("Seastar compiled with default allocator, heap profiler not supported");
+}
+
void enable_abort_on_allocation_failure() {
seastar_logger.warn("Seastar compiled with default allocator, will not abort on bad_alloc");
}
diff --git a/core/reactor.cc b/core/reactor.cc
index 585be7c..ea6e891 100644
--- a/core/reactor.cc
+++ b/core/reactor.cc
@@ -2928,6 +2928,9 @@ reactor::get_options_description() {
("relaxed-dma", "allow using buffered I/O if DMA is not available (reduces performance)")
("overprovisioned", "run in an overprovisioned environment (such as docker or a laptop); equivalent to --idle-poll-time-us 0 --thread-affinity 0 --poll-aio 0")
("abort-on-seastar-bad-alloc", "abort when seastar allocator cannot allocate memory")
+#ifdef SEASTAR_HEAPPROF
+ ("heapprof", "enable seastar heap profiling")
+#endif
;
opts.add(network_stack_registry::options_description());
return opts;
@@ -3225,6 +3228,9 @@ void smp::configure(boost::program_options::variables_map configuration)
memory::enable_abort_on_allocation_failure();
}

+ bool heapprof_enabled = configuration.count("heapprof");
+ memory::set_heap_profiling_enabled(heapprof_enabled);
+
#ifdef HAVE_DPDK
if (smp::_using_dpdk) {
dpdk::eal::cpuset cpus;
@@ -3276,11 +3282,12 @@ void smp::configure(boost::program_options::variables_map configuration)
unsigned i;
for (i = 1; i < smp::count; i++) {
auto allocation = allocations[i];
- create_thread([configuration, hugepages_path, i, allocation, assign_io_queue, alloc_io_queue, thread_affinity] {
+ create_thread([configuration, hugepages_path, i, allocation, assign_io_queue, alloc_io_queue, thread_affinity, heapprof_enabled] {
if (thread_affinity) {
smp::pin(allocation.cpu_id);
}
memory::configure(allocation.mem, hugepages_path);
+ memory::set_heap_profiling_enabled(heapprof_enabled);
sigset_t mask;
sigfillset(&mask);
for (auto sig : { SIGSEGV, SIGQUIT }) {
--
2.5.5

Avi Kivity

<avi@scylladb.com>

unread,

Oct 1, 2016, 1:48:24 PM (10/1/16)

to Tomasz Grabiec, seastar-dev@googlegroups.com

On 09/30/2016 04:06 PM, Tomasz Grabiec wrote:
> Records allocation sites and sizes so that it's later possible to see
> what kind of objects are taking up space on the heap.

Committing because this is such a beautiful patch, but see below.

> Adds 8 bytes of memory overhead per live object when compiled-in.

Could have avoided this by creating a new small_pool per allocation_site
(giving up sized deallocation optimization). Doesn't matter for our use
case.

[On std::unordered_set<std::unique_ptr<allocation_site>, ...>:] Why the indirection? Just to maintain alloc_site_list_head?

I think traversing an unordered_set is not that hard, there's a linked
list you can follow that contains all elements.

[On `pool.alloc_site_holder(ptr) = alloc_site;`:] It may be safer with respect to strict aliasing rules to use placement new:
new (&pool.alloc_site_holder(ptr)) allocation_site_ptr{alloc_site};

Commit Bot

<bot@cloudius-systems.com>

unread,

Oct 1, 2016, 1:50:35 PM (10/1/16)

to seastar-dev@googlegroups.com, Tomasz Grabiec

From: Tomasz Grabiec <tgra...@scylladb.com>
Committer: Avi Kivity <a...@scylladb.com>
Branch: master

core/memory: Introduce heap profiler

Message-Id: <1475240766-20198-1-gi...@scylladb.com>

---
diff --git a/core/memory.cc b/core/memory.cc

@@ -516,25 +625,107 @@ cpu_pages::allocate_large_aligned(unsigned

diff --git a/core/memory.hh b/core/memory.hh

--- a/core/memory.hh
+++ b/core/memory.hh
@@ -62,6 +62,8 @@ void configure(std::vector<resource::memory> m,

void enable_abort_on_allocation_failure();

+void set_heap_profiling_enabled(bool);
+
void* allocate_reclaimable(size_t size);

enum class reclaiming_result {

diff --git a/core/reactor.cc b/core/reactor.cc

--- a/core/reactor.cc
+++ b/core/reactor.cc
@@ -2928,6 +2928,9 @@ reactor::get_options_description() {
("relaxed-dma", "allow using buffered I/O if DMA is not available
(reduces performance)")
("overprovisioned", "run in an overprovisioned environment (such
as docker or a laptop); equivalent to --idle-poll-time-us 0
--thread-affinity 0 --poll-aio 0")
("abort-on-seastar-bad-alloc", "abort when seastar allocator
cannot allocate memory")
+#ifdef SEASTAR_HEAPPROF
+ ("heapprof", "enable seastar heap profiling")
+#endif
;
opts.add(network_stack_registry::options_description());
return opts;

@@ -3205,6 +3208,9 @@ void

smp::configure(boost::program_options::variables_map configuration)
memory::enable_abort_on_allocation_failure();
}

+ bool heapprof_enabled = configuration.count("heapprof");
+ memory::set_heap_profiling_enabled(heapprof_enabled);
+
#ifdef HAVE_DPDK
if (smp::_using_dpdk) {
dpdk::eal::cpuset cpus;

@@ -3256,11 +3262,12 @@ void

smp::configure(boost::program_options::variables_map configuration)
unsigned i;
for (i = 1; i < smp::count; i++) {
auto allocation = allocations[i];
- create_thread([configuration, hugepages_path, i, allocation,
assign_io_queue, alloc_io_queue, thread_affinity] {
+ create_thread([configuration, hugepages_path, i, allocation,
assign_io_queue, alloc_io_queue, thread_affinity, heapprof_enabled] {
if (thread_affinity) {
smp::pin(allocation.cpu_id);
}
memory::configure(allocation.mem, hugepages_path);
+ memory::set_heap_profiling_enabled(heapprof_enabled);
sigset_t mask;
sigfillset(&mask);

for (auto sig : { SIGSEGV }) {

Pekka Enberg

<penberg@scylladb.com>

unread,

Oct 3, 2016, 2:01:39 AM (10/3/16)

to Tomasz Grabiec, seastar-dev

On Fri, Sep 30, 2016 at 4:06 PM, Tomasz Grabiec <tgra...@scylladb.com> wrote:
> Records allocation sites and sizes so that it's later possible to see
> what kind of objects are taking up space on the heap.
>
> Adds 8 bytes of memory overhead per live object when compiled-in.
>
> Not compiled in by default. To compile it in, configure with -DSEASTAR_HEAPPROF:
>
> ./configure.py --cflags=-DSEASTAR_HEAPPROF
>
> To enable recording of allocation sites, start the seastar application with
> the --heapprof flag:
>
> build/release/memcached --heapprof
>
> It is possible to enable recording at run time by calling
> memory::set_heap_profiling_enabled() on each thread, for instance from a
> RESTful API handler.
>
> scylla-gdb.py support will follow, with text-mode tree presentation
> and dumping to flamegraphs.
>
> Flamegraph example:
>
> https://cloud.githubusercontent.com/assets/283695/18888026/f69fc4e6-84f6-11e6-9b7b-305667d30f52.png

Nice patch, Tomek!

Tomasz Grabiec

<tgrabiec@scylladb.com>

unread,

Oct 3, 2016, 4:04:05 AM (10/3/16)

to Avi Kivity, seastar-dev

The indirection is there to make references to allocation_site objects stable, because we store pointers to them with each object. But I can see now that unordered_set does not invalidate references to its elements on insert, so we could get rid of this indirection.

Will change.

Reply all

Reply to author

Forward