[QUEUED scylla next] Merge 'auto-scale promoted index' from Benny Halevy

Commit Bot <bot@cloudius-systems.com>
May 25, 2022, 3:27:45 AM
to scylladb-dev@googlegroups.com, Avi Kivity
From: Avi Kivity <a...@scylladb.com>
Committer: Avi Kivity <a...@scylladb.com>
Branch: next

Merge 'auto-scale promoted index' from Benny Halevy

Add column_index_auto_scale_threshold_in_kb to the configuration (defaults to 10240 KB, i.e. 10 MB).

When the serialized size of the promoted index reaches this
threshold, it is halved by merging each pair of adjacent blocks
into one and doubling the desired_block_size.
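
The growth pattern can be illustrated with a minimal, self-contained C++
sketch (illustrative only, not the actual writer code; the variable names and
the 35 MB figure are assumptions, while the constants mirror the defaults
introduced by this patch):

    // Sketch: how the desired promoted-index block size grows as the
    // serialized promoted index for one partition crosses the running
    // auto-scale threshold.
    #include <cstddef>
    #include <cstdio>

    int main() {
        std::size_t desired_block_size = 64 * 1024;        // column_index_size_in_kb
        const std::size_t step = 10240 * 1024;             // column_index_auto_scale_threshold_in_kb
        std::size_t auto_scale_threshold = step;

        // Pretend the serialized promoted index grows to 35 MB in 64 KB increments.
        for (std::size_t serialized = 0; serialized < 35u * 1024 * 1024; serialized += 64 * 1024) {
            if (serialized >= auto_scale_threshold) {
                desired_block_size *= 2;                   // halve the sampling resolution
                auto_scale_threshold += step;              // arm the next auto-scale event
                std::printf("auto-scale at %zu MB -> block size %zu KB\n",
                            serialized >> 20, desired_block_size >> 10);
            }
        }
    }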

Fixes #4217

Signed-off-by: Benny Halevy <bha...@scylladb.com>

Closes #10646

* github.com:scylladb/scylla:
sstables: mx: add pi_auto_scale_events metric
sstables: mx/writer: auto-scale promoted index

---
diff --git a/conf/scylla.yaml b/conf/scylla.yaml
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -386,6 +386,15 @@ commitlog_total_space_in_mb: -1
# you can cache more hot rows
# column_index_size_in_kb: 64

+# Auto-scaling of the promoted index prevents running out of memory
+# when the promoted index grows too large (due to partitions with many rows
+# vs. too small column_index_size_in_kb). When the serialized representation
+# of the promoted index grows by this threshold, the desired block size
+# for this partition (initialized to column_index_size_in_kb)
+# is doubled, to decrease the sampling resolution by half.
+#
+# To disable promoted index auto-scaling, set the threshold to 0.
+# column_index_auto_scale_threshold_in_kb: 10240

# Log a warning when writing partitions larger than this value
# compaction_large_partition_warning_threshold_mb: 1000
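
For concreteness, with the defaults shown here (column_index_size_in_kb: 64
and column_index_auto_scale_threshold_in_kb: 10240), the first auto-scale
event fires once a partition's serialized promoted index reaches 10 MB and
doubles the block size to 128 KB; the next event fires at 20 MB (256 KB),
then at 30 MB (512 KB), and so on. Setting the threshold to 0 disables the
feature; as the sstables_manager.cc hunk below shows, 0 is mapped internally
to an effectively unlimited threshold.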
diff --git a/db/config.cc b/db/config.cc
--- a/db/config.cc
+++ b/db/config.cc
@@ -468,6 +468,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
/* Cache and index settings */
, column_index_size_in_kb(this, "column_index_size_in_kb", value_status::Used, 64,
"Granularity of the index of rows within a partition. For huge rows, decrease this setting to improve seek time. If you use key cache, be careful not to make this setting too large because key cache will be overwhelmed. If you're unsure of the size of the rows, it's best to use the default setting.")
+ , column_index_auto_scale_threshold_in_kb(this, "column_index_auto_scale_threshold_in_kb", liveness::LiveUpdate, value_status::Used, 10240,
+ "Auto-reduce the promoted index granularity by half when reaching this threshold, to prevent promoted index bloating due to partitions with too many rows. Set to 0 to disable this feature.")
, index_summary_capacity_in_mb(this, "index_summary_capacity_in_mb", value_status::Unused, 0,
"Fixed memory pool size in MB for SSTable index summaries. If the memory usage of all index summaries exceeds this limit, any SSTables with low read rates shrink their index summaries to meet this limit. This is a best-effort process. In extreme conditions, Cassandra may need to use more than this amount of memory.")
, index_summary_resize_interval_in_minutes(this, "index_summary_resize_interval_in_minutes", value_status::Unused, 60,
diff --git a/db/config.hh b/db/config.hh
--- a/db/config.hh
+++ b/db/config.hh
@@ -190,6 +190,7 @@ public:
named_value<uint32_t> memtable_heap_space_in_mb;
named_value<uint32_t> memtable_offheap_space_in_mb;
named_value<uint32_t> column_index_size_in_kb;
+ named_value<uint32_t> column_index_auto_scale_threshold_in_kb;
named_value<uint32_t> index_summary_capacity_in_mb;
named_value<uint32_t> index_summary_resize_interval_in_minutes;
named_value<double> reduce_cache_capacity_to;
diff --git a/sstables/mx/writer.cc b/sstables/mx/writer.cc
--- a/sstables/mx/writer.cc
+++ b/sstables/mx/writer.cc
@@ -581,7 +581,14 @@ class writer : public sstable_writer::writer_impl {
uint64_t block_next_start_offset;
std::optional<clustering_info> first_clustering;
std::optional<clustering_info> last_clustering;
+
+ // for this partition
size_t desired_block_size;
+ size_t auto_scale_threshold;
+
+ // from write config
+ size_t promoted_index_block_size;
+ size_t promoted_index_auto_scale_threshold;
} _pi_write_m;
utils::UUID _run_identifier;
bool _write_regular_as_static; // See #4139
@@ -785,7 +792,8 @@ class writer : public sstable_writer::writer_impl {

_cfg.monitor->on_write_started(_data_writer->offset_tracker());
_sst._components->filter = utils::i_filter::get_filter(estimated_partitions, _schema.bloom_filter_fp_chance(), utils::filter_format::m_format);
- _pi_write_m.desired_block_size = cfg.promoted_index_block_size;
+ _pi_write_m.promoted_index_block_size = cfg.promoted_index_block_size;
+ _pi_write_m.promoted_index_auto_scale_threshold = cfg.promoted_index_auto_scale_threshold;
_index_sampling_state.summary_byte_cost = _cfg.summary_byte_cost;
prepare_summary(_sst._components->summary, estimated_partitions, _schema.min_index_interval());
}
@@ -844,6 +852,13 @@ void writer::add_pi_block() {

write_pi_block(block);
++_pi_write_m.promoted_index_size;
+
+ // auto-scale?
+ if (_pi_write_m.blocks.size() >= _pi_write_m.auto_scale_threshold) {
+ _pi_write_m.desired_block_size *= 2;
+ _pi_write_m.auto_scale_threshold += _pi_write_m.promoted_index_auto_scale_threshold;
+ _sst.get_stats().on_promoted_index_auto_scale();
+ }
}

void writer::maybe_add_pi_block() {
@@ -920,6 +935,8 @@ void writer::consume_new_partition(const dht::decorated_key& dk) {
_pi_write_m.tomb = {};
_pi_write_m.first_clustering.reset();
_pi_write_m.last_clustering.reset();
+ _pi_write_m.desired_block_size = _pi_write_m.promoted_index_block_size;
+ _pi_write_m.auto_scale_threshold = _pi_write_m.promoted_index_auto_scale_threshold;

write(_sst.get_version(), *_data_writer, p_key);
_partition_header_length = _data_writer->offset() - _c_stats.start_offset;
diff --git a/sstables/sstables.cc b/sstables/sstables.cc
--- a/sstables/sstables.cc
+++ b/sstables/sstables.cc
@@ -3112,6 +3112,9 @@ future<> init_metrics() {
sm::description("Number of tombstones written")),
sm::make_counter("range_tombstone_writes", [] { return sstables_stats::get_shard_stats().range_tombstone_writes; },
sm::description("Number of range tombstones written")),
+ sm::make_counter("pi_auto_scale_events", [] { return sstables_stats::get_shard_stats().promoted_index_auto_scale_events; },
+ sm::description("Number of promoted index auto-scaling events")),
+
sm::make_counter("range_tombstone_reads", [] { return sstables_stats::get_shard_stats().range_tombstone_reads; },
sm::description("Number of range tombstones read")),
sm::make_counter("row_tombstone_reads", [] { return sstables_stats::get_shard_stats().row_tombstone_reads; },
diff --git a/sstables/sstables.hh b/sstables/sstables.hh
--- a/sstables/sstables.hh
+++ b/sstables/sstables.hh
@@ -101,6 +101,7 @@ extern size_t summary_byte_cost(double summary_ratio);

struct sstable_writer_config {
size_t promoted_index_block_size;
+ size_t promoted_index_auto_scale_threshold;
uint64_t max_sstable_size = std::numeric_limits<uint64_t>::max();
bool backup = false;
bool leave_unsealed = false;
diff --git a/sstables/sstables_manager.cc b/sstables/sstables_manager.cc
--- a/sstables/sstables_manager.cc
+++ b/sstables/sstables_manager.cc
@@ -48,6 +48,10 @@ sstable_writer_config sstables_manager::configure_writer(sstring origin) const {
sstable_writer_config cfg;

cfg.promoted_index_block_size = _db_config.column_index_size_in_kb() * 1024;
+ cfg.promoted_index_auto_scale_threshold = (size_t)_db_config.column_index_auto_scale_threshold_in_kb() * 1024;
+ if (!cfg.promoted_index_auto_scale_threshold) {
+ cfg.promoted_index_auto_scale_threshold = std::numeric_limits<size_t>::max();
+ }
cfg.validation_level = _db_config.enable_sstable_key_validation()
? mutation_fragment_stream_validation_level::clustering_key
: mutation_fragment_stream_validation_level::token;
diff --git a/sstables/stats.hh b/sstables/stats.hh
--- a/sstables/stats.hh
+++ b/sstables/stats.hh
@@ -35,6 +35,7 @@ class sstables_stats {
uint64_t open_for_writing = 0;
uint64_t closed_for_writing = 0;
uint64_t deleted = 0;
+ uint64_t promoted_index_auto_scale_events = 0;
} _shard_stats;

stats& _stats = _shard_stats;
@@ -125,6 +126,10 @@ public:
inline void on_delete() noexcept {
++_stats.deleted;
}
+
+ inline void on_promoted_index_auto_scale() noexcept {
+ ++_stats.promoted_index_auto_scale_events;
+ }
};

}
diff --git a/test/boost/config_test.cc b/test/boost/config_test.cc
--- a/test/boost/config_test.cc
+++ b/test/boost/config_test.cc
@@ -561,7 +561,7 @@ tombstone_failure_threshold: 100000
# rows (as part of the key cache), so a larger granularity means
# you can cache more hot rows
column_index_size_in_kb: 64
-
+column_index_auto_scale_threshold_in_kb: 1024

# Log WARN on any batch size exceeding this value. 5kb per batch by default.
# Caution should be taken on increasing the size of this threshold as it can lead to node instability.
diff --git a/test/boost/sstable_datafile_test.cc b/test/boost/sstable_datafile_test.cc
--- a/test/boost/sstable_datafile_test.cc
+++ b/test/boost/sstable_datafile_test.cc
@@ -1983,6 +1983,7 @@ SEASTAR_TEST_CASE(test_skipping_using_index) {
tmpdir dir;
sstable_writer_config cfg = env.manager().configure_writer();
cfg.promoted_index_block_size = 1; // So that every fragment is indexed
+ cfg.promoted_index_auto_scale_threshold = 0; // disable auto-scaling
auto sst = make_sstable_easy(env, dir.path(), make_flat_mutation_reader_from_mutations_v2(table.schema(), env.make_reader_permit(), partitions), cfg, 1, version);

auto ms = as_mutation_source(sst);
diff --git a/test/boost/sstable_mutation_test.cc b/test/boost/sstable_mutation_test.cc
--- a/test/boost/sstable_mutation_test.cc
+++ b/test/boost/sstable_mutation_test.cc
@@ -957,6 +957,44 @@ SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic) {
mt->apply(std::move(m));
sstable_writer_config cfg = env.manager().configure_writer();
cfg.promoted_index_block_size = 1;
+ cfg.promoted_index_auto_scale_threshold = 0; // disable auto-scaling
+
+ auto sst = make_sstable_easy(env, dir.path(), mt, cfg);
+ assert_that(get_index_reader(sst, env.make_reader_permit())).has_monotonic_positions(*s);
+ });
+}
+
+SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic_with_auto_scaling) {
+ return test_env::do_with_async([] (test_env& env) {
+ auto dir = tmpdir();
+ schema_builder builder("ks", "cf");
+ builder.with_column("p", utf8_type, column_kind::partition_key);
+ builder.with_column("c1", int32_type, column_kind::clustering_key);
+ builder.with_column("c2", int32_type, column_kind::clustering_key);
+ builder.with_column("v", int32_type);
+ auto s = builder.build();
+
+ auto k = partition_key::from_exploded(*s, {to_bytes(make_local_key(s))});
+ auto cell = atomic_cell::make_live(*int32_type, 1, int32_type->decompose(88), { });
+ mutation m(s, k);
+
+ for (int i = 1; i <= 1024; i++) {
+ auto ck = clustering_key::from_exploded(*s, {int32_type->decompose(i), int32_type->decompose(i*2)});
+ m.set_clustered_cell(ck, *s->get_column_definition("v"), atomic_cell(*int32_type, cell));
+ }
+
+ m.partition().apply_row_tombstone(*s, range_tombstone(
+ clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}),
+ bound_kind::excl_start,
+ clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}),
+ bound_kind::incl_end,
+ {1, gc_clock::now()}));
+
+ auto mt = make_lw_shared<replica::memtable>(s);
+ mt->apply(std::move(m));
+ sstable_writer_config cfg = env.manager().configure_writer();
+ cfg.promoted_index_block_size = 1;
+ cfg.promoted_index_auto_scale_threshold = 100; // set to a low value to trigger auto-scaling

auto sst = make_sstable_easy(env, dir.path(), mt, cfg);
assert_that(get_index_reader(sst, env.make_reader_permit())).has_monotonic_positions(*s);

Commit Bot <bot@cloudius-systems.com>
May 25, 2022, 2:57:03 PM
to scylladb-dev@googlegroups.com, Avi Kivity
From: Avi Kivity <a...@scylladb.com>
Committer: Avi Kivity <a...@scylladb.com>
Branch: master