From b273b45f7f33c4df3bb9d6325cbe1244e8733106 Mon Sep 17 00:00:00 2001 From: Ahmed Farghal Date: Wed, 1 May 2024 14:11:07 +0100 Subject: [PATCH] Tune rocksdb for multi-cf setup --- crates/node/src/network_server/handler/mod.rs | 92 +++++++++++++++---- .../src/partition_store_manager.rs | 3 - crates/rocksdb/src/db_manager.rs | 1 - crates/types/src/config/bifrost.rs | 2 + crates/types/src/config/common.rs | 2 +- crates/types/src/config/rocksdb.rs | 4 +- 6 files changed, 78 insertions(+), 26 deletions(-) diff --git a/crates/node/src/network_server/handler/mod.rs b/crates/node/src/network_server/handler/mod.rs index 59a6b87917..6582a8773b 100644 --- a/crates/node/src/network_server/handler/mod.rs +++ b/crates/node/src/network_server/handler/mod.rs @@ -17,7 +17,7 @@ use axum::extract::State; use metrics_exporter_prometheus::formatting; use rocksdb::statistics::{Histogram, Ticker}; -use restate_rocksdb::RocksDbManager; +use restate_rocksdb::{CfName, RocksDbManager}; use crate::network_server::prometheus_helpers::{ format_rocksdb_histogram_for_prometheus, format_rocksdb_property_for_prometheus, @@ -25,30 +25,36 @@ use crate::network_server::prometheus_helpers::{ }; use crate::network_server::state::NodeCtrlHandlerState; -static ROCKSDB_TICKERS: &[Ticker] = &[ - Ticker::BlockCacheDataBytesInsert, - Ticker::BlockCacheDataHit, - Ticker::BlockCacheDataMiss, +const ROCKSDB_TICKERS: &[Ticker] = &[ + Ticker::BlockCacheBytesRead, + Ticker::BlockCacheBytesWrite, + Ticker::BlockCacheHit, + Ticker::BlockCacheMiss, Ticker::BloomFilterUseful, Ticker::BytesRead, Ticker::BytesWritten, Ticker::CompactReadBytes, Ticker::CompactWriteBytes, Ticker::FlushWriteBytes, + Ticker::IterBytesRead, Ticker::MemtableHit, Ticker::MemtableMiss, Ticker::NoIteratorCreated, Ticker::NoIteratorDeleted, + Ticker::NumberDbNext, + Ticker::NumberDbSeek, + Ticker::NumberIterSkip, Ticker::NumberKeysRead, Ticker::NumberKeysUpdated, Ticker::NumberKeysWritten, + Ticker::NumberOfReseeksInIteration, Ticker::StallMicros, Ticker::WalFileBytes, Ticker::WalFileSynced, Ticker::WriteWithWal, ]; -static ROCKSDB_HISTOGRAMS: &[(Histogram, &str, MetricUnit)] = &[ +const ROCKSDB_HISTOGRAMS: &[(Histogram, &str, MetricUnit)] = &[ (Histogram::DbGet, "rocksdb.db.get", MetricUnit::Micros), ( Histogram::DbMultiget, @@ -58,11 +64,46 @@ static ROCKSDB_HISTOGRAMS: &[(Histogram, &str, MetricUnit)] = &[ (Histogram::DbWrite, "rocksdb.db.write", MetricUnit::Micros), (Histogram::DbSeek, "rocksdb.db.seek", MetricUnit::Micros), (Histogram::FlushTime, "rocksdb.db.flush", MetricUnit::Micros), + ( + Histogram::ReadBlockGetMicros, + "rocksdb.read.block.get", + MetricUnit::Micros, + ), + ( + Histogram::SstReadMicros, + "rocksdb.sst.read", + MetricUnit::Micros, + ), + ( + Histogram::SstWriteMicros, + "rocksdb.sst.write", + MetricUnit::Micros, + ), + ( + Histogram::ReadNumMergeOperands, + Histogram::ReadNumMergeOperands.name(), + MetricUnit::Count, + ), + ( + Histogram::NumSstReadPerLevel, + Histogram::NumSstReadPerLevel.name(), + MetricUnit::Count, + ), ( Histogram::WalFileSyncMicros, "rocksdb.wal.file.sync", MetricUnit::Micros, ), + ( + Histogram::AsyncReadBytes, + "rocksdb.async.read", + MetricUnit::Bytes, + ), + ( + Histogram::PollWaitMicros, + "rocksdb.poll.wait", + MetricUnit::Micros, + ), ( Histogram::CompactionTime, "rocksdb.compaction.times", @@ -90,8 +131,15 @@ static ROCKSDB_HISTOGRAMS: &[(Histogram, &str, MetricUnit)] = &[ ), ]; +// Per database properties +const ROCKSDB_DB_PROPERTIES: &[(&str, MetricUnit)] = &[ + ("rocksdb.block-cache-capacity", 
MetricUnit::Bytes), + ("rocksdb.block-cache-usage", MetricUnit::Bytes), + ("rocksdb.block-cache-pinned-usage", MetricUnit::Bytes), +]; + // Per column-family properties -static ROCKSDB_PROPERTIES: &[(&str, MetricUnit)] = &[ +const ROCKSDB_CF_PROPERTIES: &[(&str, MetricUnit)] = &[ ("rocksdb.num-immutable-mem-table", MetricUnit::Count), ("rocksdb.mem-table-flush-pending", MetricUnit::Count), ("rocksdb.compaction-pending", MetricUnit::Count), @@ -113,19 +161,19 @@ static ROCKSDB_PROPERTIES: &[(&str, MetricUnit)] = &[ "rocksdb.estimate-pending-compaction-bytes", MetricUnit::Bytes, ), + ("rocksdb.num-running-flushes", MetricUnit::Count), ("rocksdb.num-running-compactions", MetricUnit::Count), ("rocksdb.actual-delayed-write-rate", MetricUnit::Count), - ("rocksdb.block-cache-capacity", MetricUnit::Bytes), - ("rocksdb.block-cache-usage", MetricUnit::Bytes), - ("rocksdb.block-cache-pinned-usage", MetricUnit::Bytes), ("rocksdb.num-files-at-level0", MetricUnit::Count), ("rocksdb.num-files-at-level1", MetricUnit::Count), // Add more as needed. ("rocksdb.num-files-at-level2", MetricUnit::Count), + ("rocksdb.num-files-at-level3", MetricUnit::Count), ]; // -- Direct HTTP Handlers -- pub async fn render_metrics(State(state): State) -> String { + let default_cf = CfName::new("default"); let mut out = String::new(); // Response content type is plain/text and that's expected. @@ -182,14 +230,6 @@ pub async fn render_metrics(State(state): State) -> String .get_memory_usage_stats(&[]) .expect("get_memory_usage_stats"); - format_rocksdb_property_for_prometheus( - &mut out, - &labels, - MetricUnit::Bytes, - "rocksdb.memory.approximate-cache", - memory_usage.approximate_cache_total(), - ); - format_rocksdb_property_for_prometheus( &mut out, &labels, @@ -214,6 +254,20 @@ pub async fn render_metrics(State(state): State) -> String memory_usage.approximate_mem_table_readers_total(), ); + // Other per-database properties + for (property, unit) in ROCKSDB_DB_PROPERTIES { + format_rocksdb_property_for_prometheus( + &mut out, + &labels, + *unit, + property, + db.inner() + .get_property_int_cf(&default_cf, property) + .unwrap_or_default() + .unwrap_or_default(), + ); + } + // Properties (Gauges) // For properties, we need to get them for each column family. 
for cf in &db.cfs() { @@ -221,7 +275,7 @@ pub async fn render_metrics(State(state): State) -> String let mut cf_labels = Vec::with_capacity(labels.len() + 1); labels.clone_into(&mut cf_labels); cf_labels.push(format!("cf=\"{}\"", sanitized_cf_name)); - for (property, unit) in ROCKSDB_PROPERTIES { + for (property, unit) in ROCKSDB_CF_PROPERTIES { format_rocksdb_property_for_prometheus( &mut out, &cf_labels, diff --git a/crates/partition-store/src/partition_store_manager.rs b/crates/partition-store/src/partition_store_manager.rs index 467859bc72..e09d117ce7 100644 --- a/crates/partition-store/src/partition_store_manager.rs +++ b/crates/partition-store/src/partition_store_manager.rs @@ -149,9 +149,6 @@ fn db_options() -> rocksdb::Options { // db_options.set_keep_log_file_num(1); - // we always need to enable atomic flush in case that the user disables wal at runtime - db_options.set_atomic_flush(true); - // we always enable manual wal flushing in case that the user enables wal at runtime db_options.set_manual_wal_flush(true); diff --git a/crates/rocksdb/src/db_manager.rs b/crates/rocksdb/src/db_manager.rs index 4ae2b720d4..4eeb8df07d 100644 --- a/crates/rocksdb/src/db_manager.rs +++ b/crates/rocksdb/src/db_manager.rs @@ -313,7 +313,6 @@ impl RocksDbManager { // write buffer // cf_options.set_write_buffer_size(opts.rocksdb_write_buffer_size().get()); - // // bloom filters and block cache. // let mut block_opts = BlockBasedOptions::default(); diff --git a/crates/types/src/config/bifrost.rs b/crates/types/src/config/bifrost.rs index 5d4b5d5123..fdebbf285c 100644 --- a/crates/types/src/config/bifrost.rs +++ b/crates/types/src/config/bifrost.rs @@ -8,6 +8,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. +use std::num::NonZeroUsize; use std::path::PathBuf; use std::time::Duration; @@ -95,6 +96,7 @@ impl LocalLogletOptions { impl Default for LocalLogletOptions { fn default() -> Self { let rocksdb = RocksDbOptionsBuilder::default() + .rocksdb_write_buffer_size(Some(NonZeroUsize::new(128_000_000).unwrap())) .rocksdb_disable_wal(Some(false)) .build() .unwrap(); diff --git a/crates/types/src/config/common.rs b/crates/types/src/config/common.rs index 2388917a0b..6d2a4e24be 100644 --- a/crates/types/src/config/common.rs +++ b/crates/types/src/config/common.rs @@ -281,7 +281,7 @@ impl Default for CommonOptions { default_thread_pool_size: None, storage_high_priority_bg_threads: None, storage_low_priority_bg_threads: None, - rocksdb_total_memtables_size: 0, + rocksdb_total_memtables_size: 2_000_000_000, // 2GB (50% of total memory) rocksdb_total_memory_size: NonZeroUsize::new(4_000_000_000).unwrap(), // 4GB rocksdb_bg_threads: None, rocksdb_high_priority_bg_threads: NonZeroU32::new(2).unwrap(), diff --git a/crates/types/src/config/rocksdb.rs b/crates/types/src/config/rocksdb.rs index b710a3e23d..98bf8d2f62 100644 --- a/crates/types/src/config/rocksdb.rs +++ b/crates/types/src/config/rocksdb.rs @@ -26,7 +26,7 @@ pub struct RocksDbOptions { /// # Write Buffer size /// /// The size of a single memtable. Once memtable exceeds this size, it is marked - /// immutable and a new one is created. Default is 256MB per memtable. + /// immutable and a new one is created. Default is 50MB per memtable. 
#[serde(skip_serializing_if = "Option::is_none")] #[serde_as(as = "Option")] #[cfg_attr(feature = "schemars", schemars(with = "Option"))] @@ -109,7 +109,7 @@ impl RocksDbOptions { pub fn rocksdb_write_buffer_size(&self) -> NonZeroUsize { self.rocksdb_write_buffer_size - .unwrap_or(NonZeroUsize::new(256_000_000).unwrap()) // 256MB + .unwrap_or(NonZeroUsize::new(50_000_000).unwrap()) // 50MB } pub fn rocksdb_max_total_wal_size(&self) -> NonZeroUsize {
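Not part of the patch, but useful when reviewing the numbers: a minimal Rust sketch of how the tuned defaults relate to each other. It only restates values introduced above (the 4GB total memory and 2GB shared memtable budget from CommonOptions::default(), and the new 50MB per-memtable write buffer from RocksDbOptions::rocksdb_write_buffer_size()); memtable_budget_example and main are hypothetical helpers for illustration, and the exact flush behaviour under the shared budget is left to RocksDB.

use std::num::NonZeroUsize;

// Hypothetical helper, for illustration only: relates the defaults changed in
// this patch. The constants mirror CommonOptions::default() and
// RocksDbOptions::rocksdb_write_buffer_size() above.
fn memtable_budget_example() {
    let total_memory: u64 = 4_000_000_000; // rocksdb-total-memory-size (4GB)
    let total_memtables: u64 = 2_000_000_000; // rocksdb-total-memtables-size (2GB)

    // New per-memtable default (was 256MB before this patch).
    let write_buffer = NonZeroUsize::new(50_000_000).unwrap(); // 50MB

    // The memtable budget is shared across all column families, so roughly this
    // many 50MB memtables fit before the budget is exhausted:
    assert_eq!(total_memtables / write_buffer.get() as u64, 40);

    // The "50% of total memory" comment in CommonOptions::default():
    assert_eq!(total_memtables * 100 / total_memory, 50);
}

fn main() {
    memtable_budget_example();
}

If 50MB per memtable proves too small for a write-heavy store, the bifrost change above already shows how a component can override it locally via RocksDbOptionsBuilder::rocksdb_write_buffer_size, as LocalLogletOptions::default() now does with 128MB.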