Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tune rocksdb for multi-cf setup #1490

Merged
merged 1 commit into from
May 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 73 additions & 19 deletions crates/node/src/network_server/handler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,38 +17,44 @@ use axum::extract::State;
use metrics_exporter_prometheus::formatting;
use rocksdb::statistics::{Histogram, Ticker};

use restate_rocksdb::RocksDbManager;
use restate_rocksdb::{CfName, RocksDbManager};

use crate::network_server::prometheus_helpers::{
format_rocksdb_histogram_for_prometheus, format_rocksdb_property_for_prometheus,
format_rocksdb_stat_ticker_for_prometheus, MetricUnit,
};
use crate::network_server::state::NodeCtrlHandlerState;

static ROCKSDB_TICKERS: &[Ticker] = &[
Ticker::BlockCacheDataBytesInsert,
Ticker::BlockCacheDataHit,
Ticker::BlockCacheDataMiss,
const ROCKSDB_TICKERS: &[Ticker] = &[
Ticker::BlockCacheBytesRead,
Ticker::BlockCacheBytesWrite,
Ticker::BlockCacheHit,
Ticker::BlockCacheMiss,
Ticker::BloomFilterUseful,
Ticker::BytesRead,
Ticker::BytesWritten,
Ticker::CompactReadBytes,
Ticker::CompactWriteBytes,
Ticker::FlushWriteBytes,
Ticker::IterBytesRead,
Ticker::MemtableHit,
Ticker::MemtableMiss,
Ticker::NoIteratorCreated,
Ticker::NoIteratorDeleted,
Ticker::NumberDbNext,
Ticker::NumberDbSeek,
Ticker::NumberIterSkip,
Ticker::NumberKeysRead,
Ticker::NumberKeysUpdated,
Ticker::NumberKeysWritten,
Ticker::NumberOfReseeksInIteration,
Ticker::StallMicros,
Ticker::WalFileBytes,
Ticker::WalFileSynced,
Ticker::WriteWithWal,
];

static ROCKSDB_HISTOGRAMS: &[(Histogram, &str, MetricUnit)] = &[
const ROCKSDB_HISTOGRAMS: &[(Histogram, &str, MetricUnit)] = &[
(Histogram::DbGet, "rocksdb.db.get", MetricUnit::Micros),
(
Histogram::DbMultiget,
Expand All @@ -58,11 +64,46 @@ static ROCKSDB_HISTOGRAMS: &[(Histogram, &str, MetricUnit)] = &[
(Histogram::DbWrite, "rocksdb.db.write", MetricUnit::Micros),
(Histogram::DbSeek, "rocksdb.db.seek", MetricUnit::Micros),
(Histogram::FlushTime, "rocksdb.db.flush", MetricUnit::Micros),
(
Histogram::ReadBlockGetMicros,
"rocksdb.read.block.get",
MetricUnit::Micros,
),
(
Histogram::SstReadMicros,
"rocksdb.sst.read",
MetricUnit::Micros,
),
(
Histogram::SstWriteMicros,
"rocksdb.sst.write",
MetricUnit::Micros,
),
(
Histogram::ReadNumMergeOperands,
Histogram::ReadNumMergeOperands.name(),
MetricUnit::Count,
),
(
Histogram::NumSstReadPerLevel,
Histogram::NumSstReadPerLevel.name(),
MetricUnit::Count,
),
(
Histogram::WalFileSyncMicros,
"rocksdb.wal.file.sync",
MetricUnit::Micros,
),
(
Histogram::AsyncReadBytes,
"rocksdb.async.read",
MetricUnit::Bytes,
),
(
Histogram::PollWaitMicros,
"rocksdb.poll.wait",
MetricUnit::Micros,
),
(
Histogram::CompactionTime,
"rocksdb.compaction.times",
Expand Down Expand Up @@ -90,8 +131,15 @@ static ROCKSDB_HISTOGRAMS: &[(Histogram, &str, MetricUnit)] = &[
),
];

// Per-database properties (reported once per database, not per column family).
// Each entry is a RocksDB integer property name paired with the unit used when
// formatting it for Prometheus. These block-cache figures are shared across all
// column families of a database, so querying them per-CF would double-count;
// they are read via a single representative CF instead.
const ROCKSDB_DB_PROPERTIES: &[(&str, MetricUnit)] = &[
    ("rocksdb.block-cache-capacity", MetricUnit::Bytes),
    ("rocksdb.block-cache-usage", MetricUnit::Bytes),
    ("rocksdb.block-cache-pinned-usage", MetricUnit::Bytes),
];

// Per column-family properties
static ROCKSDB_PROPERTIES: &[(&str, MetricUnit)] = &[
const ROCKSDB_CF_PROPERTIES: &[(&str, MetricUnit)] = &[
("rocksdb.num-immutable-mem-table", MetricUnit::Count),
("rocksdb.mem-table-flush-pending", MetricUnit::Count),
("rocksdb.compaction-pending", MetricUnit::Count),
Expand All @@ -113,19 +161,19 @@ static ROCKSDB_PROPERTIES: &[(&str, MetricUnit)] = &[
"rocksdb.estimate-pending-compaction-bytes",
MetricUnit::Bytes,
),
("rocksdb.num-running-flushes", MetricUnit::Count),
("rocksdb.num-running-compactions", MetricUnit::Count),
("rocksdb.actual-delayed-write-rate", MetricUnit::Count),
("rocksdb.block-cache-capacity", MetricUnit::Bytes),
("rocksdb.block-cache-usage", MetricUnit::Bytes),
("rocksdb.block-cache-pinned-usage", MetricUnit::Bytes),
("rocksdb.num-files-at-level0", MetricUnit::Count),
("rocksdb.num-files-at-level1", MetricUnit::Count),
// Add more as needed.
("rocksdb.num-files-at-level2", MetricUnit::Count),
("rocksdb.num-files-at-level3", MetricUnit::Count),
];

// -- Direct HTTP Handlers --
pub async fn render_metrics(State(state): State<NodeCtrlHandlerState>) -> String {
let default_cf = CfName::new("default");
let mut out = String::new();

// Response content type is plain/text and that's expected.
Expand Down Expand Up @@ -182,14 +230,6 @@ pub async fn render_metrics(State(state): State<NodeCtrlHandlerState>) -> String
.get_memory_usage_stats(&[])
.expect("get_memory_usage_stats");

format_rocksdb_property_for_prometheus(
&mut out,
&labels,
MetricUnit::Bytes,
"rocksdb.memory.approximate-cache",
memory_usage.approximate_cache_total(),
);

format_rocksdb_property_for_prometheus(
&mut out,
&labels,
Expand All @@ -214,14 +254,28 @@ pub async fn render_metrics(State(state): State<NodeCtrlHandlerState>) -> String
memory_usage.approximate_mem_table_readers_total(),
);

// Other per-database properties
for (property, unit) in ROCKSDB_DB_PROPERTIES {
format_rocksdb_property_for_prometheus(
&mut out,
&labels,
*unit,
property,
db.inner()
.get_property_int_cf(&default_cf, property)
.unwrap_or_default()
.unwrap_or_default(),
);
}

// Properties (Gauges)
// For properties, we need to get them for each column family.
for cf in &db.cfs() {
let sanitized_cf_name = formatting::sanitize_label_value(cf);
let mut cf_labels = Vec::with_capacity(labels.len() + 1);
labels.clone_into(&mut cf_labels);
cf_labels.push(format!("cf=\"{}\"", sanitized_cf_name));
for (property, unit) in ROCKSDB_PROPERTIES {
for (property, unit) in ROCKSDB_CF_PROPERTIES {
format_rocksdb_property_for_prometheus(
&mut out,
&cf_labels,
Expand Down
3 changes: 0 additions & 3 deletions crates/partition-store/src/partition_store_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,6 @@ fn db_options() -> rocksdb::Options {
//
db_options.set_keep_log_file_num(1);

// we always need to enable atomic flush in case that the user disables wal at runtime
db_options.set_atomic_flush(true);
Comment on lines -152 to -153
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did this setting hurt performance?


// we always enable manual wal flushing in case that the user enables wal at runtime
db_options.set_manual_wal_flush(true);

Expand Down
1 change: 0 additions & 1 deletion crates/rocksdb/src/db_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,6 @@ impl RocksDbManager {
// write buffer
//
cf_options.set_write_buffer_size(opts.rocksdb_write_buffer_size().get());
//
// bloom filters and block cache.
//
let mut block_opts = BlockBasedOptions::default();
Expand Down
2 changes: 2 additions & 0 deletions crates/types/src/config/bifrost.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0.

use std::num::NonZeroUsize;
use std::path::PathBuf;
use std::time::Duration;

Expand Down Expand Up @@ -95,6 +96,7 @@ impl LocalLogletOptions {
impl Default for LocalLogletOptions {
fn default() -> Self {
let rocksdb = RocksDbOptionsBuilder::default()
.rocksdb_write_buffer_size(Some(NonZeroUsize::new(128_000_000).unwrap()))
.rocksdb_disable_wal(Some(false))
.build()
.unwrap();
Expand Down
2 changes: 1 addition & 1 deletion crates/types/src/config/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ impl Default for CommonOptions {
default_thread_pool_size: None,
storage_high_priority_bg_threads: None,
storage_low_priority_bg_threads: None,
rocksdb_total_memtables_size: 0,
rocksdb_total_memtables_size: 2_000_000_000, // 2GB (50% of total memory)
rocksdb_total_memory_size: NonZeroUsize::new(4_000_000_000).unwrap(), // 4GB
rocksdb_bg_threads: None,
rocksdb_high_priority_bg_threads: NonZeroU32::new(2).unwrap(),
Expand Down
4 changes: 2 additions & 2 deletions crates/types/src/config/rocksdb.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ pub struct RocksDbOptions {
/// # Write Buffer size
///
/// The size of a single memtable. Once memtable exceeds this size, it is marked
/// immutable and a new one is created. Default is 256MB per memtable.
/// immutable and a new one is created. Default is 50MB per memtable.
#[serde(skip_serializing_if = "Option::is_none")]
#[serde_as(as = "Option<NonZeroByteCount>")]
#[cfg_attr(feature = "schemars", schemars(with = "Option<NonZeroByteCount>"))]
Expand Down Expand Up @@ -109,7 +109,7 @@ impl RocksDbOptions {

pub fn rocksdb_write_buffer_size(&self) -> NonZeroUsize {
self.rocksdb_write_buffer_size
.unwrap_or(NonZeroUsize::new(256_000_000).unwrap()) // 256MB
.unwrap_or(NonZeroUsize::new(50_000_000).unwrap()) // 50MB
}

pub fn rocksdb_max_total_wal_size(&self) -> NonZeroUsize {
Expand Down
Loading