From a83da4f17bd68e92115c6584e816ab1aa53667aa Mon Sep 17 00:00:00 2001 From: Jason Volk <jason@zemos.net> Date: Sat, 6 Apr 2024 08:48:41 -0700 Subject: [PATCH] refactor rocksdb opts; split kvtree Signed-off-by: Jason Volk <jason@zemos.net> --- src/database/rocksdb/kvtree.rs | 197 ++++++++++++++++++++ src/database/rocksdb/mod.rs | 325 ++------------------------------- src/database/rocksdb/opts.rs | 235 ++++++++++++++++++++++++ 3 files changed, 449 insertions(+), 308 deletions(-) create mode 100644 src/database/rocksdb/kvtree.rs create mode 100644 src/database/rocksdb/opts.rs diff --git a/src/database/rocksdb/kvtree.rs b/src/database/rocksdb/kvtree.rs new file mode 100644 index 000000000..4b30b552e --- /dev/null +++ b/src/database/rocksdb/kvtree.rs @@ -0,0 +1,197 @@ +use std::{future::Future, pin::Pin, sync::Arc}; + +use rust_rocksdb::WriteBatchWithTransaction; + +use super::{watchers::Watchers, Engine, KeyValueDatabaseEngine, KvTree}; +use crate::{utils, Result}; + +pub(super) struct RocksDbEngineTree<'a> { + pub db: Arc<Engine>, + pub name: &'a str, + pub watchers: Watchers, +} + +impl RocksDbEngineTree<'_> { + fn cf(&self) -> Arc<rust_rocksdb::BoundColumnFamily<'_>> { self.db.rocks.cf_handle(self.name).unwrap() } +} + +impl KvTree for RocksDbEngineTree<'_> { + fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>> { + let mut readoptions = rust_rocksdb::ReadOptions::default(); + readoptions.set_total_order_seek(true); + + Ok(self.db.rocks.get_cf_opt(&self.cf(), key, &readoptions)?) + } + + fn multi_get( + &self, iter: Vec<(&Arc<rust_rocksdb::BoundColumnFamily<'_>>, Vec<u8>)>, + ) -> Vec<Result<Option<Vec<u8>>, rust_rocksdb::Error>> { + let mut readoptions = rust_rocksdb::ReadOptions::default(); + readoptions.set_total_order_seek(true); + + self.db.rocks.multi_get_cf_opt(iter, &readoptions) + } + + fn insert(&self, key: &[u8], value: &[u8]) -> Result<()> { + let writeoptions = rust_rocksdb::WriteOptions::default(); + + self.db + .rocks + .put_cf_opt(&self.cf(), key, value, &writeoptions)?; + + if !self.db.corked() { + self.db.flush()?; + } + + self.watchers.wake(key); + + Ok(()) + } + + fn insert_batch(&self, iter: &mut dyn Iterator<Item = (Vec<u8>, Vec<u8>)>) -> Result<()> { + let writeoptions = rust_rocksdb::WriteOptions::default(); + + let mut batch = WriteBatchWithTransaction::<false>::default(); + + for (key, value) in iter { + batch.put_cf(&self.cf(), key, value); + } + + let result = self.db.rocks.write_opt(batch, &writeoptions); + + if !self.db.corked() { + self.db.flush()?; + } + + Ok(result?) + } + + fn remove(&self, key: &[u8]) -> Result<()> { + let writeoptions = rust_rocksdb::WriteOptions::default(); + + let result = self.db.rocks.delete_cf_opt(&self.cf(), key, &writeoptions); + + if !self.db.corked() { + self.db.flush()?; + } + + Ok(result?) + } + + fn remove_batch(&self, iter: &mut dyn Iterator<Item = Vec<u8>>) -> Result<()> { + let writeoptions = rust_rocksdb::WriteOptions::default(); + + let mut batch = WriteBatchWithTransaction::<false>::default(); + + for key in iter { + batch.delete_cf(&self.cf(), key); + } + + let result = self.db.rocks.write_opt(batch, &writeoptions); + + if !self.db.corked() { + self.db.flush()?; + } + + Ok(result?) + } + + fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> { + let mut readoptions = rust_rocksdb::ReadOptions::default(); + readoptions.set_total_order_seek(true); + + Box::new( + self.db + .rocks + .iterator_cf_opt(&self.cf(), readoptions, rust_rocksdb::IteratorMode::Start) + .map(Result::unwrap) + .map(|(k, v)| (Vec::from(k), Vec::from(v))), + ) + } + + fn iter_from<'a>(&'a self, from: &[u8], backwards: bool) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> { + let mut readoptions = rust_rocksdb::ReadOptions::default(); + readoptions.set_total_order_seek(true); + + Box::new( + self.db + .rocks + .iterator_cf_opt( + &self.cf(), + readoptions, + rust_rocksdb::IteratorMode::From( + from, + if backwards { + rust_rocksdb::Direction::Reverse + } else { + rust_rocksdb::Direction::Forward + }, + ), + ) + .map(Result::unwrap) + .map(|(k, v)| (Vec::from(k), Vec::from(v))), + ) + } + + fn increment(&self, key: &[u8]) -> Result<Vec<u8>> { + let mut readoptions = rust_rocksdb::ReadOptions::default(); + readoptions.set_total_order_seek(true); + let writeoptions = rust_rocksdb::WriteOptions::default(); + + let old = self.db.rocks.get_cf_opt(&self.cf(), key, &readoptions)?; + let new = utils::increment(old.as_deref()); + self.db + .rocks + .put_cf_opt(&self.cf(), key, &new, &writeoptions)?; + + if !self.db.corked() { + self.db.flush()?; + } + + Ok(new) + } + + fn increment_batch(&self, iter: &mut dyn Iterator<Item = Vec<u8>>) -> Result<()> { + let mut readoptions = rust_rocksdb::ReadOptions::default(); + readoptions.set_total_order_seek(true); + let writeoptions = rust_rocksdb::WriteOptions::default(); + + let mut batch = WriteBatchWithTransaction::<false>::default(); + + for key in iter { + let old = self.db.rocks.get_cf_opt(&self.cf(), &key, &readoptions)?; + let new = utils::increment(old.as_deref()); + batch.put_cf(&self.cf(), key, new); + } + + self.db.rocks.write_opt(batch, &writeoptions)?; + + if !self.db.corked() { + self.db.flush()?; + } + + Ok(()) + } + + fn scan_prefix<'a>(&'a self, prefix: Vec<u8>) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> { + let mut readoptions = rust_rocksdb::ReadOptions::default(); + readoptions.set_total_order_seek(true); + + Box::new( + self.db + .rocks + .iterator_cf_opt( + &self.cf(), + readoptions, + rust_rocksdb::IteratorMode::From(&prefix, rust_rocksdb::Direction::Forward), + ) + .map(Result::unwrap) + .map(|(k, v)| (Vec::from(k), Vec::from(v))) + .take_while(move |(k, _)| k.starts_with(&prefix)), + ) + } + + fn watch_prefix<'a>(&'a self, prefix: &[u8]) -> Pin<Box<dyn Future<Output = ()> + Send + 'a>> { + self.watchers.watch(prefix) + } +} diff --git a/src/database/rocksdb/mod.rs b/src/database/rocksdb/mod.rs index 27c6c9624..604930419 100644 --- a/src/database/rocksdb/mod.rs +++ b/src/database/rocksdb/mod.rs @@ -1,20 +1,22 @@ -use std::{ - future::Future, - pin::Pin, - sync::{atomic::AtomicU32, Arc}, -}; +use std::sync::{atomic::AtomicU32, Arc}; use chrono::{DateTime, Utc}; use rust_rocksdb::{ backup::{BackupEngine, BackupEngineOptions}, - DBWithThreadMode as Db, - LogLevel::{Debug, Error, Fatal, Info, Warn}, - MultiThreaded, WriteBatchWithTransaction, + DBWithThreadMode as Db, MultiThreaded, }; use tracing::{debug, error, info, warn}; use super::{super::Config, watchers::Watchers, KeyValueDatabaseEngine, KvTree}; -use crate::{utils, Result}; +use crate::Result; + +pub(crate) mod kvtree; +pub(crate) mod opts; + +use kvtree::RocksDbEngineTree; +use opts::{cf_options, db_options}; + +use super::watchers; pub(crate) struct Engine { rocks: Db<MultiThreaded>, @@ -27,113 +29,6 @@ pub(crate) struct Engine { corks: AtomicU32, } -struct RocksDbEngineTree<'a> { - db: Arc<Engine>, - name: &'a str, - watchers: Watchers, -} - -fn db_options( - config: &Config, env: &rust_rocksdb::Env, row_cache: &rust_rocksdb::Cache, col_cache: &rust_rocksdb::Cache, -) -> rust_rocksdb::Options { - // database options: https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html# - let mut db_opts = rust_rocksdb::Options::default(); - - // Logging - let rocksdb_log_level = match config.rocksdb_log_level.as_ref() { - "debug" => Debug, - "info" => Info, - "warn" => Warn, - "fatal" => Fatal, - _ => Error, - }; - db_opts.set_log_level(rocksdb_log_level); - db_opts.set_max_log_file_size(config.rocksdb_max_log_file_size); - db_opts.set_log_file_time_to_roll(config.rocksdb_log_time_to_roll); - db_opts.set_keep_log_file_num(config.rocksdb_max_log_files); - - // Processing - let threads = if config.rocksdb_parallelism_threads == 0 { - num_cpus::get_physical() // max cores if user specified 0 - } else { - config.rocksdb_parallelism_threads - }; - - db_opts.set_max_background_jobs(threads.try_into().unwrap()); - db_opts.set_max_subcompactions(threads.try_into().unwrap()); - - // IO - db_opts.set_manual_wal_flush(true); - db_opts.set_use_direct_reads(true); - db_opts.set_use_direct_io_for_flush_and_compaction(true); - if config.rocksdb_optimize_for_spinning_disks { - db_opts.set_skip_stats_update_on_db_open(true); // speeds up opening DB on hard - // drives - } - - // Blocks - let mut block_based_options = rust_rocksdb::BlockBasedOptions::default(); - block_based_options.set_block_size(4 * 1024); - block_based_options.set_metadata_block_size(4 * 1024); - block_based_options.set_bloom_filter(9.6, true); - block_based_options.set_optimize_filters_for_memory(true); - block_based_options.set_cache_index_and_filter_blocks(true); - block_based_options.set_pin_top_level_index_and_filter(true); - block_based_options.set_block_cache(col_cache); - db_opts.set_row_cache(row_cache); - - // Buffers - db_opts.set_write_buffer_size(2 * 1024 * 1024); - db_opts.set_max_write_buffer_number(2); - db_opts.set_min_write_buffer_number(1); - - // Files - db_opts.set_level_zero_file_num_compaction_trigger(1); - db_opts.set_target_file_size_base(64 * 1024 * 1024); - db_opts.set_max_bytes_for_level_base(128 * 1024 * 1024); - db_opts.set_ttl(14 * 24 * 60 * 60); - - // Compression - let rocksdb_compression_algo = match config.rocksdb_compression_algo.as_ref() { - "zlib" => rust_rocksdb::DBCompressionType::Zlib, - "lz4" => rust_rocksdb::DBCompressionType::Lz4, - "bz2" => rust_rocksdb::DBCompressionType::Bz2, - _ => rust_rocksdb::DBCompressionType::Zstd, - }; - - if config.rocksdb_bottommost_compression { - db_opts.set_bottommost_compression_type(rocksdb_compression_algo); - db_opts.set_bottommost_zstd_max_train_bytes(0, true); - - // -14 w_bits is only read by zlib. - db_opts.set_bottommost_compression_options(-14, config.rocksdb_bottommost_compression_level, 0, 0, true); - } - - // -14 w_bits is only read by zlib. - db_opts.set_compression_options(-14, config.rocksdb_compression_level, 0, 0); - db_opts.set_compression_type(rocksdb_compression_algo); - - // Misc - db_opts.create_if_missing(true); - - // Default: https://github.com/facebook/rocksdb/wiki/WAL-Recovery-Modes#ktoleratecorruptedtailrecords - // - // Unclean shutdowns of a Matrix homeserver are likely to be fine when - // recovered in this manner as it's likely any lost information will be - // restored via federation. - db_opts.set_wal_recovery_mode(match config.rocksdb_recovery_mode { - 0 => rust_rocksdb::DBRecoveryMode::AbsoluteConsistency, - 1 => rust_rocksdb::DBRecoveryMode::TolerateCorruptedTailRecords, - 2 => rust_rocksdb::DBRecoveryMode::PointInTime, - 3 => rust_rocksdb::DBRecoveryMode::SkipAnyCorruptedRecord, - 4_u8..=u8::MAX => unimplemented!(), - }); - - db_opts.set_block_based_table_factory(&block_based_options); - db_opts.set_env(env); - db_opts -} - impl KeyValueDatabaseEngine for Arc<Engine> { fn open(config: &Config) -> Result<Self> { let cache_capacity_bytes = config.db_cache_capacity_mb * 1024.0 * 1024.0; @@ -145,9 +40,6 @@ fn open(config: &Config) -> Result<Self> { let col_cache = rust_rocksdb::Cache::new_lru_cache(col_cache_capacity_bytes); let db_opts = db_options(config, &db_env, &row_cache, &col_cache); - debug!("Listing column families in database"); - let cfs = Db::<MultiThreaded>::list_cf(&db_opts, &config.database_path).unwrap_or_default(); - if config.rocksdb_repair { warn!("Starting database repair. This may take a long time..."); if let Err(e) = Db::<MultiThreaded>::repair(&db_opts, &config.database_path) { @@ -155,21 +47,23 @@ fn open(config: &Config) -> Result<Self> { } } - debug!("Opening {} column family descriptors in database", cfs.len()); - info!("RocksDB database compaction will take place now, a delay in startup is expected"); + debug!("Listing column families in database"); + let cfs = Db::<MultiThreaded>::list_cf(&db_opts, &config.database_path).unwrap_or_default(); + debug!("Opening {} column family descriptors in database", cfs.len()); let cfds = cfs .iter() - .map(|name| rust_rocksdb::ColumnFamilyDescriptor::new(name, db_opts.clone())) + .map(|name| rust_rocksdb::ColumnFamilyDescriptor::new(name, cf_options(name, db_opts.clone(), config))) .collect::<Vec<_>>(); + debug!("Opening database..."); let db = if config.rocksdb_read_only { Db::<MultiThreaded>::open_cf_for_read_only(&db_opts, &config.database_path, cfs.clone(), false)? } else { Db::<MultiThreaded>::open_cf_descriptors(&db_opts, &config.database_path, cfds)? }; - debug!("Opened database at sequence number {}", db.latest_sequence_number()); + info!("Opened database at sequence number {}", db.latest_sequence_number()); Ok(Arc::new(Engine { rocks: db, row_cache, @@ -346,188 +240,3 @@ fn file_list(&self) -> Result<String> { #[allow(dead_code)] fn clear_caches(&self) {} } - -impl RocksDbEngineTree<'_> { - fn cf(&self) -> Arc<rust_rocksdb::BoundColumnFamily<'_>> { self.db.rocks.cf_handle(self.name).unwrap() } -} - -impl KvTree for RocksDbEngineTree<'_> { - fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>> { - let mut readoptions = rust_rocksdb::ReadOptions::default(); - readoptions.set_total_order_seek(true); - - Ok(self.db.rocks.get_cf_opt(&self.cf(), key, &readoptions)?) - } - - fn multi_get( - &self, iter: Vec<(&Arc<rust_rocksdb::BoundColumnFamily<'_>>, Vec<u8>)>, - ) -> Vec<Result<Option<Vec<u8>>, rust_rocksdb::Error>> { - let mut readoptions = rust_rocksdb::ReadOptions::default(); - readoptions.set_total_order_seek(true); - - self.db.rocks.multi_get_cf_opt(iter, &readoptions) - } - - fn insert(&self, key: &[u8], value: &[u8]) -> Result<()> { - let writeoptions = rust_rocksdb::WriteOptions::default(); - - self.db - .rocks - .put_cf_opt(&self.cf(), key, value, &writeoptions)?; - - if !self.db.corked() { - self.db.flush()?; - } - - self.watchers.wake(key); - - Ok(()) - } - - fn insert_batch(&self, iter: &mut dyn Iterator<Item = (Vec<u8>, Vec<u8>)>) -> Result<()> { - let writeoptions = rust_rocksdb::WriteOptions::default(); - - let mut batch = WriteBatchWithTransaction::<false>::default(); - - for (key, value) in iter { - batch.put_cf(&self.cf(), key, value); - } - - let result = self.db.rocks.write_opt(batch, &writeoptions); - - if !self.db.corked() { - self.db.flush()?; - } - - Ok(result?) - } - - fn remove(&self, key: &[u8]) -> Result<()> { - let writeoptions = rust_rocksdb::WriteOptions::default(); - - let result = self.db.rocks.delete_cf_opt(&self.cf(), key, &writeoptions); - - if !self.db.corked() { - self.db.flush()?; - } - - Ok(result?) - } - - fn remove_batch(&self, iter: &mut dyn Iterator<Item = Vec<u8>>) -> Result<()> { - let writeoptions = rust_rocksdb::WriteOptions::default(); - - let mut batch = WriteBatchWithTransaction::<false>::default(); - - for key in iter { - batch.delete_cf(&self.cf(), key); - } - - let result = self.db.rocks.write_opt(batch, &writeoptions); - - if !self.db.corked() { - self.db.flush()?; - } - - Ok(result?) - } - - fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> { - let mut readoptions = rust_rocksdb::ReadOptions::default(); - readoptions.set_total_order_seek(true); - - Box::new( - self.db - .rocks - .iterator_cf_opt(&self.cf(), readoptions, rust_rocksdb::IteratorMode::Start) - .map(Result::unwrap) - .map(|(k, v)| (Vec::from(k), Vec::from(v))), - ) - } - - fn iter_from<'a>(&'a self, from: &[u8], backwards: bool) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> { - let mut readoptions = rust_rocksdb::ReadOptions::default(); - readoptions.set_total_order_seek(true); - - Box::new( - self.db - .rocks - .iterator_cf_opt( - &self.cf(), - readoptions, - rust_rocksdb::IteratorMode::From( - from, - if backwards { - rust_rocksdb::Direction::Reverse - } else { - rust_rocksdb::Direction::Forward - }, - ), - ) - .map(Result::unwrap) - .map(|(k, v)| (Vec::from(k), Vec::from(v))), - ) - } - - fn increment(&self, key: &[u8]) -> Result<Vec<u8>> { - let mut readoptions = rust_rocksdb::ReadOptions::default(); - readoptions.set_total_order_seek(true); - let writeoptions = rust_rocksdb::WriteOptions::default(); - - let old = self.db.rocks.get_cf_opt(&self.cf(), key, &readoptions)?; - let new = utils::increment(old.as_deref()); - self.db - .rocks - .put_cf_opt(&self.cf(), key, &new, &writeoptions)?; - - if !self.db.corked() { - self.db.flush()?; - } - - Ok(new) - } - - fn increment_batch(&self, iter: &mut dyn Iterator<Item = Vec<u8>>) -> Result<()> { - let mut readoptions = rust_rocksdb::ReadOptions::default(); - readoptions.set_total_order_seek(true); - let writeoptions = rust_rocksdb::WriteOptions::default(); - - let mut batch = WriteBatchWithTransaction::<false>::default(); - - for key in iter { - let old = self.db.rocks.get_cf_opt(&self.cf(), &key, &readoptions)?; - let new = utils::increment(old.as_deref()); - batch.put_cf(&self.cf(), key, new); - } - - self.db.rocks.write_opt(batch, &writeoptions)?; - - if !self.db.corked() { - self.db.flush()?; - } - - Ok(()) - } - - fn scan_prefix<'a>(&'a self, prefix: Vec<u8>) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> { - let mut readoptions = rust_rocksdb::ReadOptions::default(); - readoptions.set_total_order_seek(true); - - Box::new( - self.db - .rocks - .iterator_cf_opt( - &self.cf(), - readoptions, - rust_rocksdb::IteratorMode::From(&prefix, rust_rocksdb::Direction::Forward), - ) - .map(Result::unwrap) - .map(|(k, v)| (Vec::from(k), Vec::from(v))) - .take_while(move |(k, _)| k.starts_with(&prefix)), - ) - } - - fn watch_prefix<'a>(&'a self, prefix: &[u8]) -> Pin<Box<dyn Future<Output = ()> + Send + 'a>> { - self.watchers.watch(prefix) - } -} diff --git a/src/database/rocksdb/opts.rs b/src/database/rocksdb/opts.rs new file mode 100644 index 000000000..8f81a7d5e --- /dev/null +++ b/src/database/rocksdb/opts.rs @@ -0,0 +1,235 @@ +#![allow(dead_code)] + +use rust_rocksdb::{ + BlockBasedOptions, Cache, DBCompactionStyle, DBCompressionType, DBRecoveryMode, Env, LogLevel, Options, + UniversalCompactOptions, UniversalCompactionStopStyle, +}; + +use super::Config; + +/// Create database-wide options suitable for opening the database. This also +/// sets our default column options in case of opening a column with the same +/// resulting value. Note that we require special per-column options on some +/// columns, therefor columns should only be opened after passing this result +/// through cf_options(). +pub(crate) fn db_options(config: &Config, env: &Env, row_cache: &Cache, col_cache: &Cache) -> Options { + let mut opts = Options::default(); + + // Logging + set_logging_defaults(&mut opts, config); + + // Processing + let threads = if config.rocksdb_parallelism_threads == 0 { + num_cpus::get_physical() // max cores if user specified 0 + } else { + config.rocksdb_parallelism_threads + }; + + opts.set_max_background_jobs(threads.try_into().unwrap()); + opts.set_max_subcompactions(threads.try_into().unwrap()); + opts.set_max_file_opening_threads(0); + + // IO + opts.set_manual_wal_flush(true); + opts.set_use_direct_reads(true); + opts.set_use_direct_io_for_flush_and_compaction(true); + if config.rocksdb_optimize_for_spinning_disks { + // speeds up opening DB on hard drives + opts.set_skip_checking_sst_file_sizes_on_db_open(true); + opts.set_skip_stats_update_on_db_open(true); + //opts.set_max_file_opening_threads(threads.try_into().unwrap()); + } + + // Blocks + let mut table_opts = table_options(config); + table_opts.set_block_cache(col_cache); + opts.set_row_cache(row_cache); + + // Buffers + opts.set_write_buffer_size(2 * 1024 * 1024); + opts.set_max_write_buffer_number(2); + opts.set_min_write_buffer_number(1); + + // Files + opts.set_max_total_wal_size(96 * 1024 * 1024); + opts.set_level_zero_file_num_compaction_trigger(2); + set_level_defaults(&mut opts, config); + opts.set_ttl(14 * 24 * 60 * 60); + + // Compression + set_compression_defaults(&mut opts, config); + + // Misc + opts.create_if_missing(true); + + // Default: https://github.com/facebook/rocksdb/wiki/WAL-Recovery-Modes#ktoleratecorruptedtailrecords + // + // Unclean shutdowns of a Matrix homeserver are likely to be fine when + // recovered in this manner as it's likely any lost information will be + // restored via federation. + opts.set_wal_recovery_mode(match config.rocksdb_recovery_mode { + 0 => DBRecoveryMode::AbsoluteConsistency, + 1 => DBRecoveryMode::TolerateCorruptedTailRecords, + 2 => DBRecoveryMode::PointInTime, + 3 => DBRecoveryMode::SkipAnyCorruptedRecord, + 4_u8..=u8::MAX => unimplemented!(), + }); + + opts.set_block_based_table_factory(&table_opts); + opts.set_env(env); + opts +} + +/// Adjust options for the specific column by name. Provide the result of +/// db_options() as the argument to this function and use the return value in +/// the arguments to open the specific column. +pub(crate) fn cf_options(name: &str, mut opts: Options, config: &Config) -> Options { + match name { + "backupid_algorithm" + | "backupid_etag" + | "backupkeyid_backup" + | "roomid_shortroomid" + | "shorteventid_shortstatehash" + | "shorteventid_eventid" + | "shortstatekey_statekey" + | "shortstatehash_statediff" + | "userdevicetxnid_response" + | "userfilterid_filter" => set_for_sequential_small_uc(&mut opts, config), + &_ => {}, + } + + opts +} + +fn set_logging_defaults(opts: &mut Options, config: &Config) { + let rocksdb_log_level = match config.rocksdb_log_level.as_ref() { + "debug" => LogLevel::Debug, + "info" => LogLevel::Info, + "warn" => LogLevel::Warn, + "fatal" => LogLevel::Fatal, + _ => LogLevel::Error, + }; + + opts.set_log_level(rocksdb_log_level); + opts.set_max_log_file_size(config.rocksdb_max_log_file_size); + opts.set_log_file_time_to_roll(config.rocksdb_log_time_to_roll); + opts.set_keep_log_file_num(config.rocksdb_max_log_files); + opts.set_stats_dump_period_sec(0); +} + +fn set_compression_defaults(opts: &mut Options, config: &Config) { + let rocksdb_compression_algo = match config.rocksdb_compression_algo.as_ref() { + "zlib" => DBCompressionType::Zlib, + "lz4" => DBCompressionType::Lz4, + "bz2" => DBCompressionType::Bz2, + _ => DBCompressionType::Zstd, + }; + + if config.rocksdb_bottommost_compression { + opts.set_bottommost_compression_type(rocksdb_compression_algo); + opts.set_bottommost_zstd_max_train_bytes(0, true); + + // -14 w_bits is only read by zlib. + opts.set_bottommost_compression_options(-14, config.rocksdb_bottommost_compression_level, 0, 0, true); + } + + // -14 w_bits is only read by zlib. + opts.set_compression_options(-14, config.rocksdb_compression_level, 0, 0); + opts.set_compression_type(rocksdb_compression_algo); +} + +fn set_for_random_small_uc(opts: &mut Options, config: &Config) { + let uco = uc_options(config); + set_for_random_small(opts, config); + opts.set_universal_compaction_options(&uco); + opts.set_compaction_style(DBCompactionStyle::Universal); + opts.set_level_zero_file_num_compaction_trigger(1); +} + +fn set_for_sequential_small_uc(opts: &mut Options, config: &Config) { + let uco = uc_options(config); + set_for_sequential_small(opts, config); + opts.set_universal_compaction_options(&uco); + opts.set_compaction_style(DBCompactionStyle::Universal); + opts.set_level_zero_file_num_compaction_trigger(1); +} + +fn set_for_random_small(opts: &mut Options, config: &Config) { + set_for_random(opts, config); + + opts.set_write_buffer_size(1024 * 1024); + opts.set_target_file_size_base(65536); + opts.set_max_bytes_for_level_base(131072); +} + +fn set_for_sequential_small(opts: &mut Options, config: &Config) { + set_for_random(opts, config); + + opts.set_write_buffer_size(1024 * 1024); + opts.set_target_file_size_base(65536); + opts.set_max_bytes_for_level_base(131072); +} + +fn set_for_random(opts: &mut Options, config: &Config) { + set_level_defaults(opts, config); + + let pri = "compaction_pri=kOldestSmallestSeqFirst"; + opts.set_options_from_string(pri) + .expect("set compaction priority string"); + + opts.set_max_bytes_for_level_base(8 * 1024 * 1024); + opts.set_max_bytes_for_level_multiplier(1.0); + opts.set_max_bytes_for_level_multiplier_additional(&[0, 1, 1, 3, 7, 15, 31]); +} + +fn set_for_sequential(opts: &mut Options, config: &Config) { + set_level_defaults(opts, config); + + let pri = "compaction_pri=kOldestLargestSeqFirst"; + opts.set_options_from_string(pri) + .expect("set compaction priority string"); + + opts.set_target_file_size_base(2 * 1024 * 1024); + opts.set_target_file_size_multiplier(2); + + opts.set_max_bytes_for_level_base(32 * 1024 * 1024); + opts.set_max_bytes_for_level_multiplier(1.0); + opts.set_max_bytes_for_level_multiplier_additional(&[0, 1, 1, 3, 7, 15, 31]); +} + +fn set_level_defaults(opts: &mut Options, _config: &Config) { + opts.set_target_file_size_base(1024 * 1024); + opts.set_target_file_size_multiplier(2); + + opts.set_level_compaction_dynamic_level_bytes(false); + opts.set_max_bytes_for_level_base(8 * 1024 * 1024); + opts.set_max_bytes_for_level_multiplier(2.0); +} + +fn uc_options(_config: &Config) -> UniversalCompactOptions { + let mut opts = UniversalCompactOptions::default(); + + opts.set_stop_style(UniversalCompactionStopStyle::Total); + opts.set_max_size_amplification_percent(10000); + opts.set_compression_size_percent(-1); + opts.set_size_ratio(1); + + opts.set_min_merge_width(2); + opts.set_max_merge_width(16); + + opts +} + +fn table_options(_config: &Config) -> BlockBasedOptions { + let mut opts = BlockBasedOptions::default(); + + opts.set_block_size(4 * 1024); + opts.set_metadata_block_size(4 * 1024); + + opts.set_bloom_filter(9.6, true); + opts.set_optimize_filters_for_memory(true); + opts.set_cache_index_and_filter_blocks(true); + opts.set_pin_top_level_index_and_filter(true); + + opts +} -- GitLab