From a83da4f17bd68e92115c6584e816ab1aa53667aa Mon Sep 17 00:00:00 2001
From: Jason Volk <jason@zemos.net>
Date: Sat, 6 Apr 2024 08:48:41 -0700
Subject: [PATCH] refactor rocksdb opts; split kvtree

Signed-off-by: Jason Volk <jason@zemos.net>
---
 src/database/rocksdb/kvtree.rs | 197 ++++++++++++++++++++
 src/database/rocksdb/mod.rs    | 325 ++-------------------------------
 src/database/rocksdb/opts.rs   | 235 ++++++++++++++++++++++++
 3 files changed, 449 insertions(+), 308 deletions(-)
 create mode 100644 src/database/rocksdb/kvtree.rs
 create mode 100644 src/database/rocksdb/opts.rs

diff --git a/src/database/rocksdb/kvtree.rs b/src/database/rocksdb/kvtree.rs
new file mode 100644
index 000000000..4b30b552e
--- /dev/null
+++ b/src/database/rocksdb/kvtree.rs
@@ -0,0 +1,197 @@
+use std::{future::Future, pin::Pin, sync::Arc};
+
+use rust_rocksdb::WriteBatchWithTransaction;
+
+use super::{watchers::Watchers, Engine, KeyValueDatabaseEngine, KvTree};
+use crate::{utils, Result};
+
+pub(super) struct RocksDbEngineTree<'a> {
+	pub db: Arc<Engine>,
+	pub name: &'a str,
+	pub watchers: Watchers,
+}
+
+impl RocksDbEngineTree<'_> {
+	fn cf(&self) -> Arc<rust_rocksdb::BoundColumnFamily<'_>> { self.db.rocks.cf_handle(self.name).unwrap() }
+}
+
+impl KvTree for RocksDbEngineTree<'_> {
+	fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
+		let mut readoptions = rust_rocksdb::ReadOptions::default();
+		readoptions.set_total_order_seek(true);
+
+		Ok(self.db.rocks.get_cf_opt(&self.cf(), key, &readoptions)?)
+	}
+
+	fn multi_get(
+		&self, iter: Vec<(&Arc<rust_rocksdb::BoundColumnFamily<'_>>, Vec<u8>)>,
+	) -> Vec<Result<Option<Vec<u8>>, rust_rocksdb::Error>> {
+		let mut readoptions = rust_rocksdb::ReadOptions::default();
+		readoptions.set_total_order_seek(true);
+
+		self.db.rocks.multi_get_cf_opt(iter, &readoptions)
+	}
+
+	fn insert(&self, key: &[u8], value: &[u8]) -> Result<()> {
+		let writeoptions = rust_rocksdb::WriteOptions::default();
+
+		self.db
+			.rocks
+			.put_cf_opt(&self.cf(), key, value, &writeoptions)?;
+
+		if !self.db.corked() {
+			self.db.flush()?;
+		}
+
+		self.watchers.wake(key);
+
+		Ok(())
+	}
+
+	fn insert_batch(&self, iter: &mut dyn Iterator<Item = (Vec<u8>, Vec<u8>)>) -> Result<()> {
+		let writeoptions = rust_rocksdb::WriteOptions::default();
+
+		let mut batch = WriteBatchWithTransaction::<false>::default();
+
+		for (key, value) in iter {
+			batch.put_cf(&self.cf(), key, value);
+		}
+
+		let result = self.db.rocks.write_opt(batch, &writeoptions);
+
+		if !self.db.corked() {
+			self.db.flush()?;
+		}
+
+		Ok(result?)
+	}
+
+	fn remove(&self, key: &[u8]) -> Result<()> {
+		let writeoptions = rust_rocksdb::WriteOptions::default();
+
+		let result = self.db.rocks.delete_cf_opt(&self.cf(), key, &writeoptions);
+
+		if !self.db.corked() {
+			self.db.flush()?;
+		}
+
+		Ok(result?)
+	}
+
+	fn remove_batch(&self, iter: &mut dyn Iterator<Item = Vec<u8>>) -> Result<()> {
+		let writeoptions = rust_rocksdb::WriteOptions::default();
+
+		let mut batch = WriteBatchWithTransaction::<false>::default();
+
+		for key in iter {
+			batch.delete_cf(&self.cf(), key);
+		}
+
+		let result = self.db.rocks.write_opt(batch, &writeoptions);
+
+		if !self.db.corked() {
+			self.db.flush()?;
+		}
+
+		Ok(result?)
+	}
+
+	fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> {
+		let mut readoptions = rust_rocksdb::ReadOptions::default();
+		readoptions.set_total_order_seek(true);
+
+		Box::new(
+			self.db
+				.rocks
+				.iterator_cf_opt(&self.cf(), readoptions, rust_rocksdb::IteratorMode::Start)
+				.map(Result::unwrap)
+				.map(|(k, v)| (Vec::from(k), Vec::from(v))),
+		)
+	}
+
+	fn iter_from<'a>(&'a self, from: &[u8], backwards: bool) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> {
+		let mut readoptions = rust_rocksdb::ReadOptions::default();
+		readoptions.set_total_order_seek(true);
+
+		Box::new(
+			self.db
+				.rocks
+				.iterator_cf_opt(
+					&self.cf(),
+					readoptions,
+					rust_rocksdb::IteratorMode::From(
+						from,
+						if backwards {
+							rust_rocksdb::Direction::Reverse
+						} else {
+							rust_rocksdb::Direction::Forward
+						},
+					),
+				)
+				.map(Result::unwrap)
+				.map(|(k, v)| (Vec::from(k), Vec::from(v))),
+		)
+	}
+
+	fn increment(&self, key: &[u8]) -> Result<Vec<u8>> {
+		let mut readoptions = rust_rocksdb::ReadOptions::default();
+		readoptions.set_total_order_seek(true);
+		let writeoptions = rust_rocksdb::WriteOptions::default();
+
+		let old = self.db.rocks.get_cf_opt(&self.cf(), key, &readoptions)?;
+		let new = utils::increment(old.as_deref());
+		self.db
+			.rocks
+			.put_cf_opt(&self.cf(), key, &new, &writeoptions)?;
+
+		if !self.db.corked() {
+			self.db.flush()?;
+		}
+
+		Ok(new)
+	}
+
+	fn increment_batch(&self, iter: &mut dyn Iterator<Item = Vec<u8>>) -> Result<()> {
+		let mut readoptions = rust_rocksdb::ReadOptions::default();
+		readoptions.set_total_order_seek(true);
+		let writeoptions = rust_rocksdb::WriteOptions::default();
+
+		let mut batch = WriteBatchWithTransaction::<false>::default();
+
+		for key in iter {
+			let old = self.db.rocks.get_cf_opt(&self.cf(), &key, &readoptions)?;
+			let new = utils::increment(old.as_deref());
+			batch.put_cf(&self.cf(), key, new);
+		}
+
+		self.db.rocks.write_opt(batch, &writeoptions)?;
+
+		if !self.db.corked() {
+			self.db.flush()?;
+		}
+
+		Ok(())
+	}
+
+	fn scan_prefix<'a>(&'a self, prefix: Vec<u8>) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> {
+		let mut readoptions = rust_rocksdb::ReadOptions::default();
+		readoptions.set_total_order_seek(true);
+
+		Box::new(
+			self.db
+				.rocks
+				.iterator_cf_opt(
+					&self.cf(),
+					readoptions,
+					rust_rocksdb::IteratorMode::From(&prefix, rust_rocksdb::Direction::Forward),
+				)
+				.map(Result::unwrap)
+				.map(|(k, v)| (Vec::from(k), Vec::from(v)))
+				.take_while(move |(k, _)| k.starts_with(&prefix)),
+		)
+	}
+
+	fn watch_prefix<'a>(&'a self, prefix: &[u8]) -> Pin<Box<dyn Future<Output = ()> + Send + 'a>> {
+		self.watchers.watch(prefix)
+	}
+}
diff --git a/src/database/rocksdb/mod.rs b/src/database/rocksdb/mod.rs
index 27c6c9624..604930419 100644
--- a/src/database/rocksdb/mod.rs
+++ b/src/database/rocksdb/mod.rs
@@ -1,20 +1,22 @@
-use std::{
-	future::Future,
-	pin::Pin,
-	sync::{atomic::AtomicU32, Arc},
-};
+use std::sync::{atomic::AtomicU32, Arc};
 
 use chrono::{DateTime, Utc};
 use rust_rocksdb::{
 	backup::{BackupEngine, BackupEngineOptions},
-	DBWithThreadMode as Db,
-	LogLevel::{Debug, Error, Fatal, Info, Warn},
-	MultiThreaded, WriteBatchWithTransaction,
+	DBWithThreadMode as Db, MultiThreaded,
 };
 use tracing::{debug, error, info, warn};
 
 use super::{super::Config, watchers::Watchers, KeyValueDatabaseEngine, KvTree};
-use crate::{utils, Result};
+use crate::Result;
+
+pub(crate) mod kvtree;
+pub(crate) mod opts;
+
+use kvtree::RocksDbEngineTree;
+use opts::{cf_options, db_options};
+
+use super::watchers;
 
 pub(crate) struct Engine {
 	rocks: Db<MultiThreaded>,
@@ -27,113 +29,6 @@ pub(crate) struct Engine {
 	corks: AtomicU32,
 }
 
-struct RocksDbEngineTree<'a> {
-	db: Arc<Engine>,
-	name: &'a str,
-	watchers: Watchers,
-}
-
-fn db_options(
-	config: &Config, env: &rust_rocksdb::Env, row_cache: &rust_rocksdb::Cache, col_cache: &rust_rocksdb::Cache,
-) -> rust_rocksdb::Options {
-	// database options: https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html#
-	let mut db_opts = rust_rocksdb::Options::default();
-
-	// Logging
-	let rocksdb_log_level = match config.rocksdb_log_level.as_ref() {
-		"debug" => Debug,
-		"info" => Info,
-		"warn" => Warn,
-		"fatal" => Fatal,
-		_ => Error,
-	};
-	db_opts.set_log_level(rocksdb_log_level);
-	db_opts.set_max_log_file_size(config.rocksdb_max_log_file_size);
-	db_opts.set_log_file_time_to_roll(config.rocksdb_log_time_to_roll);
-	db_opts.set_keep_log_file_num(config.rocksdb_max_log_files);
-
-	// Processing
-	let threads = if config.rocksdb_parallelism_threads == 0 {
-		num_cpus::get_physical() // max cores if user specified 0
-	} else {
-		config.rocksdb_parallelism_threads
-	};
-
-	db_opts.set_max_background_jobs(threads.try_into().unwrap());
-	db_opts.set_max_subcompactions(threads.try_into().unwrap());
-
-	// IO
-	db_opts.set_manual_wal_flush(true);
-	db_opts.set_use_direct_reads(true);
-	db_opts.set_use_direct_io_for_flush_and_compaction(true);
-	if config.rocksdb_optimize_for_spinning_disks {
-		db_opts.set_skip_stats_update_on_db_open(true); // speeds up opening DB on hard
-		                                        // drives
-	}
-
-	// Blocks
-	let mut block_based_options = rust_rocksdb::BlockBasedOptions::default();
-	block_based_options.set_block_size(4 * 1024);
-	block_based_options.set_metadata_block_size(4 * 1024);
-	block_based_options.set_bloom_filter(9.6, true);
-	block_based_options.set_optimize_filters_for_memory(true);
-	block_based_options.set_cache_index_and_filter_blocks(true);
-	block_based_options.set_pin_top_level_index_and_filter(true);
-	block_based_options.set_block_cache(col_cache);
-	db_opts.set_row_cache(row_cache);
-
-	// Buffers
-	db_opts.set_write_buffer_size(2 * 1024 * 1024);
-	db_opts.set_max_write_buffer_number(2);
-	db_opts.set_min_write_buffer_number(1);
-
-	// Files
-	db_opts.set_level_zero_file_num_compaction_trigger(1);
-	db_opts.set_target_file_size_base(64 * 1024 * 1024);
-	db_opts.set_max_bytes_for_level_base(128 * 1024 * 1024);
-	db_opts.set_ttl(14 * 24 * 60 * 60);
-
-	// Compression
-	let rocksdb_compression_algo = match config.rocksdb_compression_algo.as_ref() {
-		"zlib" => rust_rocksdb::DBCompressionType::Zlib,
-		"lz4" => rust_rocksdb::DBCompressionType::Lz4,
-		"bz2" => rust_rocksdb::DBCompressionType::Bz2,
-		_ => rust_rocksdb::DBCompressionType::Zstd,
-	};
-
-	if config.rocksdb_bottommost_compression {
-		db_opts.set_bottommost_compression_type(rocksdb_compression_algo);
-		db_opts.set_bottommost_zstd_max_train_bytes(0, true);
-
-		// -14 w_bits is only read by zlib.
-		db_opts.set_bottommost_compression_options(-14, config.rocksdb_bottommost_compression_level, 0, 0, true);
-	}
-
-	// -14 w_bits is only read by zlib.
-	db_opts.set_compression_options(-14, config.rocksdb_compression_level, 0, 0);
-	db_opts.set_compression_type(rocksdb_compression_algo);
-
-	// Misc
-	db_opts.create_if_missing(true);
-
-	// Default: https://github.com/facebook/rocksdb/wiki/WAL-Recovery-Modes#ktoleratecorruptedtailrecords
-	//
-	// Unclean shutdowns of a Matrix homeserver are likely to be fine when
-	// recovered in this manner as it's likely any lost information will be
-	// restored via federation.
-	db_opts.set_wal_recovery_mode(match config.rocksdb_recovery_mode {
-		0 => rust_rocksdb::DBRecoveryMode::AbsoluteConsistency,
-		1 => rust_rocksdb::DBRecoveryMode::TolerateCorruptedTailRecords,
-		2 => rust_rocksdb::DBRecoveryMode::PointInTime,
-		3 => rust_rocksdb::DBRecoveryMode::SkipAnyCorruptedRecord,
-		4_u8..=u8::MAX => unimplemented!(),
-	});
-
-	db_opts.set_block_based_table_factory(&block_based_options);
-	db_opts.set_env(env);
-	db_opts
-}
-
 impl KeyValueDatabaseEngine for Arc<Engine> {
 	fn open(config: &Config) -> Result<Self> {
 		let cache_capacity_bytes = config.db_cache_capacity_mb * 1024.0 * 1024.0;
@@ -145,9 +40,6 @@ fn open(config: &Config) -> Result<Self> {
 		let col_cache = rust_rocksdb::Cache::new_lru_cache(col_cache_capacity_bytes);
 		let db_opts = db_options(config, &db_env, &row_cache, &col_cache);
 
-		debug!("Listing column families in database");
-		let cfs = Db::<MultiThreaded>::list_cf(&db_opts, &config.database_path).unwrap_or_default();
-
 		if config.rocksdb_repair {
 			warn!("Starting database repair. This may take a long time...");
 			if let Err(e) = Db::<MultiThreaded>::repair(&db_opts, &config.database_path) {
@@ -155,21 +47,23 @@ fn open(config: &Config) -> Result<Self> {
 			}
 		}
 
-		debug!("Opening {} column family descriptors in database", cfs.len());
-		info!("RocksDB database compaction will take place now, a delay in startup is expected");
+		debug!("Listing column families in database");
+		let cfs = Db::<MultiThreaded>::list_cf(&db_opts, &config.database_path).unwrap_or_default();
 
+		debug!("Opening {} column family descriptors in database", cfs.len());
 		let cfds = cfs
 			.iter()
-			.map(|name| rust_rocksdb::ColumnFamilyDescriptor::new(name, db_opts.clone()))
+			.map(|name| rust_rocksdb::ColumnFamilyDescriptor::new(name, cf_options(name, db_opts.clone(), config)))
 			.collect::<Vec<_>>();
 
+		debug!("Opening database...");
 		let db = if config.rocksdb_read_only {
 			Db::<MultiThreaded>::open_cf_for_read_only(&db_opts, &config.database_path, cfs.clone(), false)?
 		} else {
 			Db::<MultiThreaded>::open_cf_descriptors(&db_opts, &config.database_path, cfds)?
 		};
 
-		debug!("Opened database at sequence number {}", db.latest_sequence_number());
+		info!("Opened database at sequence number {}", db.latest_sequence_number());
 		Ok(Arc::new(Engine {
 			rocks: db,
 			row_cache,
@@ -346,188 +240,3 @@ fn file_list(&self) -> Result<String> {
 	#[allow(dead_code)]
 	fn clear_caches(&self) {}
 }
-
-impl RocksDbEngineTree<'_> {
-	fn cf(&self) -> Arc<rust_rocksdb::BoundColumnFamily<'_>> { self.db.rocks.cf_handle(self.name).unwrap() }
-}
-
-impl KvTree for RocksDbEngineTree<'_> {
-	fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
-		let mut readoptions = rust_rocksdb::ReadOptions::default();
-		readoptions.set_total_order_seek(true);
-
-		Ok(self.db.rocks.get_cf_opt(&self.cf(), key, &readoptions)?)
-	}
-
-	fn multi_get(
-		&self, iter: Vec<(&Arc<rust_rocksdb::BoundColumnFamily<'_>>, Vec<u8>)>,
-	) -> Vec<Result<Option<Vec<u8>>, rust_rocksdb::Error>> {
-		let mut readoptions = rust_rocksdb::ReadOptions::default();
-		readoptions.set_total_order_seek(true);
-
-		self.db.rocks.multi_get_cf_opt(iter, &readoptions)
-	}
-
-	fn insert(&self, key: &[u8], value: &[u8]) -> Result<()> {
-		let writeoptions = rust_rocksdb::WriteOptions::default();
-
-		self.db
-			.rocks
-			.put_cf_opt(&self.cf(), key, value, &writeoptions)?;
-
-		if !self.db.corked() {
-			self.db.flush()?;
-		}
-
-		self.watchers.wake(key);
-
-		Ok(())
-	}
-
-	fn insert_batch(&self, iter: &mut dyn Iterator<Item = (Vec<u8>, Vec<u8>)>) -> Result<()> {
-		let writeoptions = rust_rocksdb::WriteOptions::default();
-
-		let mut batch = WriteBatchWithTransaction::<false>::default();
-
-		for (key, value) in iter {
-			batch.put_cf(&self.cf(), key, value);
-		}
-
-		let result = self.db.rocks.write_opt(batch, &writeoptions);
-
-		if !self.db.corked() {
-			self.db.flush()?;
-		}
-
-		Ok(result?)
-	}
-
-	fn remove(&self, key: &[u8]) -> Result<()> {
-		let writeoptions = rust_rocksdb::WriteOptions::default();
-
-		let result = self.db.rocks.delete_cf_opt(&self.cf(), key, &writeoptions);
-
-		if !self.db.corked() {
-			self.db.flush()?;
-		}
-
-		Ok(result?)
-	}
-
-	fn remove_batch(&self, iter: &mut dyn Iterator<Item = Vec<u8>>) -> Result<()> {
-		let writeoptions = rust_rocksdb::WriteOptions::default();
-
-		let mut batch = WriteBatchWithTransaction::<false>::default();
-
-		for key in iter {
-			batch.delete_cf(&self.cf(), key);
-		}
-
-		let result = self.db.rocks.write_opt(batch, &writeoptions);
-
-		if !self.db.corked() {
-			self.db.flush()?;
-		}
-
-		Ok(result?)
-	}
-
-	fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> {
-		let mut readoptions = rust_rocksdb::ReadOptions::default();
-		readoptions.set_total_order_seek(true);
-
-		Box::new(
-			self.db
-				.rocks
-				.iterator_cf_opt(&self.cf(), readoptions, rust_rocksdb::IteratorMode::Start)
-				.map(Result::unwrap)
-				.map(|(k, v)| (Vec::from(k), Vec::from(v))),
-		)
-	}
-
-	fn iter_from<'a>(&'a self, from: &[u8], backwards: bool) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> {
-		let mut readoptions = rust_rocksdb::ReadOptions::default();
-		readoptions.set_total_order_seek(true);
-
-		Box::new(
-			self.db
-				.rocks
-				.iterator_cf_opt(
-					&self.cf(),
-					readoptions,
-					rust_rocksdb::IteratorMode::From(
-						from,
-						if backwards {
-							rust_rocksdb::Direction::Reverse
-						} else {
-							rust_rocksdb::Direction::Forward
-						},
-					),
-				)
-				.map(Result::unwrap)
-				.map(|(k, v)| (Vec::from(k), Vec::from(v))),
-		)
-	}
-
-	fn increment(&self, key: &[u8]) -> Result<Vec<u8>> {
-		let mut readoptions = rust_rocksdb::ReadOptions::default();
-		readoptions.set_total_order_seek(true);
-		let writeoptions = rust_rocksdb::WriteOptions::default();
-
-		let old = self.db.rocks.get_cf_opt(&self.cf(), key, &readoptions)?;
-		let new = utils::increment(old.as_deref());
-		self.db
-			.rocks
-			.put_cf_opt(&self.cf(), key, &new, &writeoptions)?;
-
-		if !self.db.corked() {
-			self.db.flush()?;
-		}
-
-		Ok(new)
-	}
-
-	fn increment_batch(&self, iter: &mut dyn Iterator<Item = Vec<u8>>) -> Result<()> {
-		let mut readoptions = rust_rocksdb::ReadOptions::default();
-		readoptions.set_total_order_seek(true);
-		let writeoptions = rust_rocksdb::WriteOptions::default();
-
-		let mut batch = WriteBatchWithTransaction::<false>::default();
-
-		for key in iter {
-			let old = self.db.rocks.get_cf_opt(&self.cf(), &key, &readoptions)?;
-			let new = utils::increment(old.as_deref());
-			batch.put_cf(&self.cf(), key, new);
-		}
-
-		self.db.rocks.write_opt(batch, &writeoptions)?;
-
-		if !self.db.corked() {
-			self.db.flush()?;
-		}
-
-		Ok(())
-	}
-
-	fn scan_prefix<'a>(&'a self, prefix: Vec<u8>) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> {
-		let mut readoptions = rust_rocksdb::ReadOptions::default();
-		readoptions.set_total_order_seek(true);
-
-		Box::new(
-			self.db
-				.rocks
-				.iterator_cf_opt(
-					&self.cf(),
-					readoptions,
-					rust_rocksdb::IteratorMode::From(&prefix, rust_rocksdb::Direction::Forward),
-				)
-				.map(Result::unwrap)
-				.map(|(k, v)| (Vec::from(k), Vec::from(v)))
-				.take_while(move |(k, _)| k.starts_with(&prefix)),
-		)
-	}
-
-	fn watch_prefix<'a>(&'a self, prefix: &[u8]) -> Pin<Box<dyn Future<Output = ()> + Send + 'a>> {
-		self.watchers.watch(prefix)
-	}
-}
diff --git a/src/database/rocksdb/opts.rs b/src/database/rocksdb/opts.rs
new file mode 100644
index 000000000..8f81a7d5e
--- /dev/null
+++ b/src/database/rocksdb/opts.rs
@@ -0,0 +1,235 @@
+#![allow(dead_code)]
+
+use rust_rocksdb::{
+	BlockBasedOptions, Cache, DBCompactionStyle, DBCompressionType, DBRecoveryMode, Env, LogLevel, Options,
+	UniversalCompactOptions, UniversalCompactionStopStyle,
+};
+
+use super::Config;
+
+/// Create database-wide options suitable for opening the database. This also
+/// sets our default column options in case of opening a column with the same
+/// resulting value. Note that we require special per-column options on some
+/// columns, therefor columns should only be opened after passing this result
+/// through cf_options().
+pub(crate) fn db_options(config: &Config, env: &Env, row_cache: &Cache, col_cache: &Cache) -> Options {
+	let mut opts = Options::default();
+
+	// Logging
+	set_logging_defaults(&mut opts, config);
+
+	// Processing
+	let threads = if config.rocksdb_parallelism_threads == 0 {
+		num_cpus::get_physical() // max cores if user specified 0
+	} else {
+		config.rocksdb_parallelism_threads
+	};
+
+	opts.set_max_background_jobs(threads.try_into().unwrap());
+	opts.set_max_subcompactions(threads.try_into().unwrap());
+	opts.set_max_file_opening_threads(0);
+
+	// IO
+	opts.set_manual_wal_flush(true);
+	opts.set_use_direct_reads(true);
+	opts.set_use_direct_io_for_flush_and_compaction(true);
+	if config.rocksdb_optimize_for_spinning_disks {
+		// speeds up opening DB on hard drives
+		opts.set_skip_checking_sst_file_sizes_on_db_open(true);
+		opts.set_skip_stats_update_on_db_open(true);
+		//opts.set_max_file_opening_threads(threads.try_into().unwrap());
+	}
+
+	// Blocks
+	let mut table_opts = table_options(config);
+	table_opts.set_block_cache(col_cache);
+	opts.set_row_cache(row_cache);
+
+	// Buffers
+	opts.set_write_buffer_size(2 * 1024 * 1024);
+	opts.set_max_write_buffer_number(2);
+	opts.set_min_write_buffer_number(1);
+
+	// Files
+	opts.set_max_total_wal_size(96 * 1024 * 1024);
+	opts.set_level_zero_file_num_compaction_trigger(2);
+	set_level_defaults(&mut opts, config);
+	opts.set_ttl(14 * 24 * 60 * 60);
+
+	// Compression
+	set_compression_defaults(&mut opts, config);
+
+	// Misc
+	opts.create_if_missing(true);
+
+	// Default: https://github.com/facebook/rocksdb/wiki/WAL-Recovery-Modes#ktoleratecorruptedtailrecords
+	//
+	// Unclean shutdowns of a Matrix homeserver are likely to be fine when
+	// recovered in this manner as it's likely any lost information will be
+	// restored via federation.
+	opts.set_wal_recovery_mode(match config.rocksdb_recovery_mode {
+		0 => DBRecoveryMode::AbsoluteConsistency,
+		1 => DBRecoveryMode::TolerateCorruptedTailRecords,
+		2 => DBRecoveryMode::PointInTime,
+		3 => DBRecoveryMode::SkipAnyCorruptedRecord,
+		4_u8..=u8::MAX => unimplemented!(),
+	});
+
+	opts.set_block_based_table_factory(&table_opts);
+	opts.set_env(env);
+	opts
+}
+
+/// Adjust options for the specific column by name. Provide the result of
+/// db_options() as the argument to this function and use the return value in
+/// the arguments to open the specific column.
+pub(crate) fn cf_options(name: &str, mut opts: Options, config: &Config) -> Options {
+	match name {
+		"backupid_algorithm"
+		| "backupid_etag"
+		| "backupkeyid_backup"
+		| "roomid_shortroomid"
+		| "shorteventid_shortstatehash"
+		| "shorteventid_eventid"
+		| "shortstatekey_statekey"
+		| "shortstatehash_statediff"
+		| "userdevicetxnid_response"
+		| "userfilterid_filter" => set_for_sequential_small_uc(&mut opts, config),
+		&_ => {},
+	}
+
+	opts
+}
+
+fn set_logging_defaults(opts: &mut Options, config: &Config) {
+	let rocksdb_log_level = match config.rocksdb_log_level.as_ref() {
+		"debug" => LogLevel::Debug,
+		"info" => LogLevel::Info,
+		"warn" => LogLevel::Warn,
+		"fatal" => LogLevel::Fatal,
+		_ => LogLevel::Error,
+	};
+
+	opts.set_log_level(rocksdb_log_level);
+	opts.set_max_log_file_size(config.rocksdb_max_log_file_size);
+	opts.set_log_file_time_to_roll(config.rocksdb_log_time_to_roll);
+	opts.set_keep_log_file_num(config.rocksdb_max_log_files);
+	opts.set_stats_dump_period_sec(0);
+}
+
+fn set_compression_defaults(opts: &mut Options, config: &Config) {
+	let rocksdb_compression_algo = match config.rocksdb_compression_algo.as_ref() {
+		"zlib" => DBCompressionType::Zlib,
+		"lz4" => DBCompressionType::Lz4,
+		"bz2" => DBCompressionType::Bz2,
+		_ => DBCompressionType::Zstd,
+	};
+
+	if config.rocksdb_bottommost_compression {
+		opts.set_bottommost_compression_type(rocksdb_compression_algo);
+		opts.set_bottommost_zstd_max_train_bytes(0, true);
+
+		// -14 w_bits is only read by zlib.
+		opts.set_bottommost_compression_options(-14, config.rocksdb_bottommost_compression_level, 0, 0, true);
+	}
+
+	// -14 w_bits is only read by zlib.
+	opts.set_compression_options(-14, config.rocksdb_compression_level, 0, 0);
+	opts.set_compression_type(rocksdb_compression_algo);
+}
+
+fn set_for_random_small_uc(opts: &mut Options, config: &Config) {
+	let uco = uc_options(config);
+	set_for_random_small(opts, config);
+	opts.set_universal_compaction_options(&uco);
+	opts.set_compaction_style(DBCompactionStyle::Universal);
+	opts.set_level_zero_file_num_compaction_trigger(1);
+}
+
+fn set_for_sequential_small_uc(opts: &mut Options, config: &Config) {
+	let uco = uc_options(config);
+	set_for_sequential_small(opts, config);
+	opts.set_universal_compaction_options(&uco);
+	opts.set_compaction_style(DBCompactionStyle::Universal);
+	opts.set_level_zero_file_num_compaction_trigger(1);
+}
+
+fn set_for_random_small(opts: &mut Options, config: &Config) {
+	set_for_random(opts, config);
+
+	opts.set_write_buffer_size(1024 * 1024);
+	opts.set_target_file_size_base(65536);
+	opts.set_max_bytes_for_level_base(131072);
+}
+
+fn set_for_sequential_small(opts: &mut Options, config: &Config) {
+	set_for_random(opts, config);
+
+	opts.set_write_buffer_size(1024 * 1024);
+	opts.set_target_file_size_base(65536);
+	opts.set_max_bytes_for_level_base(131072);
+}
+
+fn set_for_random(opts: &mut Options, config: &Config) {
+	set_level_defaults(opts, config);
+
+	let pri = "compaction_pri=kOldestSmallestSeqFirst";
+	opts.set_options_from_string(pri)
+		.expect("set compaction priority string");
+
+	opts.set_max_bytes_for_level_base(8 * 1024 * 1024);
+	opts.set_max_bytes_for_level_multiplier(1.0);
+	opts.set_max_bytes_for_level_multiplier_additional(&[0, 1, 1, 3, 7, 15, 31]);
+}
+
+fn set_for_sequential(opts: &mut Options, config: &Config) {
+	set_level_defaults(opts, config);
+
+	let pri = "compaction_pri=kOldestLargestSeqFirst";
+	opts.set_options_from_string(pri)
+		.expect("set compaction priority string");
+
+	opts.set_target_file_size_base(2 * 1024 * 1024);
+	opts.set_target_file_size_multiplier(2);
+
+	opts.set_max_bytes_for_level_base(32 * 1024 * 1024);
+	opts.set_max_bytes_for_level_multiplier(1.0);
+	opts.set_max_bytes_for_level_multiplier_additional(&[0, 1, 1, 3, 7, 15, 31]);
+}
+
+fn set_level_defaults(opts: &mut Options, _config: &Config) {
+	opts.set_target_file_size_base(1024 * 1024);
+	opts.set_target_file_size_multiplier(2);
+
+	opts.set_level_compaction_dynamic_level_bytes(false);
+	opts.set_max_bytes_for_level_base(8 * 1024 * 1024);
+	opts.set_max_bytes_for_level_multiplier(2.0);
+}
+
+fn uc_options(_config: &Config) -> UniversalCompactOptions {
+	let mut opts = UniversalCompactOptions::default();
+
+	opts.set_stop_style(UniversalCompactionStopStyle::Total);
+	opts.set_max_size_amplification_percent(10000);
+	opts.set_compression_size_percent(-1);
+	opts.set_size_ratio(1);
+
+	opts.set_min_merge_width(2);
+	opts.set_max_merge_width(16);
+
+	opts
+}
+
+fn table_options(_config: &Config) -> BlockBasedOptions {
+	let mut opts = BlockBasedOptions::default();
+
+	opts.set_block_size(4 * 1024);
+	opts.set_metadata_block_size(4 * 1024);
+
+	opts.set_bloom_filter(9.6, true);
+	opts.set_optimize_filters_for_memory(true);
+	opts.set_cache_index_and_filter_blocks(true);
+	opts.set_pin_top_level_index_and_filter(true);
+
+	opts
+}
-- 
GitLab