use std::{path::PathBuf, sync::Arc};

use tuwunel_core::{
	Server, at, debug,
	debug::INFO_SPAN_LEVEL,
	debug_info, debug_warn, expected, info, is_equal_to,
	utils::{
		BoolExt,
		math::usize_from_f64,
		result::LogDebugErr,
		stream,
		stream::{AMPLIFICATION_LIMIT, WIDTH_LIMIT},
		sys::{
			compute::{available_parallelism, cores_available, is_core_available},
			max_threads, storage,
		},
	},
};

use super::{QUEUE_LIMIT, WORKER_LIMIT};

/// Determine storage hardware capabilities of the system for configuring the
/// shape of the database frontend threadpool.
///
/// Returns a tuple of:
/// - `topology` Vector mapping hardware cores to hardware queues. Systems with
///   fewer queues than cores will see queue IDs repeated. Systems with at
///   least as many queues as cores will usually see a 1:1 association of core
///   IDs to queue IDs. Systems with sparse core assignments will see 0 for
///   core ID positions not available to the process. Systems where detection
///   failed will see a default 1:1 core identity as a best guess maintaining
///   core locality.
/// - `workers` Vector mapping hardware queues to the number of threads to
///   spawn in service of that queue. Systems with fewer queues than cores will
///   set an affinity mask for each thread spanning multiple cores based on the
///   topology. Systems with equal or more hardware queues than cores will set
///   a single affinity for each thread.
/// - `queues` Vector of software mpmc queues to create and the size of each
///   queue. Each index is associated with a thread-pool of workers, which it
///   feeds requests from various tokio tasks. When this queue reaches capacity
///   the tokio task must yield.
#[tracing::instrument(
	level = INFO_SPAN_LEVEL,
	skip_all,
	ret(level = "trace"),
)]
pub(super) fn configure(server: &Arc<Server>) -> (Vec<usize>, Vec<usize>, Vec<usize>) {
	let config = &server.config;
	let num_cores = available_parallelism();

	// Determine the maximum core ID plus one. The total number of cores available
	// to the process may be less on systems with sparse core assignments, but
	// this still serves as an upper bound.
	let cores_max = cores_available()
		.last()
		.unwrap_or(0)
		.saturating_add(1);
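
	// Illustrative case (hypothetical sparse assignment): if the process may
	// only run on cores {0, 1, 4, 5}, last() yields 5 and cores_max is 6, so
	// the topology vector below can index every reachable core ID.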

	// Find the block device and gather all the properties we need.
	let path: PathBuf = config.database_path.clone();
	let device_name = storage::name_from_path(&path)
		.log_debug_err()
		.ok();

	let devices = storage::md_discover(&path);
	let topology_detected = devices.md.is_empty().is_false();
	debug!(?topology_detected, ?device_name, ?devices);

	// The default worker count applies only when we didn't find better
	// information; otherwise it stays None and drops out of the chains below.
	let default_worker_count = topology_detected
		.is_false()
		.then_some(config.db_pool_workers)
		.map(|workers| workers.saturating_mul(num_cores));
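
	// Illustrative arithmetic (hypothetical values): with db_pool_workers = 4
	// and num_cores = 8, an undetected system yields Some(32); a detected
	// system yields None.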

	// Sum the total number of possible tags. When no hardware was detected this
	// defaults to the default_worker_count. Note well that the thread-worker
	// model we use will never approach actual NVMe capacity the way io_uring
	// can, much less userspace drivers. We still take some cues from this
	// value, which does reflect actual request capacity.
	let total_tags = devices
		.md
		.iter()
		.flat_map(|md| md.mq.iter())
		.filter(|mq| mq.cpu_list.iter().copied().any(is_core_available))
		.filter_map(|mq| mq.nr_tags)
		.chain(default_worker_count)
		.fold(0_usize, usize::saturating_add);
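
	// For instance (hypothetical device): two hardware queues advertising
	// nr_tags = 1023 each, both reachable from available cores, sum to
	// total_tags = 2046; queues whose cpu_list contains no available core are
	// excluded.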

	// Determine the CPU affinities of each hardware queue. Each index is a core
	// and each value is the associated hardware queue. On systems which share
	// queues between cores some values will be repeated; on systems with
	// multiple queues per core the affinities are assumed to match and we don't
	// require a vector of vectors. Sparse unavailable cores default to 0.
	// Undetected hardware defaults to the core identity as a best guess.
	let topology: Vec<usize> = devices
		.md
		.iter()
		.flat_map(|md| md.mq.iter())
		.fold(vec![0; cores_max], |mut topology, mq| {
			mq.cpu_list
				.iter()
				.filter(|&&id| id < cores_max)
				.filter(|&&id| is_core_available(id))
				.for_each(|&id| {
					topology[id] = mq.id;
				});

			topology
		})
		.into_iter()
		.enumerate()
		.map(|(core_id, queue_id)| {
			topology_detected
				.then_some(queue_id)
				.unwrap_or(core_id)
		})
		.collect();
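
	// Sketch of the two outcomes (hypothetical 8-core machine): with four
	// detected queues shared pairwise, topology = [0, 0, 1, 1, 2, 2, 3, 3];
	// with nothing detected, the identity fallback gives [0, 1, 2, ..., 7].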

	// Query getrlimit(2) to impose any additional restriction; divide to leave
	// room for other threads in the process.
	let max_threads = max_threads()
		.map(at!(0))
		.unwrap_or(usize::MAX)
		.saturating_div(3);
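
	// Illustrative figure (hypothetical rlimit): a limit of 12288 threads
	// leaves two thirds for the rest of the process and 4096 for this pool.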

	// Determine an ideal max worker count based on true capacity. As stated
	// prior, the true value is rarely attainable in any thread-worker model,
	// so it is clamped.
	let max_workers = devices
		.md
		.iter()
		.flat_map(|md| md.mq.iter())
		.filter_map(|mq| mq.nr_tags)
		.chain(default_worker_count.into_iter())
		.fold(0_usize, usize::saturating_add)
		.min(config.db_pool_max_workers)
		.clamp(WORKER_LIMIT.0, max_threads)
		.clamp(WORKER_LIMIT.0, WORKER_LIMIT.1);
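
	// Worked example (hypothetical numbers): summed tags of 2046 capped by
	// db_pool_max_workers = 64, then clamped between WORKER_LIMIT.0 and
	// max_threads, and finally into the static WORKER_LIMIT range, gives 64.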

	// Temper the total number of workers by reducing the count for each group.
	let chan_limit = expected!(max_workers / num_cores)
		.saturating_sub(8)
		.saturating_add(1)
		.next_multiple_of(8);
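
	// Step by step (hypothetical values): max_workers = 64 on 8 cores gives
	// 64 / 8 = 8, then 8 - 8 + 1 = 1, rounded up to the next multiple of 8,
	// so chan_limit = 8 workers per group.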

	// Default workers vector without detection.
	let default_workers = default_worker_count
		.into_iter()
		.cycle()
		.enumerate()
		.map(|(core_id, count)| {
			is_core_available(core_id)
				.then_some(count)
				.unwrap_or(0)
				.min(chan_limit)
		});
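
	// Note: cycle() repeats the single default count for every core position;
	// the iterator is bounded later by take(topology.len()). Unavailable cores
	// receive 0 so no worker is assigned to a core the process cannot use.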

	// Determine the worker groupings. Each index represents a hardware queue
	// and contains the number of workers which will service it. This vector is
	// truncated to the number of cores on systems which have multiple hardware
	// queues per core. The number of workers is then truncated to a maximum for
	// each pool; as stated prior, this will usually be less than NVMe capacity.
	let workers: Vec<usize> = devices
		.md
		.iter()
		.inspect(|md| debug!(?md))
		.flat_map(|md| md.mq.iter())
		.map(|mq| {
			let shares = mq
				.cpu_list
				.iter()
				.filter(|&&id| is_core_available(id))
				.count();

			let conf_limit = config
				.db_pool_workers_limit
				.saturating_mul(shares);

			let hard_limit = devices
				.md
				.iter()
				.filter(|_| shares > 0)
				.fold(0_usize, |acc, mq| {
					mq.nr_requests
						.map(|nr| nr.min(conf_limit))
						.or(Some(conf_limit))
						.map(|nr| acc.saturating_add(nr))
						.unwrap_or(acc)
				});

			let tags = mq
				.nr_tags
				.unwrap_or(WORKER_LIMIT.0)
				.min(hard_limit)
				.min(chan_limit);

			debug!(?mq, ?shares, ?tags, ?conf_limit, ?hard_limit, ?chan_limit);

			tags
		})
		.chain(default_workers)
		.take(topology.len())
		.collect();
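
	// Net effect (hypothetical detected system): each hardware queue gets
	// min(nr_tags, hard_limit, chan_limit) workers; an undetected system falls
	// through to the cycled defaults, truncated to one entry per topology slot.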

	// Determine our software queue size for each hardware queue. This is the
	// mpmc channel between the tokio worker and the pool worker.
	let queues: Vec<usize> = workers
		.iter()
		.map(|count| {
			count
				.saturating_mul(config.db_pool_queue_mult)
				.min(QUEUE_LIMIT.1)
		})
		.collect();
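
	// Illustrative sizing (hypothetical values): workers = [8, 8, 8, 8] with
	// db_pool_queue_mult = 4 yields queues = [32, 32, 32, 32], each capped at
	// QUEUE_LIMIT.1.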

	// Total number of workers to spawn.
	let total_workers = workers.iter().sum::<usize>();

	// Total capacity of all software queues.
	let total_capacity = queues.iter().sum::<usize>();

	// Discount queues with zero capacity for a proper denominator.
	let num_queues = queues.iter().filter(|&&cap| cap > 0).count();

	// After computing all of the above we can update the global automatic stream
	// width, hopefully with a better value tailored to this system.
	if config.stream_width_scale > 0.0 {
		update_stream_width(server, num_queues, total_workers, total_capacity);
	}

	if topology_detected {
		debug_info!(?num_cores, ?topology, ?workers, ?queues, "Frontend topology");
		info!(
			device_name = ?device_name.as_deref().unwrap_or("None"),
			?num_queues,
			?total_workers,
			?total_tags,
			?total_capacity,
			stream_width = ?stream::automatic_width(),
			amplification = ?stream::automatic_amplification(),
			"Frontend topology",
		);
	} else {
		debug_info!(?num_cores, ?topology, ?workers, ?queues, "Frontend topology (defaults)");
		debug_warn!(
			device_name = ?device_name.as_deref().unwrap_or("None"),
			?total_workers,
			?total_capacity,
			stream_width = ?stream::automatic_width(),
			amplification = ?stream::automatic_amplification(),
			"Storage hardware not detected for database directory; assuming defaults.",
		);
	}

	assert!(total_workers > 0, "some workers expected");
	debug_assert!(
		total_workers <= max_workers || !topology_detected,
		"spawning too many workers"
	);

	assert!(!queues.is_empty(), "some queues expected");
	assert!(!queues.iter().copied().all(is_equal_to!(0)), "positive queue capacity expected");

	(topology, workers, queues)
}

#[expect(clippy::as_conversions, clippy::cast_precision_loss)]
fn update_stream_width(
	server: &Arc<Server>,
	num_queues: usize,
	total_workers: usize,
	_total_capacity: usize,
) {
	assert!(num_queues > 0, "Expected at least one queue.");
	assert!(total_workers > 0, "Expected some workers.");

	let config = &server.config;
	let scale: f64 = config.stream_width_scale.min(100.0).into();
	let max_width = expected!(total_workers / num_queues);

	let old_width = stream::automatic_width();
	let old_scale_width = expected!(old_width * num_queues);

	let new_scale = total_workers as f64 / old_scale_width as f64;
	let new_scale = new_scale.clamp(1.0, 4.0);
	let new_scale_width = new_scale * old_width as f64;
	let new_scale_width = usize_from_f64(new_scale_width)
		.expect("failed to convert f64 to usize")
		.next_multiple_of(8);

	let req_width = usize_from_f64(scale * new_scale_width as f64)
		.expect("failed to convert f64 to usize")
		.next_multiple_of(4)
		.min(max_width)
		.clamp(WIDTH_LIMIT.0, WIDTH_LIMIT.1);

	let req_amp = new_scale * config.stream_amplification as f64;
	let req_amp = usize_from_f64(req_amp * scale)
		.expect("failed to convert f64 to usize")
		.next_multiple_of(64)
		.clamp(AMPLIFICATION_LIMIT.0, AMPLIFICATION_LIMIT.1);
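
	// Worked example (hypothetical inputs): with total_workers = 32,
	// num_queues = 4, old_width = 16 and scale = 1.0: old_scale_width = 64,
	// new_scale = 0.5 clamped up to 1.0, new_scale_width = 16, and req_width =
	// min(16, max_width = 8) = 8 before the WIDTH_LIMIT clamp.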

	let (old_width, new_width) = stream::set_width(req_width);
	let (old_amp, new_amp) = stream::set_amplification(req_amp);
	debug!(
		config_scale = ?config.stream_width_scale,
		?old_width,
		?new_scale,
		?new_width,
		?old_amp,
		?new_amp,
		"Updated global stream width"
	);
}