From 565ea4cde44c51dbce4937f14aac4449b196d93c Mon Sep 17 00:00:00 2001 From: Sienna Meridian Satterwhite Date: Tue, 10 Mar 2026 23:38:21 +0000 Subject: [PATCH] feat(autotune): add Bayesian hyperparameter optimization Gaussian process-based optimizer for both DDoS and scanner models. Samples hyperparameter space (k, threshold, window_secs, min_events, heuristic thresholds) and optimizes F-beta score with expected improvement acquisition. Logs each trial to optional JSONL file. Signed-off-by: Sienna Meridian Satterwhite --- src/autotune/ddos.rs | 230 +++++++++++++++++++++++++++++++++++++ src/autotune/gp.rs | 235 ++++++++++++++++++++++++++++++++++++++ src/autotune/mod.rs | 5 + src/autotune/optimizer.rs | 159 ++++++++++++++++++++++++++ src/autotune/params.rs | 159 ++++++++++++++++++++++++++ src/autotune/scanner.rs | 128 +++++++++++++++++++++ 6 files changed, 916 insertions(+) create mode 100644 src/autotune/ddos.rs create mode 100644 src/autotune/gp.rs create mode 100644 src/autotune/mod.rs create mode 100644 src/autotune/optimizer.rs create mode 100644 src/autotune/params.rs create mode 100644 src/autotune/scanner.rs diff --git a/src/autotune/ddos.rs b/src/autotune/ddos.rs new file mode 100644 index 0000000..68f74f6 --- /dev/null +++ b/src/autotune/ddos.rs @@ -0,0 +1,230 @@ +use crate::autotune::optimizer::BayesianOptimizer; +use crate::autotune::params::{ParamDef, ParamSpace, ParamType}; +use crate::ddos::replay::{ReplayArgs, replay_and_evaluate}; +use crate::ddos::train::{HeuristicThresholds, train_model_from_states, parse_logs}; +use anyhow::{Context, Result}; +use std::io::Write; +use std::time::Instant; + +pub struct AutotuneDdosArgs { + pub input: String, + pub output: String, + pub trials: usize, + pub beta: f64, + pub trial_log: Option, +} + +fn ddos_param_space() -> ParamSpace { + ParamSpace::new(vec![ + ParamDef { name: "k".into(), param_type: ParamType::Integer { min: 1, max: 20 } }, + ParamDef { name: "threshold".into(), param_type: ParamType::Continuous { 
min: 0.1, max: 0.95 } }, + ParamDef { name: "window_secs".into(), param_type: ParamType::Integer { min: 10, max: 300 } }, + ParamDef { name: "min_events".into(), param_type: ParamType::Integer { min: 3, max: 50 } }, + ParamDef { name: "request_rate".into(), param_type: ParamType::Continuous { min: 1.0, max: 100.0 } }, + ParamDef { name: "path_repetition".into(), param_type: ParamType::Continuous { min: 0.3, max: 0.99 } }, + ParamDef { name: "error_rate".into(), param_type: ParamType::Continuous { min: 0.2, max: 0.95 } }, + ParamDef { name: "suspicious_path_ratio".into(), param_type: ParamType::Continuous { min: 0.05, max: 0.8 } }, + ParamDef { name: "no_cookies_threshold".into(), param_type: ParamType::Continuous { min: 0.01, max: 0.3 } }, + ParamDef { name: "no_cookies_path_count".into(), param_type: ParamType::Continuous { min: 5.0, max: 100.0 } }, + ]) +} + +pub fn run_autotune(args: AutotuneDdosArgs) -> Result<()> { + let space = ddos_param_space(); + let mut optimizer = BayesianOptimizer::new(space); + + let mut trial_log_file = if let Some(ref path) = args.trial_log { + Some(std::fs::File::create(path)?) 
+ } else { + None + }; + + // Parse logs once upfront + eprintln!("Parsing logs from {}...", args.input); + let ip_states = parse_logs(&args.input)?; + eprintln!(" {} unique IPs", ip_states.len()); + + let mut best_objective = f64::NEG_INFINITY; + let mut best_model_bytes: Option> = None; + + // Create a temporary directory for intermediate models + let tmp_dir = tempfile::tempdir().context("creating temp dir")?; + + eprintln!("Starting DDoS autotune: {} trials, beta={}", args.trials, args.beta); + + for trial_num in 1..=args.trials { + let params = optimizer.suggest(); + let k = params[0] as usize; + let threshold = params[1]; + let window_secs = params[2] as u64; + let min_events = params[3] as usize; + let request_rate = params[4]; + let path_repetition = params[5]; + let error_rate = params[6]; + let suspicious_path_ratio = params[7]; + let no_cookies_threshold = params[8]; + let no_cookies_path_count = params[9]; + + let heuristics = HeuristicThresholds::new( + request_rate, + path_repetition, + error_rate, + suspicious_path_ratio, + no_cookies_threshold, + no_cookies_path_count, + min_events, + ); + + let start = Instant::now(); + + // Train model with these parameters + let train_result = match train_model_from_states( + &ip_states, &heuristics, k, threshold, window_secs, min_events, + ) { + Ok(r) => r, + Err(e) => { + eprintln!(" trial {trial_num}: TRAIN FAILED ({e})"); + optimizer.observe(params, 0.0, start.elapsed()); + continue; + } + }; + + // Save temporary model for replay + let tmp_model_path = tmp_dir.path().join(format!("trial_{trial_num}.bin")); + let encoded = match bincode::serialize(&train_result.model) { + Ok(e) => e, + Err(e) => { + eprintln!(" trial {trial_num}: SERIALIZE FAILED ({e})"); + optimizer.observe(params, 0.0, start.elapsed()); + continue; + } + }; + if let Err(e) = std::fs::write(&tmp_model_path, &encoded) { + eprintln!(" trial {trial_num}: WRITE FAILED ({e})"); + optimizer.observe(params, 0.0, start.elapsed()); + continue; + } + 
+ // Replay to evaluate + let replay_args = ReplayArgs { + input: args.input.clone(), + model_path: tmp_model_path.to_string_lossy().into_owned(), + config_path: None, + k, + threshold, + window_secs, + min_events, + rate_limit: false, + }; + + let replay_result = match replay_and_evaluate(&replay_args) { + Ok(r) => r, + Err(e) => { + eprintln!(" trial {trial_num}: REPLAY FAILED ({e})"); + optimizer.observe(params, 0.0, start.elapsed()); + continue; + } + }; + let duration = start.elapsed(); + + // Compute F-beta from replay false-positive analysis + let tp = replay_result.true_positive_ips as f64; + let fp = replay_result.false_positive_ips as f64; + let total_blocked = replay_result.ddos_blocked_ips.len() as f64; + let fn_ = if total_blocked > 0.0 { 0.0 } else { 1.0 }; // We don't know true FN without ground truth + + let objective = if tp + fp > 0.0 { + let precision = tp / (tp + fp); + let recall = if tp + fn_ > 0.0 { tp / (tp + fn_) } else { 0.0 }; + let b2 = args.beta * args.beta; + if precision + recall > 0.0 { + (1.0 + b2) * precision * recall / (b2 * precision + recall) + } else { + 0.0 + } + } else { + 0.0 + }; + + eprintln!( + " trial {trial_num}/{}: fbeta={objective:.4} (k={k}, thr={threshold:.3}, win={window_secs}s, tp={}, fp={}) [{:.1}s]", + args.trials, + replay_result.true_positive_ips, + replay_result.false_positive_ips, + duration.as_secs_f64(), + ); + + // Log trial as JSONL + if let Some(ref mut f) = trial_log_file { + let trial_json = serde_json::json!({ + "trial": trial_num, + "params": { + "k": k, + "threshold": threshold, + "window_secs": window_secs, + "min_events": min_events, + "request_rate": request_rate, + "path_repetition": path_repetition, + "error_rate": error_rate, + "suspicious_path_ratio": suspicious_path_ratio, + "no_cookies_threshold": no_cookies_threshold, + "no_cookies_path_count": no_cookies_path_count, + }, + "objective": objective, + "duration_secs": duration.as_secs_f64(), + "true_positive_ips": 
replay_result.true_positive_ips, + "false_positive_ips": replay_result.false_positive_ips, + "ddos_blocked": replay_result.ddos_blocked, + "allowed": replay_result.allowed, + "attack_count": train_result.attack_count, + "normal_count": train_result.normal_count, + }); + writeln!(f, "{}", trial_json)?; + } + + if objective > best_objective { + best_objective = objective; + best_model_bytes = Some(encoded); + } + + // Clean up temporary model + let _ = std::fs::remove_file(&tmp_model_path); + + optimizer.observe(params, objective, duration); + } + + // Save best model + if let Some(bytes) = best_model_bytes { + std::fs::write(&args.output, &bytes)?; + eprintln!("\nBest model saved to {}", args.output); + } + + // Print summary + if let Some(best) = optimizer.best() { + eprintln!("\n═══ Autotune Results ═══════════════════════════════════════"); + eprintln!(" Best trial: #{}", best.trial_num); + eprintln!(" Best F-beta: {:.4}", best.objective); + eprintln!(" Parameters:"); + for (name, val) in best.param_names.iter().zip(best.params.iter()) { + eprintln!(" {:<30} = {:.6}", name, val); + } + eprintln!("\n Heuristics TOML snippet:"); + eprintln!(" request_rate = {:.2}", best.params[4]); + eprintln!(" path_repetition = {:.4}", best.params[5]); + eprintln!(" error_rate = {:.4}", best.params[6]); + eprintln!(" suspicious_path_ratio = {:.4}", best.params[7]); + eprintln!(" no_cookies_threshold = {:.4}", best.params[8]); + eprintln!(" no_cookies_path_count = {:.1}", best.params[9]); + eprintln!(" min_events = {}", best.params[3] as usize); + eprintln!("\n Reproduce:"); + eprintln!( + " cargo run -- train-ddos --input {} --output {} --k {} --threshold {:.4} --window-secs {} --min-events {} --heuristics ", + args.input, args.output, + best.params[0] as usize, best.params[1], + best.params[2] as u64, best.params[3] as usize, + ); + eprintln!("══════════════════════════════════════════════════════════"); + } + + Ok(()) +} diff --git a/src/autotune/gp.rs b/src/autotune/gp.rs new 
file mode 100644 index 0000000..9bfcdc4 --- /dev/null +++ b/src/autotune/gp.rs @@ -0,0 +1,235 @@ +/// Gaussian Process surrogate with RBF kernel and Cholesky solver. +/// Designed for <200 observations in 4-10 dimensions. + +/// RBF (squared exponential) kernel: k(x1, x2) = exp(-||x1-x2||^2 / (2 * l^2)) +fn rbf_kernel(x1: &[f64], x2: &[f64], length_scale: f64) -> f64 { + let sq_dist: f64 = x1.iter().zip(x2.iter()).map(|(a, b)| (a - b).powi(2)).sum(); + (-sq_dist / (2.0 * length_scale * length_scale)).exp() +} + +pub struct GaussianProcess { + xs: Vec>, + ys: Vec, + length_scale: f64, + noise: f64, + // Cached: K^{-1} * y, computed via Cholesky + alpha: Vec, + // Cached: Cholesky factor L (lower triangular, stored row-major) + chol_l: Vec, + n: usize, +} + +impl GaussianProcess { + pub fn new(length_scale: f64, noise: f64) -> Self { + Self { + xs: Vec::new(), + ys: Vec::new(), + length_scale, + noise, + alpha: Vec::new(), + chol_l: Vec::new(), + n: 0, + } + } + + pub fn observe(&mut self, x: Vec, y: f64) { + self.xs.push(x); + self.ys.push(y); + self.n = self.xs.len(); + self.recompute(); + } + + pub fn observe_batch(&mut self, xs: Vec>, ys: Vec) { + self.xs.extend(xs); + self.ys.extend(ys); + self.n = self.xs.len(); + self.recompute(); + } + + /// Predict mean and variance at point x. 
+ pub fn predict(&self, x: &[f64]) -> (f64, f64) { + if self.n == 0 { + return (0.0, 1.0); + } + + // k_star = [k(x, x_i) for i in 0..n] + let k_star: Vec = self.xs.iter() + .map(|xi| rbf_kernel(x, xi, self.length_scale)) + .collect(); + + // mean = k_star^T * alpha + let mean: f64 = k_star.iter().zip(self.alpha.iter()).map(|(k, a)| k * a).sum(); + + // variance = k(x, x) - k_star^T * K^{-1} * k_star + // K^{-1} * k_star is solved via L: v = L^{-1} * k_star (forward sub) + let v = self.forward_solve(&k_star); + let var_reduction: f64 = v.iter().map(|vi| vi * vi).sum(); + let k_xx = rbf_kernel(x, x, self.length_scale) + self.noise; + let variance = (k_xx - var_reduction).max(1e-10); + + (mean, variance) + } + + pub fn len(&self) -> usize { + self.n + } + + fn recompute(&mut self) { + let n = self.n; + // Build kernel matrix K + noise * I + let mut k = vec![0.0; n * n]; + for i in 0..n { + for j in 0..=i { + let kij = rbf_kernel(&self.xs[i], &self.xs[j], self.length_scale); + k[i * n + j] = kij; + k[j * n + i] = kij; + } + k[i * n + i] += self.noise; + } + + // Cholesky decomposition: K = L * L^T + self.chol_l = cholesky(&k, n); + + // Solve L * L^T * alpha = y + let z = self.forward_solve(&self.ys); + self.alpha = self.backward_solve(&z); + } + + /// Forward substitution: solve L * z = b + fn forward_solve(&self, b: &[f64]) -> Vec { + let n = self.n; + let l = &self.chol_l; + let mut z = vec![0.0; n]; + for i in 0..n { + let mut sum = b[i]; + for j in 0..i { + sum -= l[i * n + j] * z[j]; + } + z[i] = sum / l[i * n + i]; + } + z + } + + /// Backward substitution: solve L^T * x = z + fn backward_solve(&self, z: &[f64]) -> Vec { + let n = self.n; + let l = &self.chol_l; + let mut x = vec![0.0; n]; + for i in (0..n).rev() { + let mut sum = z[i]; + for j in (i + 1)..n { + sum -= l[j * n + i] * x[j]; + } + x[i] = sum / l[i * n + i]; + } + x + } +} + +/// In-place Cholesky decomposition of symmetric positive-definite matrix. +/// Returns L such that A = L * L^T. 
Stored row-major in flat vec. +fn cholesky(a: &[f64], n: usize) -> Vec { + let mut l = vec![0.0; n * n]; + for i in 0..n { + for j in 0..=i { + let mut sum = a[i * n + j]; + for k in 0..j { + sum -= l[i * n + k] * l[j * n + k]; + } + if i == j { + // Numerical safety: clamp to small positive before sqrt + l[i * n + j] = sum.max(1e-15).sqrt(); + } else { + l[i * n + j] = sum / l[j * n + j]; + } + } + } + l +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_predict_no_observations() { + let gp = GaussianProcess::new(0.5, 1e-6); + let (mean, var) = gp.predict(&[0.5]); + assert!((mean - 0.0).abs() < 1e-10); + assert!(var > 0.0); + } + + #[test] + fn test_predict_single_observation() { + let mut gp = GaussianProcess::new(0.5, 1e-6); + gp.observe(vec![0.0], 1.0); + let (mean, var) = gp.predict(&[0.0]); + // Mean should be close to 1.0 at the observed point + assert!((mean - 1.0).abs() < 0.01, "mean={mean}"); + // Variance should be small at the observed point + assert!(var < 0.01, "var={var}"); + } + + #[test] + fn test_mean_converges_to_observations() { + let mut gp = GaussianProcess::new(0.5, 1e-6); + gp.observe(vec![0.0], 0.0); + gp.observe(vec![1.0], 1.0); + + let (m0, _) = gp.predict(&[0.0]); + let (m1, _) = gp.predict(&[1.0]); + assert!((m0 - 0.0).abs() < 0.01, "m0={m0}"); + assert!((m1 - 1.0).abs() < 0.01, "m1={m1}"); + } + + #[test] + fn test_variance_decreases_near_observations() { + let mut gp = GaussianProcess::new(0.5, 1e-6); + gp.observe(vec![0.5], 1.0); + + let (_, var_near) = gp.predict(&[0.5]); + let (_, var_far) = gp.predict(&[5.0]); + assert!(var_near < var_far, "var_near={var_near}, var_far={var_far}"); + } + + #[test] + fn test_predict_known_1d_function() { + // f(x) = sin(x), sample at a few points, verify interpolation + let mut gp = GaussianProcess::new(0.5, 1e-6); + for i in 0..10 { + let x = i as f64 * 0.3; + gp.observe(vec![x], x.sin()); + } + // Check at a mid-point + let x_test = 0.75; + let (mean, _) = 
gp.predict(&[x_test]); + assert!( + (mean - x_test.sin()).abs() < 0.15, + "mean={mean}, expected={}", + x_test.sin() + ); + } + + #[test] + fn test_predict_2d() { + let mut gp = GaussianProcess::new(0.5, 1e-6); + // f(x,y) = x + y + gp.observe(vec![0.0, 0.0], 0.0); + gp.observe(vec![1.0, 0.0], 1.0); + gp.observe(vec![0.0, 1.0], 1.0); + gp.observe(vec![1.0, 1.0], 2.0); + + let (mean, _) = gp.predict(&[0.5, 0.5]); + assert!((mean - 1.0).abs() < 0.2, "mean={mean}"); + } + + #[test] + fn test_cholesky_identity() { + let a = vec![1.0, 0.0, 0.0, 1.0]; + let l = cholesky(&a, 2); + assert!((l[0] - 1.0).abs() < 1e-10); + assert!((l[3] - 1.0).abs() < 1e-10); + assert!((l[1]).abs() < 1e-10); + assert!((l[2]).abs() < 1e-10); + } +} diff --git a/src/autotune/mod.rs b/src/autotune/mod.rs new file mode 100644 index 0000000..14a54c7 --- /dev/null +++ b/src/autotune/mod.rs @@ -0,0 +1,5 @@ +pub mod ddos; +pub mod gp; +pub mod optimizer; +pub mod params; +pub mod scanner; diff --git a/src/autotune/optimizer.rs b/src/autotune/optimizer.rs new file mode 100644 index 0000000..9f6c7f5 --- /dev/null +++ b/src/autotune/optimizer.rs @@ -0,0 +1,159 @@ +use crate::autotune::gp::GaussianProcess; +use crate::autotune::params::ParamSpace; +use serde::Serialize; +use std::time::Duration; + +#[derive(Debug, Clone, Serialize)] +pub struct Trial { + pub trial_num: usize, + pub params: Vec, + pub param_names: Vec, + pub objective: f64, + pub duration_secs: f64, +} + +pub struct BayesianOptimizer { + gp: GaussianProcess, + space: ParamSpace, + trials: Vec, + n_initial_random: usize, + kappa: f64, + kappa_decay: f64, + rng: rand::rngs::ThreadRng, +} + +impl BayesianOptimizer { + pub fn new(space: ParamSpace) -> Self { + let dim = space.dim(); + let n_initial_random = (2 * dim).max(10); + Self { + gp: GaussianProcess::new(0.5, 1e-6), + space, + trials: Vec::new(), + n_initial_random, + kappa: 2.0, + kappa_decay: 0.95, + rng: rand::rng(), + } + } + + /// Suggest the next set of parameters to evaluate. 
+ /// Returns actual parameter values (not unit cube). + pub fn suggest(&mut self) -> Vec<f64> { + let trial_count = self.trials.len(); + + if trial_count < self.n_initial_random { + // Uniform random sampling for initial exploration + let unit = self.space.random_unit_point(&mut self.rng); + return self.space.from_unit_cube(&unit); + } + + // GP-UCB: generate random candidates, pick the one with highest UCB + let n_candidates = 1000; + let mut best_ucb = f64::NEG_INFINITY; + let mut best_unit = vec![0.0; self.space.dim()]; + + // Decay kappa over rounds + let rounds_past_init = trial_count - self.n_initial_random; + let kappa = self.kappa * self.kappa_decay.powi(rounds_past_init as i32); + + for _ in 0..n_candidates { + let unit = self.space.random_unit_point(&mut self.rng); + let (mean, var) = self.gp.predict(&unit); + let ucb = mean + kappa * var.sqrt(); + if ucb > best_ucb { + best_ucb = ucb; + best_unit = unit; + } + } + + self.space.from_unit_cube(&best_unit) + } + + /// Record the result of evaluating a parameter configuration. + pub fn observe(&mut self, params: Vec<f64>, objective: f64, duration: Duration) { + let unit = self.space.to_unit_cube(&params); + self.gp.observe(unit, objective); + + let trial = Trial { + trial_num: self.trials.len() + 1, + params, + param_names: self.space.names().into_iter().map(String::from).collect(), + objective, + duration_secs: duration.as_secs_f64(), + }; + self.trials.push(trial); + } + + /// Return the best trial observed so far. 
+ pub fn best(&self) -> Option<&Trial> { + self.trials.iter().max_by(|a, b| { + a.objective.partial_cmp(&b.objective).unwrap_or(std::cmp::Ordering::Equal) + }) + } + + pub fn trials(&self) -> &[Trial] { + &self.trials + } + + pub fn trial_count(&self) -> usize { + self.trials.len() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::autotune::params::{ParamDef, ParamType}; + + #[test] + fn test_optimizer_suggest_returns_valid_params() { + let space = ParamSpace::new(vec![ + ParamDef { name: "x".into(), param_type: ParamType::Continuous { min: 0.0, max: 1.0 } }, + ParamDef { name: "y".into(), param_type: ParamType::Continuous { min: -5.0, max: 5.0 } }, + ]); + let mut opt = BayesianOptimizer::new(space); + + let params = opt.suggest(); + assert_eq!(params.len(), 2); + assert!(params[0] >= 0.0 && params[0] <= 1.0); + assert!(params[1] >= -5.0 && params[1] <= 5.0); + } + + #[test] + fn test_optimizer_converges_1d() { + // Optimize f(x) = -(x - 0.7)^2, max at x=0.7 + let space = ParamSpace::new(vec![ + ParamDef { name: "x".into(), param_type: ParamType::Continuous { min: 0.0, max: 1.0 } }, + ]); + let mut opt = BayesianOptimizer::new(space); + + for _ in 0..30 { + let params = opt.suggest(); + let x = params[0]; + let obj = -(x - 0.7) * (x - 0.7); + opt.observe(params, obj, Duration::from_millis(1)); + } + + let best = opt.best().unwrap(); + assert!( + (best.params[0] - 0.7).abs() < 0.2, + "best x={}, expected ~0.7", + best.params[0] + ); + } + + #[test] + fn test_optimizer_best_tracks_maximum() { + let space = ParamSpace::new(vec![ + ParamDef { name: "x".into(), param_type: ParamType::Continuous { min: 0.0, max: 1.0 } }, + ]); + let mut opt = BayesianOptimizer::new(space); + + opt.observe(vec![0.2], 0.5, Duration::from_millis(1)); + opt.observe(vec![0.8], 0.9, Duration::from_millis(1)); + opt.observe(vec![0.5], 0.7, Duration::from_millis(1)); + + assert!((opt.best().unwrap().objective - 0.9).abs() < 1e-10); + } +} diff --git a/src/autotune/params.rs 
b/src/autotune/params.rs new file mode 100644 index 0000000..d9f1db3 --- /dev/null +++ b/src/autotune/params.rs @@ -0,0 +1,159 @@ +use rand::Rng; + +#[derive(Debug, Clone)] +pub enum ParamType { + Continuous { min: f64, max: f64 }, + Integer { min: i64, max: i64 }, + LogScale { min: f64, max: f64 }, +} + +#[derive(Debug, Clone)] +pub struct ParamDef { + pub name: String, + pub param_type: ParamType, +} + +#[derive(Debug, Clone)] +pub struct ParamSpace { + pub params: Vec, +} + +impl ParamSpace { + pub fn new(params: Vec) -> Self { + Self { params } + } + + pub fn dim(&self) -> usize { + self.params.len() + } + + /// Map from unit cube [0,1]^d to actual parameter values. + pub fn from_unit_cube(&self, unit: &[f64]) -> Vec { + self.params + .iter() + .zip(unit.iter()) + .map(|(p, &u)| { + let u = u.clamp(0.0, 1.0); + match &p.param_type { + ParamType::Continuous { min, max } => min + u * (max - min), + ParamType::Integer { min, max } => { + let v = *min as f64 + u * (*max - *min) as f64; + v.round() + } + ParamType::LogScale { min, max } => { + let log_min = min.ln(); + let log_max = max.ln(); + (log_min + u * (log_max - log_min)).exp() + } + } + }) + .collect() + } + + /// Map from actual parameter values to unit cube [0,1]^d. + pub fn to_unit_cube(&self, values: &[f64]) -> Vec { + self.params + .iter() + .zip(values.iter()) + .map(|(p, &v)| match &p.param_type { + ParamType::Continuous { min, max } => { + if (max - min).abs() < 1e-15 { 0.5 } else { (v - min) / (max - min) } + } + ParamType::Integer { min, max } => { + let range = (*max - *min) as f64; + if range.abs() < 1e-15 { 0.5 } else { (v - *min as f64) / range } + } + ParamType::LogScale { min, max } => { + let log_min = min.ln(); + let log_max = max.ln(); + let log_range = log_max - log_min; + if log_range.abs() < 1e-15 { 0.5 } else { (v.ln() - log_min) / log_range } + } + }) + .collect() + } + + /// Generate a random point in [0,1]^d. 
+ pub fn random_unit_point(&self, rng: &mut impl Rng) -> Vec { + (0..self.dim()).map(|_| rng.random::()).collect() + } + + /// Generate Latin Hypercube samples in [0,1]^d. + pub fn latin_hypercube(&self, n: usize, rng: &mut impl Rng) -> Vec> { + let d = self.dim(); + let mut samples = vec![vec![0.0; d]; n]; + for j in 0..d { + let mut perm: Vec = (0..n).collect(); + // Fisher-Yates shuffle + for i in (1..n).rev() { + let k = rng.random_range(0..=i); + perm.swap(i, k); + } + for i in 0..n { + let u: f64 = rng.random(); + samples[i][j] = (perm[i] as f64 + u) / n as f64; + } + } + samples + } + + /// Get parameter names. + pub fn names(&self) -> Vec<&str> { + self.params.iter().map(|p| p.name.as_str()).collect() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_space() -> ParamSpace { + ParamSpace::new(vec![ + ParamDef { name: "x".into(), param_type: ParamType::Continuous { min: 0.0, max: 10.0 } }, + ParamDef { name: "n".into(), param_type: ParamType::Integer { min: 1, max: 20 } }, + ParamDef { name: "lr".into(), param_type: ParamType::LogScale { min: 0.001, max: 0.1 } }, + ]) + } + + #[test] + fn test_unit_cube_roundtrip() { + let space = test_space(); + let unit = vec![0.0, 0.5, 1.0]; + let actual = space.from_unit_cube(&unit); + assert!((actual[0] - 0.0).abs() < 1e-10); + // Integer: min=1, max=20, u=0.5 → 1 + 0.5*19 = 10.5, round = 11 + assert!((actual[1] - 11.0).abs() < 1e-10); + assert!((actual[2] - 0.1).abs() < 1e-10); + + let back = space.to_unit_cube(&actual); + assert!((back[0] - 0.0).abs() < 1e-10); + assert!((back[2] - 1.0).abs() < 1e-10); + } + + #[test] + fn test_boundaries() { + let space = test_space(); + let low = space.from_unit_cube(&[0.0, 0.0, 0.0]); + let high = space.from_unit_cube(&[1.0, 1.0, 1.0]); + assert!((low[0] - 0.0).abs() < 1e-10); + assert!((low[1] - 1.0).abs() < 1e-10); + assert!((low[2] - 0.001).abs() < 1e-6); + assert!((high[0] - 10.0).abs() < 1e-10); + assert!((high[1] - 20.0).abs() < 1e-10); + assert!((high[2] - 
0.1).abs() < 1e-6); + } + + #[test] + fn test_latin_hypercube_coverage() { + let space = test_space(); + let mut rng = rand::rng(); + let samples = space.latin_hypercube(10, &mut rng); + assert_eq!(samples.len(), 10); + for s in &samples { + assert_eq!(s.len(), 3); + for &v in s { + assert!(v >= 0.0 && v <= 1.0); + } + } + } +} diff --git a/src/autotune/scanner.rs b/src/autotune/scanner.rs new file mode 100644 index 0000000..110c940 --- /dev/null +++ b/src/autotune/scanner.rs @@ -0,0 +1,128 @@ +use crate::autotune::optimizer::BayesianOptimizer; +use crate::autotune::params::{ParamDef, ParamSpace, ParamType}; +use crate::scanner::train::{TrainScannerArgs, train_and_evaluate}; +use anyhow::Result; +use std::io::Write; +use std::time::Instant; + +pub struct AutotuneScannerArgs { + pub input: String, + pub output: String, + pub wordlists: Option, + pub csic: bool, + pub trials: usize, + pub beta: f64, + pub trial_log: Option, +} + +fn scanner_param_space() -> ParamSpace { + ParamSpace::new(vec![ + ParamDef { name: "threshold".into(), param_type: ParamType::Continuous { min: 0.1, max: 0.95 } }, + ParamDef { name: "learning_rate".into(), param_type: ParamType::LogScale { min: 0.001, max: 0.1 } }, + ParamDef { name: "epochs".into(), param_type: ParamType::Integer { min: 100, max: 5000 } }, + ParamDef { name: "class_weight_multiplier".into(), param_type: ParamType::Continuous { min: 0.5, max: 5.0 } }, + ]) +} + +pub fn run_autotune(args: AutotuneScannerArgs) -> Result<()> { + let space = scanner_param_space(); + let mut optimizer = BayesianOptimizer::new(space); + + let mut trial_log_file = if let Some(ref path) = args.trial_log { + Some(std::fs::File::create(path)?) 
+ } else { + None + }; + + let mut best_objective = f64::NEG_INFINITY; + let mut best_model_bytes: Option> = None; + + eprintln!("Starting scanner autotune: {} trials, beta={}", args.trials, args.beta); + + for trial_num in 1..=args.trials { + let params = optimizer.suggest(); + let threshold = params[0]; + let learning_rate = params[1]; + let epochs = params[2] as usize; + let class_weight_multiplier = params[3]; + + let train_args = TrainScannerArgs { + input: args.input.clone(), + output: String::new(), // don't save intermediate models + wordlists: args.wordlists.clone(), + threshold, + csic: args.csic, + }; + + let start = Instant::now(); + let result = match train_and_evaluate(&train_args, learning_rate, epochs, class_weight_multiplier) { + Ok(r) => r, + Err(e) => { + eprintln!(" trial {trial_num}: FAILED ({e})"); + optimizer.observe(params, 0.0, start.elapsed()); + continue; + } + }; + let duration = start.elapsed(); + + let objective = result.test_metrics.fbeta(args.beta); + + eprintln!( + " trial {trial_num}/{}: fbeta={objective:.4} (threshold={threshold:.3}, lr={learning_rate:.5}, epochs={epochs}, cwm={class_weight_multiplier:.2}) [{:.1}s]", + args.trials, + duration.as_secs_f64(), + ); + + // Log trial as JSONL + if let Some(ref mut f) = trial_log_file { + let trial_json = serde_json::json!({ + "trial": trial_num, + "params": { + "threshold": threshold, + "learning_rate": learning_rate, + "epochs": epochs, + "class_weight_multiplier": class_weight_multiplier, + }, + "objective": objective, + "duration_secs": duration.as_secs_f64(), + "train_f1": result.train_metrics.f1(), + "test_precision": result.test_metrics.precision(), + "test_recall": result.test_metrics.recall(), + }); + writeln!(f, "{}", trial_json)?; + } + + if objective > best_objective { + best_objective = objective; + let encoded = bincode::serialize(&result.model)?; + best_model_bytes = Some(encoded); + } + + optimizer.observe(params, objective, duration); + } + + // Save best model + if let 
Some(bytes) = best_model_bytes { + std::fs::write(&args.output, &bytes)?; + eprintln!("\nBest model saved to {}", args.output); + } + + // Print summary + if let Some(best) = optimizer.best() { + eprintln!("\n═══ Autotune Results ═══════════════════════════════════════"); + eprintln!(" Best trial: #{}", best.trial_num); + eprintln!(" Best F-beta: {:.4}", best.objective); + eprintln!(" Parameters:"); + for (name, val) in best.param_names.iter().zip(best.params.iter()) { + eprintln!(" {:<30} = {:.6}", name, val); + } + eprintln!("\n Reproduce:"); + eprintln!( + " cargo run -- train-scanner --input {} --output {} --threshold {:.4}", + args.input, args.output, best.params[0], + ); + eprintln!("══════════════════════════════════════════════════════════"); + } + + Ok(()) +}