use crate::config::RouteConfig; use crate::scanner::features::{ self, fx_hash_bytes, ScannerNormParams, SUSPICIOUS_EXTENSIONS_LIST, NUM_SCANNER_FEATURES, NUM_SCANNER_WEIGHTS, }; use crate::scanner::model::{ScannerAction, ScannerModel, ScannerVerdict}; use rustc_hash::FxHashSet; /// Immutable, zero-state per-request scanner detector. /// Safe to share across threads via `Arc` with no locks. pub struct ScannerDetector { fragment_hashes: FxHashSet, extension_hashes: FxHashSet, configured_hosts: FxHashSet, weights: [f64; NUM_SCANNER_WEIGHTS], threshold: f64, norm_params: ScannerNormParams, } impl ScannerDetector { pub fn new(model: &ScannerModel, routes: &[RouteConfig]) -> Self { let fragment_hashes: FxHashSet = model .fragments .iter() .map(|f| fx_hash_bytes(f.to_ascii_lowercase().as_bytes())) .collect(); let extension_hashes: FxHashSet = SUSPICIOUS_EXTENSIONS_LIST .iter() .map(|e| fx_hash_bytes(e.as_bytes())) .collect(); let configured_hosts: FxHashSet = routes .iter() .map(|r| fx_hash_bytes(r.host_prefix.as_bytes())) .collect(); Self { fragment_hashes, extension_hashes, configured_hosts, weights: model.weights, threshold: model.threshold, norm_params: model.norm_params.clone(), } } /// Classify a single request. ~200ns, no heap allocation, no state mutation. /// /// Returns a verdict with the action, raw score, and reason. /// The score and reason are captured in pipeline logs so the training /// pipeline always has unfiltered data to retrain from. #[allow(clippy::too_many_arguments)] pub fn check( &self, method: &str, path: &str, host_prefix: &str, has_cookies: bool, has_referer: bool, has_accept_language: bool, accept: &str, user_agent: &str, content_length: u64, ) -> ScannerVerdict { // Hard allowlist: obviously legitimate traffic bypasses the model. // This prevents model drift from ever blocking real users and ensures // the training pipeline always has clean positive labels. let host_known = { let hash = features::fx_hash_bytes(host_prefix.as_bytes()); self.configured_hosts.contains(&hash) }; if host_known && has_cookies { return ScannerVerdict { action: ScannerAction::Allow, score: -1.0, reason: "allowlist:host+cookies", }; } if host_known && has_accept_language && features::ua_is_browser(user_agent) { return ScannerVerdict { action: ScannerAction::Allow, score: -1.0, reason: "allowlist:host+browser", }; } // 1. Extract 12 features let raw = features::extract_features( method, path, host_prefix, has_cookies, has_referer, has_accept_language, accept, user_agent, content_length, &self.fragment_hashes, &self.extension_hashes, &self.configured_hosts, ); // 2. Normalize let f = self.norm_params.normalize(&raw); // 3. Compute score = bias + dot(weights, features) + interaction terms let mut score = self.weights[NUM_SCANNER_FEATURES + 2]; // bias (index 14) for (i, &fi) in f.iter().enumerate().take(NUM_SCANNER_FEATURES) { score += self.weights[i] * fi; } // Interaction: suspicious_path AND no_cookies score += self.weights[12] * f[0] * (1.0 - f[3]); // Interaction: unknown_host AND no_accept_language score += self.weights[13] * (1.0 - f[9]) * (1.0 - f[5]); // 4. Threshold let action = if score > self.threshold { ScannerAction::Block } else { ScannerAction::Allow }; ScannerVerdict { action, score, reason: "model", } } } #[cfg(test)] mod tests { use super::*; use crate::scanner::features::NUM_SCANNER_FEATURES; fn make_detector(weights: [f64; NUM_SCANNER_WEIGHTS], threshold: f64) -> ScannerDetector { let model = ScannerModel { weights, threshold, norm_params: ScannerNormParams { mins: [0.0; NUM_SCANNER_FEATURES], maxs: [1.0; NUM_SCANNER_FEATURES], }, fragments: vec![ ".env".into(), "wp-admin".into(), "wp-login".into(), "phpinfo".into(), "phpmyadmin".into(), ".git".into(), "cgi-bin".into(), ".htaccess".into(), ".htpasswd".into(), ], }; let routes = vec![RouteConfig { host_prefix: "app".into(), backend: "http://127.0.0.1:8080".into(), websocket: false, disable_secure_redirection: false, paths: vec![], static_root: None, fallback: None, rewrites: vec![], body_rewrites: vec![], response_headers: vec![], cache: None, }]; ScannerDetector::new(&model, &routes) } /// Weights tuned to block scanner-like requests: /// High weight on suspicious_path (w[0]), no_cookies interaction (w[12]), /// has_suspicious_extension (w[2]), traversal (w[11]). /// Negative weight on has_cookies (w[3]), has_referer (w[4]), /// accept_quality (w[6]), ua_category (w[7]), host_is_configured (w[9]). fn attack_tuned_weights() -> [f64; NUM_SCANNER_WEIGHTS] { let mut w = [0.0; NUM_SCANNER_WEIGHTS]; w[0] = 2.0; // suspicious_path_score w[2] = 2.0; // has_suspicious_extension w[3] = -2.0; // has_cookies (negative = good) w[4] = -1.0; // has_referer (negative = good) w[5] = -1.0; // has_accept_language (negative = good) w[6] = -0.5; // accept_quality (negative = good) w[7] = -1.0; // ua_category (negative = browser is good) w[9] = -1.5; // host_is_configured (negative = known host is good) w[11] = 2.0; // path_has_traversal w[12] = 1.5; // interaction: suspicious_path AND no_cookies w[13] = 1.0; // interaction: unknown_host AND no_accept_lang w[14] = 0.5; // bias w } #[test] fn test_normal_browser_request_allowed() { let detector = make_detector(attack_tuned_weights(), 0.5); let verdict = detector.check( "GET", "/blog/hello-world", "app", true, // has_cookies true, // has_referer true, // has_accept_language "text/html,application/xhtml+xml", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120", 0, ); assert_eq!(verdict.action, ScannerAction::Allow); assert_eq!(verdict.reason, "allowlist:host+cookies"); } #[test] fn test_api_client_with_auth_allowed() { let detector = make_detector(attack_tuned_weights(), 0.5); let verdict = detector.check( "POST", "/api/v1/data", "app", true, // has_cookies (session cookie) false, true, "application/json", "MyApp/2.0", 256, ); assert_eq!(verdict.action, ScannerAction::Allow); assert_eq!(verdict.reason, "allowlist:host+cookies"); } #[test] fn test_env_probe_blocked() { let detector = make_detector(attack_tuned_weights(), 0.5); let verdict = detector.check( "GET", "/.env", "unknown", false, // no cookies false, // no referer false, // no accept-language "*/*", "curl/7.0", 0, ); assert_eq!(verdict.action, ScannerAction::Block); assert_eq!(verdict.reason, "model"); } #[test] fn test_wordpress_scan_blocked() { let detector = make_detector(attack_tuned_weights(), 0.5); let verdict = detector.check( "GET", "/wp-admin/install.php", "unknown", false, false, false, "*/*", "", 0, ); assert_eq!(verdict.action, ScannerAction::Block); assert_eq!(verdict.reason, "model"); } #[test] fn test_path_traversal_blocked() { let detector = make_detector(attack_tuned_weights(), 0.5); let verdict = detector.check( "GET", "/etc/../../../passwd", "unknown", false, false, false, "*/*", "python-requests/2.28", 0, ); assert_eq!(verdict.action, ScannerAction::Block); assert_eq!(verdict.reason, "model"); } #[test] fn test_legitimate_php_path_allowed() { let detector = make_detector(attack_tuned_weights(), 0.5); // "/blog/php-is-dead" — "php-is-dead" is not a known fragment // has_cookies=true + known host "app" → hits allowlist let verdict = detector.check( "GET", "/blog/php-is-dead", "app", true, true, true, "text/html", "Mozilla/5.0 Chrome/120", 0, ); assert_eq!(verdict.action, ScannerAction::Allow); } #[test] fn test_allowlist_browser_on_known_host() { let detector = make_detector(attack_tuned_weights(), 0.5); // No cookies but browser UA + accept-language + known host → allowlist let verdict = detector.check( "GET", "/", "app", false, false, true, "text/html", "Mozilla/5.0 (Macintosh; Intel Mac OS X) Safari/537.36", 0, ); assert_eq!(verdict.action, ScannerAction::Allow); assert_eq!(verdict.reason, "allowlist:host+browser"); } #[test] fn test_model_path_for_non_allowlisted() { let detector = make_detector(attack_tuned_weights(), 0.5); // Unknown host, no cookies, curl UA → goes through model let verdict = detector.check( "GET", "/robots.txt", "unknown", false, false, false, "*/*", "curl/7.0", 0, ); assert_eq!(verdict.reason, "model"); } }