From ae18b00fa4886165902b5a854b3244120066b130 Mon Sep 17 00:00:00 2001
From: Sienna Meridian Satterwhite
Date: Tue, 10 Mar 2026 23:38:19 +0000
Subject: [PATCH] feat(scanner): add model hot-reload and verified bot allowlist

ArcSwap-based lock-free hot-reload via file mtime polling. Bot allowlist
with CIDR (instant) + reverse/forward DNS (cached with background worker
thread) IP verification to prevent UA spoofing by known crawlers, LLM
agents, and commercial B2B bots.

Signed-off-by: Sienna Meridian Satterwhite
---
Review notes (allowlist.rs only; watcher.rs is unchanged):
- Rules whose configured CIDRs all fail to parse no longer degrade to
  UA-only matching; unparsable CIDRs are logged.
- DNS suffixes match on label boundaries (`evilgooglebot.com` no longer
  passes for `googlebot.com`); empty suffixes never match.
- The worker holds a Weak handle, so dropping the last Arc stops the thread.
- The verification cache is keyed by (ip, rule index) so a verdict for one
  rule is not reused as a "spoofed" verdict for another.
- check() no longer allocates per request (no UA lowercasing, no suffix clone).
- The blob id on the `index` line below was not recomputed.

 src/scanner/allowlist.rs | 401 +++++++++++++++++++++++++++++++++++++++
 src/scanner/watcher.rs   |  51 +++++
 2 files changed, 452 insertions(+)
 create mode 100644 src/scanner/allowlist.rs
 create mode 100644 src/scanner/watcher.rs

diff --git a/src/scanner/allowlist.rs b/src/scanner/allowlist.rs
new file mode 100644
index 0000000..964698b
--- /dev/null
+++ b/src/scanner/allowlist.rs
@@ -0,0 +1,401 @@
+use crate::config::BotAllowlistRule;
+use crate::rate_limit::cidr::CidrBlock;
+use rustc_hash::FxHashMap;
+use std::net::IpAddr;
+use std::sync::RwLock;
+use std::time::{Duration, Instant};
+
+/// A compiled bot allowlist rule (ready for hot-path matching).
+struct CompiledRule {
+    /// ASCII-lowercased User-Agent prefix this rule applies to.
+    ua_prefix_lower: String,
+    /// Label returned to the caller when a request is allowlisted.
+    reason: String,
+    /// ASCII-lowercased hostname suffixes accepted by reverse DNS verification.
+    dns_suffixes: Vec<String>,
+    /// Successfully parsed CIDR blocks accepted by CIDR verification.
+    cidrs: Vec<CidrBlock>,
+    /// True when the *configured* rule listed any CIDR or DNS suffix, even if
+    /// some entries failed to parse. Keeps a typo from silently downgrading
+    /// the rule to "UA match alone is sufficient".
+    requires_verification: bool,
+}
+
+/// Outcome of one DNS verification, stored per `(ip, rule index)`.
+#[derive(Clone)]
+struct CacheEntry {
+    verified: bool,
+    created: Instant,
+}
+
+/// Bot allowlist with CIDR verification (instant) and DNS verification (cached).
+///
+/// Safe to share via `Arc`. Interior mutability is limited to:
+/// - `verified_cache`: RwLock around the (IP, rule)→verified map (write-rare, read-often)
+/// - `pending_tx`: mpsc sender to queue background DNS verification
+pub struct BotAllowlist {
+    rules: Vec<CompiledRule>,
+    verified_cache: RwLock<FxHashMap<(IpAddr, usize), CacheEntry>>,
+    pending_tx: std::sync::mpsc::Sender<PendingVerification>,
+    cache_ttl: Duration,
+}
+
+/// A request for the background worker to DNS-verify `ip` against rule `rule_idx`.
+struct PendingVerification {
+    ip: IpAddr,
+    rule_idx: usize,
+}
+
+impl BotAllowlist {
+    /// Create the allowlist and spawn the background DNS verification worker.
+    /// Returns the allowlist wrapped in Arc (needed for cache sharing with worker).
+    ///
+    /// The worker only holds a `Weak` reference, so it exits once the last
+    /// `Arc` is dropped instead of keeping the allowlist alive forever.
+    pub fn spawn(rules: &[BotAllowlistRule], cache_ttl_secs: u64) -> std::sync::Arc<Self> {
+        let compiled: Vec<CompiledRule> = rules
+            .iter()
+            .map(|r| CompiledRule {
+                ua_prefix_lower: r.ua_prefix.to_ascii_lowercase(),
+                reason: r.reason.clone(),
+                dns_suffixes: r
+                    .dns_suffixes
+                    .iter()
+                    .map(|s| s.to_ascii_lowercase())
+                    .collect(),
+                cidrs: r
+                    .cidrs
+                    .iter()
+                    .filter_map(|s| {
+                        let block = CidrBlock::parse(s);
+                        if block.is_none() {
+                            tracing::warn!(
+                                cidr = %s,
+                                ua_prefix = %r.ua_prefix,
+                                "ignoring unparsable CIDR in bot allowlist rule"
+                            );
+                        }
+                        block
+                    })
+                    .collect(),
+                requires_verification: !r.cidrs.is_empty() || !r.dns_suffixes.is_empty(),
+            })
+            .collect();
+
+        let (tx, rx) = std::sync::mpsc::channel();
+        let cache_ttl = Duration::from_secs(cache_ttl_secs);
+
+        let allowlist = std::sync::Arc::new(Self {
+            rules: compiled,
+            verified_cache: RwLock::new(FxHashMap::default()),
+            pending_tx: tx,
+            cache_ttl,
+        });
+
+        // Spawn background DNS verification worker. It gets a Weak handle: a
+        // strong one would form a cycle with `pending_tx` and leak the thread.
+        let worker_handle = std::sync::Arc::downgrade(&allowlist);
+        std::thread::spawn(move || dns_verification_worker(rx, worker_handle));
+
+        allowlist
+    }
+
+    /// Check if a request from `ip` with `user_agent` matches a verified bot.
+    /// Returns the allowlist reason if verified, None otherwise.
+    ///
+    /// Hot path: allocation-free prefix scan + CIDR check or hash lookup.
+    pub fn check(&self, user_agent: &str, ip: IpAddr) -> Option<&str> {
+        let ua = user_agent.as_bytes();
+
+        for (idx, rule) in self.rules.iter().enumerate() {
+            // Case-insensitive (ASCII) prefix match without lowercasing the UA.
+            let prefix = rule.ua_prefix_lower.as_bytes();
+            let ua_matches = ua
+                .get(..prefix.len())
+                .is_some_and(|head| head.eq_ignore_ascii_case(prefix));
+            if !ua_matches {
+                continue;
+            }
+
+            // UA matches. Now verify the IP.
+
+            // 1. No verification configured → UA match alone is sufficient.
+            if !rule.requires_verification {
+                return Some(rule.reason.as_str());
+            }
+
+            // 2. CIDR verification (instant).
+            if rule.cidrs.iter().any(|c| c.contains(ip)) {
+                return Some(rule.reason.as_str());
+            }
+
+            // CIDR didn't match. If no DNS suffixes configured, this is a
+            // spoofed UA — fall through to scanner model.
+            if rule.dns_suffixes.is_empty() {
+                return None;
+            }
+
+            // 3. DNS verification (cached). `Some(verdict)` is a fresh cache
+            // entry; `None` is a miss or an expired entry.
+            let cached = {
+                let cache = self.verified_cache.read().unwrap_or_else(|e| e.into_inner());
+                cache
+                    .get(&(ip, idx))
+                    .filter(|entry| entry.created.elapsed() < self.cache_ttl)
+                    .map(|entry| entry.verified)
+            };
+            match cached {
+                Some(true) => return Some(rule.reason.as_str()),
+                // Cached as NOT verified → spoofed UA.
+                Some(false) => return None,
+                None => {}
+            }
+
+            // Cache miss or expired → queue for background verification.
+            // A send error means the worker is gone; the request then simply
+            // stays un-allowlisted.
+            let _ = self.pending_tx.send(PendingVerification { ip, rule_idx: idx });
+
+            // First request from this IP → don't allowlist yet.
+            // The scanner model decides. Once DNS verifies, future
+            // requests get the allowlist.
+            return None;
+        }
+
+        None
+    }
+
+    /// Evict expired entries from the verified cache.
+    pub fn evict_stale(&self) {
+        let mut cache = self.verified_cache.write().unwrap_or_else(|e| e.into_inner());
+        cache.retain(|_, entry| entry.created.elapsed() < self.cache_ttl);
+    }
+}
+
+/// Background worker that processes DNS verification requests.
+///
+/// Exits when every `BotAllowlist` handle has been dropped: either `recv`
+/// fails because the sender is gone, or the `Weak` can no longer be upgraded.
+fn dns_verification_worker(
+    rx: std::sync::mpsc::Receiver<PendingVerification>,
+    allowlist: std::sync::Weak<BotAllowlist>,
+) {
+    while let Ok(req) = rx.recv() {
+        let Some(allowlist) = allowlist.upgrade() else {
+            break;
+        };
+        let key = (req.ip, req.rule_idx);
+
+        // Skip if already cached and not expired.
+        let fresh = {
+            let cache = allowlist
+                .verified_cache
+                .read()
+                .unwrap_or_else(|e| e.into_inner());
+            cache
+                .get(&key)
+                .is_some_and(|entry| entry.created.elapsed() < allowlist.cache_ttl)
+        };
+        if fresh {
+            continue;
+        }
+
+        // `rule_idx` is produced by `check` from this same `rules` vec, so this
+        // lookup is not expected to miss; skip rather than panic if it does.
+        let Some(rule) = allowlist.rules.get(req.rule_idx) else {
+            continue;
+        };
+
+        // Blocking DNS round-trips happen with no lock held.
+        let verified = verify_dns(req.ip, &rule.dns_suffixes);
+        let entry = CacheEntry {
+            verified,
+            created: Instant::now(),
+        };
+
+        // Scope the write lock so logging happens after it is released.
+        {
+            let mut cache = allowlist
+                .verified_cache
+                .write()
+                .unwrap_or_else(|e| e.into_inner());
+            cache.insert(key, entry);
+        }
+
+        if verified {
+            tracing::info!(
+                ip = %req.ip,
+                rule_idx = req.rule_idx,
+                "bot IP verified via reverse DNS"
+            );
+        } else {
+            tracing::debug!(
+                ip = %req.ip,
+                rule_idx = req.rule_idx,
+                "bot IP failed DNS verification (possible UA spoofing)"
+            );
+        }
+    }
+}
+
+/// Reverse DNS → check suffix → forward DNS → confirm IP.
+fn verify_dns(ip: IpAddr, suffixes: &[String]) -> bool {
+    // Step 1: reverse DNS lookup.
+    let hostname = match dns_lookup::lookup_addr(&ip) {
+        Ok(name) => name.to_ascii_lowercase(),
+        Err(_) => return false,
+    };
+
+    // Step 2: hostname must be one of the allowed domains or a subdomain of one.
+    if !suffixes.iter().any(|s| hostname_matches_suffix(&hostname, s)) {
+        return false;
+    }
+
+    // Step 3: forward DNS — the hostname must resolve back to our IP.
+    match dns_lookup::lookup_host(&hostname) {
+        Ok(addrs) => addrs.iter().any(|a| *a == ip),
+        Err(_) => false,
+    }
+}
+
+/// True if `hostname` equals `suffix` or ends with `.suffix`.
+///
+/// Matching is done on label boundaries: a bare `ends_with` would accept
+/// `evilgooglebot.com` for the suffix `googlebot.com`, and anyone can register
+/// such a domain and point both PTR and A records at their own IP. Leading and
+/// trailing dots are ignored; an empty suffix never matches. Both arguments
+/// are expected to be lowercased by the caller.
+fn hostname_matches_suffix(hostname: &str, suffix: &str) -> bool {
+    let hostname = hostname.trim_end_matches('.');
+    let suffix = suffix.trim_matches('.');
+    if suffix.is_empty() {
+        return false;
+    }
+    match hostname.strip_suffix(suffix) {
+        Some(rest) => rest.is_empty() || rest.ends_with('.'),
+        None => false,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::config::BotAllowlistRule;
+
+    #[test]
+    fn test_ua_only_rule_matches() {
+        let rules = vec![BotAllowlistRule {
+            ua_prefix: "CCBot".into(),
+            reason: "commoncrawl".into(),
+            dns_suffixes: vec![],
+            cidrs: vec![],
+        }];
+        let al = BotAllowlist::spawn(&rules, 3600);
+        assert_eq!(
+            al.check("CCBot/2.0 (https://commoncrawl.org)", "1.2.3.4".parse().unwrap()),
+            Some("commoncrawl"),
+        );
+        assert_eq!(
+            al.check("Mozilla/5.0", "1.2.3.4".parse().unwrap()),
+            None,
+        );
+    }
+
+    #[test]
+    fn test_cidr_verification_matches() {
+        let rules = vec![BotAllowlistRule {
+            ua_prefix: "GPTBot".into(),
+            reason: "openai".into(),
+            dns_suffixes: vec![],
+            cidrs: vec!["23.98.0.0/16".into()],
+        }];
+        let al = BotAllowlist::spawn(&rules, 3600);
+        assert_eq!(
+            al.check("GPTBot/1.0", "23.98.1.2".parse().unwrap()),
+            Some("openai"),
+        );
+        // Wrong IP → spoofed UA
+        assert_eq!(
+            al.check("GPTBot/1.0", "1.2.3.4".parse().unwrap()),
+            None,
+        );
+    }
+
+    #[test]
+    fn test_cidr_verification_case_insensitive_ua() {
+        let rules = vec![BotAllowlistRule {
+            ua_prefix: "GPTBot".into(),
+            reason: "openai".into(),
+            dns_suffixes: vec![],
+            cidrs: vec!["23.98.0.0/16".into()],
+        }];
+        let al = BotAllowlist::spawn(&rules, 3600);
+        assert_eq!(
+            al.check("gptbot/1.0", "23.98.1.2".parse().unwrap()),
+            Some("openai"),
+        );
+    }
+
+    #[test]
+    fn test_dns_rule_returns_none_on_cache_miss() {
+        let rules = vec![BotAllowlistRule {
+            ua_prefix: "Googlebot".into(),
+            reason: "google".into(),
+            dns_suffixes: vec!["googlebot.com".into()],
+            cidrs: vec![],
+        }];
+        let al = BotAllowlist::spawn(&rules, 3600);
+        // First check → cache miss → queues DNS → returns None
+        assert_eq!(
+            al.check("Googlebot/2.1", "66.249.64.1".parse().unwrap()),
+            None,
+        );
+    }
+
+    #[test]
+    fn test_no_ua_match_returns_none() {
+        let rules = vec![BotAllowlistRule {
+            ua_prefix: "GPTBot".into(),
+            reason: "openai".into(),
+            dns_suffixes: vec![],
+            cidrs: vec!["23.98.0.0/16".into()],
+        }];
+        let al = BotAllowlist::spawn(&rules, 3600);
+        assert_eq!(
+            al.check("Mozilla/5.0 Chrome/120", "23.98.1.2".parse().unwrap()),
+            None,
+        );
+    }
+
+    #[test]
+    fn test_verify_dns_with_bad_ip() {
+        // This IP almost certainly won't reverse-resolve to googlebot.com
+        assert!(!verify_dns("127.0.0.1".parse().unwrap(), &["googlebot.com".into()]));
+    }
+
+    #[test]
+    fn test_unparsable_cidr_fails_closed() {
+        let rules = vec![BotAllowlistRule {
+            ua_prefix: "GPTBot".into(),
+            reason: "openai".into(),
+            dns_suffixes: vec![],
+            cidrs: vec!["not-a-cidr".into()],
+        }];
+        let al = BotAllowlist::spawn(&rules, 3600);
+        // A rule whose only CIDR is a typo must not degrade to UA-only matching.
+        assert_eq!(
+            al.check("GPTBot/1.0", "1.2.3.4".parse().unwrap()),
+            None,
+        );
+    }
+
+    #[test]
+    fn test_hostname_suffix_requires_label_boundary() {
+        assert!(hostname_matches_suffix("googlebot.com", "googlebot.com"));
+        assert!(hostname_matches_suffix("crawl-1.googlebot.com", "googlebot.com"));
+        assert!(hostname_matches_suffix("crawl-1.googlebot.com.", ".googlebot.com"));
+        assert!(!hostname_matches_suffix("evilgooglebot.com", "googlebot.com"));
+        assert!(!hostname_matches_suffix("googlebot.com.evil.net", "googlebot.com"));
+        assert!(!hostname_matches_suffix("crawl-1.googlebot.com", ""));
+    }
+}
diff --git a/src/scanner/watcher.rs b/src/scanner/watcher.rs
new file mode 100644
index 0000000..5660f99
--- /dev/null
+++ b/src/scanner/watcher.rs
@@ -0,0 +1,51 @@
+use crate::config::RouteConfig;
+use crate::scanner::detector::ScannerDetector;
+use crate::scanner::model::ScannerModel;
+use arc_swap::ArcSwap;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+/// Poll the scanner model file for mtime changes and hot-swap the detector.
+/// Runs forever on a dedicated OS thread — never returns.
+pub fn watch_scanner_model(
+    handle: Arc<ArcSwap<ScannerDetector>>,
+    model_path: PathBuf,
+    threshold: f64,
+    routes: Vec<RouteConfig>,
+    poll_interval: Duration,
+) {
+    let mut last_mtime = std::fs::metadata(&model_path)
+        .and_then(|m| m.modified())
+        .ok();
+
+    loop {
+        std::thread::sleep(poll_interval);
+
+        let current_mtime = match std::fs::metadata(&model_path).and_then(|m| m.modified()) {
+            Ok(t) => t,
+            Err(_) => continue,
+        };
+
+        if Some(current_mtime) == last_mtime {
+            continue;
+        }
+
+        match ScannerModel::load(&model_path) {
+            Ok(mut model) => {
+                model.threshold = threshold;
+                let fragment_count = model.fragments.len();
+                let detector = ScannerDetector::new(&model, &routes);
+                handle.store(Arc::new(detector));
+                last_mtime = Some(current_mtime);
+                tracing::info!(
+                    fragments = fragment_count,
+                    "scanner model hot-reloaded"
+                );
+            }
+            Err(e) => {
+                tracing::warn!(error = %e, "failed to reload scanner model; keeping current");
+            }
+        }
+    }
+}