feat(scanner): add model hot-reload and verified bot allowlist

ArcSwap-based lock-free hot-reload via file mtime polling. Bot
allowlist with CIDR (instant) + reverse/forward DNS (cached with
background worker thread) IP verification to prevent UA spoofing
by known crawlers, LLM agents, and commercial B2B bots.

Signed-off-by: Sienna Meridian Satterwhite <sienna@sunbeam.pt>
This commit is contained in:
2026-03-10 23:38:19 +00:00
parent 273a203c41
commit ae18b00fa4
2 changed files with 368 additions and 0 deletions

317
src/scanner/allowlist.rs Normal file
View File

@@ -0,0 +1,317 @@
use crate::config::BotAllowlistRule;
use crate::rate_limit::cidr::CidrBlock;
use rustc_hash::FxHashMap;
use std::net::IpAddr;
use std::sync::RwLock;
use std::time::{Duration, Instant};
/// A compiled bot allowlist rule (ready for hot-path matching).
struct CompiledRule {
ua_prefix_lower: String,
reason: String,
dns_suffixes: Vec<String>,
cidrs: Vec<CidrBlock>,
}
#[derive(Clone)]
struct CacheEntry {
rule_idx: usize,
verified: bool,
created: Instant,
}
/// Bot allowlist with CIDR verification (instant) and DNS verification (cached).
///
/// Safe to share via `Arc<BotAllowlist>`. Interior mutability is limited to:
/// - `verified_cache`: RwLock around the IP→verified map (write-rare, read-often)
/// - `pending_tx`: mpsc sender to queue background DNS verification
pub struct BotAllowlist {
rules: Vec<CompiledRule>,
verified_cache: RwLock<FxHashMap<IpAddr, CacheEntry>>,
pending_tx: std::sync::mpsc::Sender<PendingVerification>,
cache_ttl: Duration,
}
struct PendingVerification {
ip: IpAddr,
rule_idx: usize,
dns_suffixes: Vec<String>,
}
impl BotAllowlist {
/// Create the allowlist and spawn the background DNS verification worker.
/// Returns the allowlist wrapped in Arc (needed for cache sharing with worker).
pub fn spawn(rules: &[BotAllowlistRule], cache_ttl_secs: u64) -> std::sync::Arc<Self> {
let compiled: Vec<CompiledRule> = rules
.iter()
.map(|r| CompiledRule {
ua_prefix_lower: r.ua_prefix.to_ascii_lowercase(),
reason: r.reason.clone(),
dns_suffixes: r
.dns_suffixes
.iter()
.map(|s| s.to_ascii_lowercase())
.collect(),
cidrs: r
.cidrs
.iter()
.filter_map(|s| CidrBlock::parse(s))
.collect(),
})
.collect();
let (tx, rx) = std::sync::mpsc::channel();
let cache_ttl = Duration::from_secs(cache_ttl_secs);
let allowlist = std::sync::Arc::new(Self {
rules: compiled,
verified_cache: RwLock::new(FxHashMap::default()),
pending_tx: tx,
cache_ttl,
});
// Spawn background DNS verification worker.
let worker_cache = allowlist.clone();
std::thread::spawn(move || dns_verification_worker(rx, worker_cache));
allowlist
}
/// Check if a request from `ip` with `user_agent` matches a verified bot.
/// Returns the allowlist reason if verified, None otherwise.
///
/// Hot path: one lowercase + prefix scan + hash lookup or CIDR check.
pub fn check(&self, user_agent: &str, ip: IpAddr) -> Option<&str> {
let ua_lower = user_agent.to_ascii_lowercase();
for (idx, rule) in self.rules.iter().enumerate() {
if !ua_lower.starts_with(&rule.ua_prefix_lower) {
continue;
}
// UA matches. Now verify the IP.
// 1. No verification configured → UA match alone is sufficient.
if rule.cidrs.is_empty() && rule.dns_suffixes.is_empty() {
return Some(&rule.reason);
}
// 2. CIDR verification (instant).
if !rule.cidrs.is_empty() {
if rule.cidrs.iter().any(|c| c.contains(ip)) {
return Some(&rule.reason);
}
// CIDR didn't match. If no DNS suffixes configured, this is a
// spoofed UA — fall through to scanner model.
if rule.dns_suffixes.is_empty() {
return None;
}
}
// 3. DNS verification (cached).
if !rule.dns_suffixes.is_empty() {
// Check cache first.
let cache = self.verified_cache.read().unwrap_or_else(|e| e.into_inner());
if let Some(entry) = cache.get(&ip) {
if entry.created.elapsed() < self.cache_ttl {
if entry.verified && entry.rule_idx == idx {
return Some(&rule.reason);
}
// Cached as NOT verified → spoofed UA.
return None;
}
// Expired — fall through to re-queue.
}
drop(cache);
// Cache miss or expired → queue for background verification.
let _ = self.pending_tx.send(PendingVerification {
ip,
rule_idx: idx,
dns_suffixes: rule.dns_suffixes.clone(),
});
// First request from this IP → don't allowlist yet.
// The scanner model decides. Once DNS verifies, future
// requests get the allowlist.
return None;
}
}
None
}
/// Evict expired entries from the verified cache.
pub fn evict_stale(&self) {
let mut cache = self.verified_cache.write().unwrap_or_else(|e| e.into_inner());
cache.retain(|_, entry| entry.created.elapsed() < self.cache_ttl);
}
}
/// Background worker that processes DNS verification requests.
fn dns_verification_worker(
rx: std::sync::mpsc::Receiver<PendingVerification>,
allowlist: std::sync::Arc<BotAllowlist>,
) {
while let Ok(req) = rx.recv() {
// Skip if already cached and not expired.
{
let cache = allowlist
.verified_cache
.read()
.unwrap_or_else(|e| e.into_inner());
if let Some(entry) = cache.get(&req.ip) {
if entry.created.elapsed() < allowlist.cache_ttl {
continue;
}
}
}
let verified = verify_dns(req.ip, &req.dns_suffixes);
let entry = CacheEntry {
rule_idx: req.rule_idx,
verified,
created: Instant::now(),
};
let mut cache = allowlist
.verified_cache
.write()
.unwrap_or_else(|e| e.into_inner());
cache.insert(req.ip, entry);
if verified {
tracing::info!(
ip = %req.ip,
rule_idx = req.rule_idx,
"bot IP verified via reverse DNS"
);
} else {
tracing::debug!(
ip = %req.ip,
rule_idx = req.rule_idx,
"bot IP failed DNS verification (possible UA spoofing)"
);
}
}
}
/// Reverse DNS → check suffix → forward DNS → confirm IP.
fn verify_dns(ip: IpAddr, suffixes: &[String]) -> bool {
// Step 1: reverse DNS lookup.
let hostname = match dns_lookup::lookup_addr(&ip) {
Ok(name) => name.to_ascii_lowercase(),
Err(_) => return false,
};
// Step 2: hostname must end with one of the allowed suffixes.
let suffix_match = suffixes.iter().any(|s| {
hostname.ends_with(s) || hostname.ends_with(&format!(".{s}"))
});
if !suffix_match {
return false;
}
// Step 3: forward DNS — the hostname must resolve back to our IP.
match dns_lookup::lookup_host(&hostname) {
Ok(addrs) => addrs.iter().any(|a| *a == ip),
Err(_) => false,
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::config::BotAllowlistRule;
#[test]
fn test_ua_only_rule_matches() {
let rules = vec![BotAllowlistRule {
ua_prefix: "CCBot".into(),
reason: "commoncrawl".into(),
dns_suffixes: vec![],
cidrs: vec![],
}];
let al = BotAllowlist::spawn(&rules, 3600);
assert_eq!(
al.check("CCBot/2.0 (https://commoncrawl.org)", "1.2.3.4".parse().unwrap()),
Some("commoncrawl"),
);
assert_eq!(
al.check("Mozilla/5.0", "1.2.3.4".parse().unwrap()),
None,
);
}
#[test]
fn test_cidr_verification_matches() {
let rules = vec![BotAllowlistRule {
ua_prefix: "GPTBot".into(),
reason: "openai".into(),
dns_suffixes: vec![],
cidrs: vec!["23.98.0.0/16".into()],
}];
let al = BotAllowlist::spawn(&rules, 3600);
assert_eq!(
al.check("GPTBot/1.0", "23.98.1.2".parse().unwrap()),
Some("openai"),
);
// Wrong IP → spoofed UA
assert_eq!(
al.check("GPTBot/1.0", "1.2.3.4".parse().unwrap()),
None,
);
}
#[test]
fn test_cidr_verification_case_insensitive_ua() {
let rules = vec![BotAllowlistRule {
ua_prefix: "GPTBot".into(),
reason: "openai".into(),
dns_suffixes: vec![],
cidrs: vec!["23.98.0.0/16".into()],
}];
let al = BotAllowlist::spawn(&rules, 3600);
assert_eq!(
al.check("gptbot/1.0", "23.98.1.2".parse().unwrap()),
Some("openai"),
);
}
#[test]
fn test_dns_rule_returns_none_on_cache_miss() {
let rules = vec![BotAllowlistRule {
ua_prefix: "Googlebot".into(),
reason: "google".into(),
dns_suffixes: vec!["googlebot.com".into()],
cidrs: vec![],
}];
let al = BotAllowlist::spawn(&rules, 3600);
// First check → cache miss → queues DNS → returns None
assert_eq!(
al.check("Googlebot/2.1", "66.249.64.1".parse().unwrap()),
None,
);
}
#[test]
fn test_no_ua_match_returns_none() {
let rules = vec![BotAllowlistRule {
ua_prefix: "GPTBot".into(),
reason: "openai".into(),
dns_suffixes: vec![],
cidrs: vec!["23.98.0.0/16".into()],
}];
let al = BotAllowlist::spawn(&rules, 3600);
assert_eq!(
al.check("Mozilla/5.0 Chrome/120", "23.98.1.2".parse().unwrap()),
None,
);
}
#[test]
fn test_verify_dns_with_bad_ip() {
// This IP almost certainly won't reverse-resolve to googlebot.com
assert!(!verify_dns("127.0.0.1".parse().unwrap(), &["googlebot.com".into()]));
}
}

51
src/scanner/watcher.rs Normal file
View File

@@ -0,0 +1,51 @@
use crate::config::RouteConfig;
use crate::scanner::detector::ScannerDetector;
use crate::scanner::model::ScannerModel;
use arc_swap::ArcSwap;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
/// Poll the scanner model file for mtime changes and hot-swap the detector.
/// Runs forever on a dedicated OS thread — never returns.
pub fn watch_scanner_model(
handle: Arc<ArcSwap<ScannerDetector>>,
model_path: PathBuf,
threshold: f64,
routes: Vec<RouteConfig>,
poll_interval: Duration,
) {
let mut last_mtime = std::fs::metadata(&model_path)
.and_then(|m| m.modified())
.ok();
loop {
std::thread::sleep(poll_interval);
let current_mtime = match std::fs::metadata(&model_path).and_then(|m| m.modified()) {
Ok(t) => t,
Err(_) => continue,
};
if Some(current_mtime) == last_mtime {
continue;
}
match ScannerModel::load(&model_path) {
Ok(mut model) => {
model.threshold = threshold;
let fragment_count = model.fragments.len();
let detector = ScannerDetector::new(&model, &routes);
handle.store(Arc::new(detector));
last_mtime = Some(current_mtime);
tracing::info!(
fragments = fragment_count,
"scanner model hot-reloaded"
);
}
Err(e) => {
tracing::warn!(error = %e, "failed to reload scanner model; keeping current");
}
}
}
}