feat(scanner): add model hot-reload and verified bot allowlist
ArcSwap-based lock-free hot-reload via file mtime polling. Bot allowlist with CIDR (instant) + reverse/forward DNS (cached with background worker thread) IP verification to prevent UA spoofing by known crawlers, LLM agents, and commercial B2B bots. Signed-off-by: Sienna Meridian Satterwhite <sienna@sunbeam.pt>
This commit is contained in:
317
src/scanner/allowlist.rs
Normal file
317
src/scanner/allowlist.rs
Normal file
@@ -0,0 +1,317 @@
|
||||
use crate::config::BotAllowlistRule;
|
||||
use crate::rate_limit::cidr::CidrBlock;
|
||||
use rustc_hash::FxHashMap;
|
||||
use std::net::IpAddr;
|
||||
use std::sync::RwLock;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
/// A compiled bot allowlist rule (ready for hot-path matching).
|
||||
struct CompiledRule {
|
||||
ua_prefix_lower: String,
|
||||
reason: String,
|
||||
dns_suffixes: Vec<String>,
|
||||
cidrs: Vec<CidrBlock>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct CacheEntry {
|
||||
rule_idx: usize,
|
||||
verified: bool,
|
||||
created: Instant,
|
||||
}
|
||||
|
||||
/// Bot allowlist with CIDR verification (instant) and DNS verification (cached).
|
||||
///
|
||||
/// Safe to share via `Arc<BotAllowlist>`. Interior mutability is limited to:
|
||||
/// - `verified_cache`: RwLock around the IP→verified map (write-rare, read-often)
|
||||
/// - `pending_tx`: mpsc sender to queue background DNS verification
|
||||
pub struct BotAllowlist {
|
||||
rules: Vec<CompiledRule>,
|
||||
verified_cache: RwLock<FxHashMap<IpAddr, CacheEntry>>,
|
||||
pending_tx: std::sync::mpsc::Sender<PendingVerification>,
|
||||
cache_ttl: Duration,
|
||||
}
|
||||
|
||||
struct PendingVerification {
|
||||
ip: IpAddr,
|
||||
rule_idx: usize,
|
||||
dns_suffixes: Vec<String>,
|
||||
}
|
||||
|
||||
impl BotAllowlist {
|
||||
/// Create the allowlist and spawn the background DNS verification worker.
|
||||
/// Returns the allowlist wrapped in Arc (needed for cache sharing with worker).
|
||||
pub fn spawn(rules: &[BotAllowlistRule], cache_ttl_secs: u64) -> std::sync::Arc<Self> {
|
||||
let compiled: Vec<CompiledRule> = rules
|
||||
.iter()
|
||||
.map(|r| CompiledRule {
|
||||
ua_prefix_lower: r.ua_prefix.to_ascii_lowercase(),
|
||||
reason: r.reason.clone(),
|
||||
dns_suffixes: r
|
||||
.dns_suffixes
|
||||
.iter()
|
||||
.map(|s| s.to_ascii_lowercase())
|
||||
.collect(),
|
||||
cidrs: r
|
||||
.cidrs
|
||||
.iter()
|
||||
.filter_map(|s| CidrBlock::parse(s))
|
||||
.collect(),
|
||||
})
|
||||
.collect();
|
||||
|
||||
let (tx, rx) = std::sync::mpsc::channel();
|
||||
let cache_ttl = Duration::from_secs(cache_ttl_secs);
|
||||
|
||||
let allowlist = std::sync::Arc::new(Self {
|
||||
rules: compiled,
|
||||
verified_cache: RwLock::new(FxHashMap::default()),
|
||||
pending_tx: tx,
|
||||
cache_ttl,
|
||||
});
|
||||
|
||||
// Spawn background DNS verification worker.
|
||||
let worker_cache = allowlist.clone();
|
||||
std::thread::spawn(move || dns_verification_worker(rx, worker_cache));
|
||||
|
||||
allowlist
|
||||
}
|
||||
|
||||
/// Check if a request from `ip` with `user_agent` matches a verified bot.
|
||||
/// Returns the allowlist reason if verified, None otherwise.
|
||||
///
|
||||
/// Hot path: one lowercase + prefix scan + hash lookup or CIDR check.
|
||||
pub fn check(&self, user_agent: &str, ip: IpAddr) -> Option<&str> {
|
||||
let ua_lower = user_agent.to_ascii_lowercase();
|
||||
|
||||
for (idx, rule) in self.rules.iter().enumerate() {
|
||||
if !ua_lower.starts_with(&rule.ua_prefix_lower) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// UA matches. Now verify the IP.
|
||||
|
||||
// 1. No verification configured → UA match alone is sufficient.
|
||||
if rule.cidrs.is_empty() && rule.dns_suffixes.is_empty() {
|
||||
return Some(&rule.reason);
|
||||
}
|
||||
|
||||
// 2. CIDR verification (instant).
|
||||
if !rule.cidrs.is_empty() {
|
||||
if rule.cidrs.iter().any(|c| c.contains(ip)) {
|
||||
return Some(&rule.reason);
|
||||
}
|
||||
// CIDR didn't match. If no DNS suffixes configured, this is a
|
||||
// spoofed UA — fall through to scanner model.
|
||||
if rule.dns_suffixes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// 3. DNS verification (cached).
|
||||
if !rule.dns_suffixes.is_empty() {
|
||||
// Check cache first.
|
||||
let cache = self.verified_cache.read().unwrap_or_else(|e| e.into_inner());
|
||||
if let Some(entry) = cache.get(&ip) {
|
||||
if entry.created.elapsed() < self.cache_ttl {
|
||||
if entry.verified && entry.rule_idx == idx {
|
||||
return Some(&rule.reason);
|
||||
}
|
||||
// Cached as NOT verified → spoofed UA.
|
||||
return None;
|
||||
}
|
||||
// Expired — fall through to re-queue.
|
||||
}
|
||||
drop(cache);
|
||||
|
||||
// Cache miss or expired → queue for background verification.
|
||||
let _ = self.pending_tx.send(PendingVerification {
|
||||
ip,
|
||||
rule_idx: idx,
|
||||
dns_suffixes: rule.dns_suffixes.clone(),
|
||||
});
|
||||
|
||||
// First request from this IP → don't allowlist yet.
|
||||
// The scanner model decides. Once DNS verifies, future
|
||||
// requests get the allowlist.
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Evict expired entries from the verified cache.
|
||||
pub fn evict_stale(&self) {
|
||||
let mut cache = self.verified_cache.write().unwrap_or_else(|e| e.into_inner());
|
||||
cache.retain(|_, entry| entry.created.elapsed() < self.cache_ttl);
|
||||
}
|
||||
}
|
||||
|
||||
/// Background worker that processes DNS verification requests.
|
||||
fn dns_verification_worker(
|
||||
rx: std::sync::mpsc::Receiver<PendingVerification>,
|
||||
allowlist: std::sync::Arc<BotAllowlist>,
|
||||
) {
|
||||
while let Ok(req) = rx.recv() {
|
||||
// Skip if already cached and not expired.
|
||||
{
|
||||
let cache = allowlist
|
||||
.verified_cache
|
||||
.read()
|
||||
.unwrap_or_else(|e| e.into_inner());
|
||||
if let Some(entry) = cache.get(&req.ip) {
|
||||
if entry.created.elapsed() < allowlist.cache_ttl {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let verified = verify_dns(req.ip, &req.dns_suffixes);
|
||||
let entry = CacheEntry {
|
||||
rule_idx: req.rule_idx,
|
||||
verified,
|
||||
created: Instant::now(),
|
||||
};
|
||||
|
||||
let mut cache = allowlist
|
||||
.verified_cache
|
||||
.write()
|
||||
.unwrap_or_else(|e| e.into_inner());
|
||||
cache.insert(req.ip, entry);
|
||||
|
||||
if verified {
|
||||
tracing::info!(
|
||||
ip = %req.ip,
|
||||
rule_idx = req.rule_idx,
|
||||
"bot IP verified via reverse DNS"
|
||||
);
|
||||
} else {
|
||||
tracing::debug!(
|
||||
ip = %req.ip,
|
||||
rule_idx = req.rule_idx,
|
||||
"bot IP failed DNS verification (possible UA spoofing)"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Reverse DNS → check suffix → forward DNS → confirm IP.
|
||||
fn verify_dns(ip: IpAddr, suffixes: &[String]) -> bool {
|
||||
// Step 1: reverse DNS lookup.
|
||||
let hostname = match dns_lookup::lookup_addr(&ip) {
|
||||
Ok(name) => name.to_ascii_lowercase(),
|
||||
Err(_) => return false,
|
||||
};
|
||||
|
||||
// Step 2: hostname must end with one of the allowed suffixes.
|
||||
let suffix_match = suffixes.iter().any(|s| {
|
||||
hostname.ends_with(s) || hostname.ends_with(&format!(".{s}"))
|
||||
});
|
||||
if !suffix_match {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Step 3: forward DNS — the hostname must resolve back to our IP.
|
||||
match dns_lookup::lookup_host(&hostname) {
|
||||
Ok(addrs) => addrs.iter().any(|a| *a == ip),
|
||||
Err(_) => false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::config::BotAllowlistRule;
|
||||
|
||||
#[test]
|
||||
fn test_ua_only_rule_matches() {
|
||||
let rules = vec![BotAllowlistRule {
|
||||
ua_prefix: "CCBot".into(),
|
||||
reason: "commoncrawl".into(),
|
||||
dns_suffixes: vec![],
|
||||
cidrs: vec![],
|
||||
}];
|
||||
let al = BotAllowlist::spawn(&rules, 3600);
|
||||
assert_eq!(
|
||||
al.check("CCBot/2.0 (https://commoncrawl.org)", "1.2.3.4".parse().unwrap()),
|
||||
Some("commoncrawl"),
|
||||
);
|
||||
assert_eq!(
|
||||
al.check("Mozilla/5.0", "1.2.3.4".parse().unwrap()),
|
||||
None,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cidr_verification_matches() {
|
||||
let rules = vec![BotAllowlistRule {
|
||||
ua_prefix: "GPTBot".into(),
|
||||
reason: "openai".into(),
|
||||
dns_suffixes: vec![],
|
||||
cidrs: vec!["23.98.0.0/16".into()],
|
||||
}];
|
||||
let al = BotAllowlist::spawn(&rules, 3600);
|
||||
assert_eq!(
|
||||
al.check("GPTBot/1.0", "23.98.1.2".parse().unwrap()),
|
||||
Some("openai"),
|
||||
);
|
||||
// Wrong IP → spoofed UA
|
||||
assert_eq!(
|
||||
al.check("GPTBot/1.0", "1.2.3.4".parse().unwrap()),
|
||||
None,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cidr_verification_case_insensitive_ua() {
|
||||
let rules = vec![BotAllowlistRule {
|
||||
ua_prefix: "GPTBot".into(),
|
||||
reason: "openai".into(),
|
||||
dns_suffixes: vec![],
|
||||
cidrs: vec!["23.98.0.0/16".into()],
|
||||
}];
|
||||
let al = BotAllowlist::spawn(&rules, 3600);
|
||||
assert_eq!(
|
||||
al.check("gptbot/1.0", "23.98.1.2".parse().unwrap()),
|
||||
Some("openai"),
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dns_rule_returns_none_on_cache_miss() {
|
||||
let rules = vec![BotAllowlistRule {
|
||||
ua_prefix: "Googlebot".into(),
|
||||
reason: "google".into(),
|
||||
dns_suffixes: vec!["googlebot.com".into()],
|
||||
cidrs: vec![],
|
||||
}];
|
||||
let al = BotAllowlist::spawn(&rules, 3600);
|
||||
// First check → cache miss → queues DNS → returns None
|
||||
assert_eq!(
|
||||
al.check("Googlebot/2.1", "66.249.64.1".parse().unwrap()),
|
||||
None,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_no_ua_match_returns_none() {
|
||||
let rules = vec![BotAllowlistRule {
|
||||
ua_prefix: "GPTBot".into(),
|
||||
reason: "openai".into(),
|
||||
dns_suffixes: vec![],
|
||||
cidrs: vec!["23.98.0.0/16".into()],
|
||||
}];
|
||||
let al = BotAllowlist::spawn(&rules, 3600);
|
||||
assert_eq!(
|
||||
al.check("Mozilla/5.0 Chrome/120", "23.98.1.2".parse().unwrap()),
|
||||
None,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_verify_dns_with_bad_ip() {
|
||||
// This IP almost certainly won't reverse-resolve to googlebot.com
|
||||
assert!(!verify_dns("127.0.0.1".parse().unwrap(), &["googlebot.com".into()]));
|
||||
}
|
||||
}
|
||||
51
src/scanner/watcher.rs
Normal file
51
src/scanner/watcher.rs
Normal file
@@ -0,0 +1,51 @@
|
||||
use crate::config::RouteConfig;
|
||||
use crate::scanner::detector::ScannerDetector;
|
||||
use crate::scanner::model::ScannerModel;
|
||||
use arc_swap::ArcSwap;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Poll the scanner model file for mtime changes and hot-swap the detector.
|
||||
/// Runs forever on a dedicated OS thread — never returns.
|
||||
pub fn watch_scanner_model(
|
||||
handle: Arc<ArcSwap<ScannerDetector>>,
|
||||
model_path: PathBuf,
|
||||
threshold: f64,
|
||||
routes: Vec<RouteConfig>,
|
||||
poll_interval: Duration,
|
||||
) {
|
||||
let mut last_mtime = std::fs::metadata(&model_path)
|
||||
.and_then(|m| m.modified())
|
||||
.ok();
|
||||
|
||||
loop {
|
||||
std::thread::sleep(poll_interval);
|
||||
|
||||
let current_mtime = match std::fs::metadata(&model_path).and_then(|m| m.modified()) {
|
||||
Ok(t) => t,
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
if Some(current_mtime) == last_mtime {
|
||||
continue;
|
||||
}
|
||||
|
||||
match ScannerModel::load(&model_path) {
|
||||
Ok(mut model) => {
|
||||
model.threshold = threshold;
|
||||
let fragment_count = model.fragments.len();
|
||||
let detector = ScannerDetector::new(&model, &routes);
|
||||
handle.store(Arc::new(detector));
|
||||
last_mtime = Some(current_mtime);
|
||||
tracing::info!(
|
||||
fragments = fragment_count,
|
||||
"scanner model hot-reloaded"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(error = %e, "failed to reload scanner model; keeping current");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user