feat: configurable k8s resources, CSIC training pipeline, unified Dockerfile

- Make K8s namespace, TLS secret, and config ConfigMap names configurable
  via [kubernetes] config section (previously hardcoded to "ingress")
- Add CSIC 2010 dataset converter and auto-download for scanner training
- Unify Dockerfile for local and production builds (remove cross-compile path)
- Bake ML models directory into container image
- Update CSIC dataset URL to self-hosted mirror (src.sunbeam.pt)
- Fix rate_limit pipeline log missing fields
- Consolidate docs/README.md into root README.md

Signed-off-by: Sienna Meridian Satterwhite <sienna@sunbeam.pt>
This commit is contained in:
2026-03-10 23:38:20 +00:00
parent 0baab92141
commit a5810dd8a7
23 changed files with 946 additions and 514 deletions

View File

@@ -6,7 +6,7 @@ use std::{collections::HashMap, sync::{Arc, RwLock}};
/// Maps a challenge path to the backend address that can answer it.
///
/// Key: `/.well-known/acme-challenge/<token>`
/// Value: `cm-acme-http-solver-<hash>.ingress.svc.cluster.local:8089`
/// Value: `cm-acme-http-solver-<hash>.<namespace>.svc.cluster.local:8089`
///
/// cert-manager creates one Ingress per challenge domain with exactly this
/// path and backend. Our proxy consults this table to route each challenge
@@ -18,15 +18,15 @@ use std::{collections::HashMap, sync::{Arc, RwLock}};
/// can be written from the watcher runtime without cross-runtime waker issues.
pub type AcmeRoutes = Arc<RwLock<HashMap<String, String>>>;
/// Watch Ingress objects in the ingress namespace and maintain `routes`.
/// Watch Ingress objects and maintain `routes`.
///
/// cert-manager creates an Ingress for each HTTP-01 challenge it manages.
/// The Ingress contains a path rule for `/.well-known/acme-challenge/<token>`
/// pointing to a per-challenge solver Service. We populate the route table
/// from these rules so the proxy can forward each challenge token to the
/// correct solver pod without the nondeterminism of a shared stable Service.
pub async fn watch_ingresses(client: Client, routes: AcmeRoutes) {
let api: Api<Ingress> = Api::namespaced(client, "ingress");
pub async fn watch_ingresses(client: Client, namespace: String, routes: AcmeRoutes) {
let api: Api<Ingress> = Api::namespaced(client, &namespace);
// Verify Ingress API access before entering the watch loop. A failure here
// almost always means cert-manager is not installed or RBAC is wrong.
@@ -43,12 +43,9 @@ pub async fn watch_ingresses(client: Client, routes: AcmeRoutes) {
while let Some(result) = stream.next().await {
match result {
// InitApply fires for each Ingress during the initial list (kube v3+).
// Apply fires for subsequent creates/updates.
// Both must be handled to catch Ingresses that existed before the proxy started.
Ok(watcher::Event::InitApply(ing)) | Ok(watcher::Event::Apply(ing)) => {
let mut map = routes.write().unwrap_or_else(|e| e.into_inner());
upsert_routes(&ing, &mut map);
upsert_routes(&ing, &namespace, &mut map);
}
Ok(watcher::Event::Delete(ing)) => {
let mut map = routes.write().unwrap_or_else(|e| e.into_inner());
@@ -63,7 +60,7 @@ pub async fn watch_ingresses(client: Client, routes: AcmeRoutes) {
}
}
fn upsert_routes(ingress: &Ingress, map: &mut HashMap<String, String>) {
fn upsert_routes(ingress: &Ingress, namespace: &str, map: &mut HashMap<String, String>) {
let Some(spec) = &ingress.spec else { return };
for rule in spec.rules.as_deref().unwrap_or(&[]) {
let Some(http) = &rule.http else { continue };
@@ -75,7 +72,7 @@ fn upsert_routes(ingress: &Ingress, map: &mut HashMap<String, String>) {
let Some(svc) = p.backend.service.as_ref() else { continue };
let Some(port) = svc.port.as_ref().and_then(|p| p.number) else { continue };
let backend = format!(
"{}.ingress.svc.cluster.local:{port}",
"{}.{namespace}.svc.cluster.local:{port}",
svc.name
);
tracing::debug!(path, %backend, "added ACME challenge route");

View File

@@ -2,18 +2,23 @@ use anyhow::{Context, Result};
use k8s_openapi::api::core::v1::Secret;
use kube::{Api, Client};
/// Fetch the `pingora-tls` Secret from the ingress namespace and write
/// `tls.crt` / `tls.key` to the paths declared in config.toml.
/// Fetch the TLS Secret and write `tls.crt` / `tls.key` to the configured paths.
///
/// Called at startup (non-upgrade) so the proxy never depends on kubelet
/// volume-sync timing: the cert files are written directly from the K8s API
/// before `svc.add_tls()` is called.
pub async fn fetch_and_write(client: &Client, cert_path: &str, key_path: &str) -> Result<()> {
let api: Api<Secret> = Api::namespaced(client.clone(), "ingress");
pub async fn fetch_and_write(
client: &Client,
namespace: &str,
secret_name: &str,
cert_path: &str,
key_path: &str,
) -> Result<()> {
let api: Api<Secret> = Api::namespaced(client.clone(), namespace);
let secret = api
.get("pingora-tls")
.get(secret_name)
.await
.context("fetching pingora-tls Secret from K8s API")?;
.with_context(|| format!("fetching {secret_name} Secret from K8s API"))?;
write_from_secret(&secret, cert_path, key_path)
}
@@ -27,14 +32,14 @@ pub fn write_from_secret(secret: &Secret, cert_path: &str, key_path: &str) -> Re
let data = secret
.data
.as_ref()
.ok_or_else(|| anyhow::anyhow!("pingora-tls Secret has no data"))?;
.ok_or_else(|| anyhow::anyhow!("TLS Secret has no data"))?;
let crt = data
.get("tls.crt")
.ok_or_else(|| anyhow::anyhow!("pingora-tls missing tls.crt"))?;
.ok_or_else(|| anyhow::anyhow!("TLS Secret missing tls.crt"))?;
let key = data
.get("tls.key")
.ok_or_else(|| anyhow::anyhow!("pingora-tls missing tls.key"))?;
.ok_or_else(|| anyhow::anyhow!("TLS Secret missing tls.key"))?;
// /etc/tls is an emptyDir; create it if the pod just started.
if let Some(parent) = std::path::Path::new(cert_path).parent() {

View File

@@ -24,8 +24,38 @@ pub struct Config {
pub rate_limit: Option<RateLimitConfig>,
/// Optional per-request scanner detection.
pub scanner: Option<ScannerConfig>,
/// Kubernetes resource names and namespaces for watchers.
#[serde(default)]
pub kubernetes: KubernetesConfig,
}
#[derive(Debug, Deserialize, Clone)]
pub struct KubernetesConfig {
/// Namespace where the proxy's resources live (Secret, ConfigMap, Ingresses).
#[serde(default = "default_k8s_namespace")]
pub namespace: String,
/// Name of the TLS Secret watched for cert hot-reload.
#[serde(default = "default_tls_secret")]
pub tls_secret: String,
/// Name of the ConfigMap watched for config hot-reload.
#[serde(default = "default_config_configmap")]
pub config_configmap: String,
}
impl Default for KubernetesConfig {
fn default() -> Self {
Self {
namespace: default_k8s_namespace(),
tls_secret: default_tls_secret(),
config_configmap: default_config_configmap(),
}
}
}
fn default_k8s_namespace() -> String { "ingress".to_string() }
fn default_tls_secret() -> String { "pingora-tls".to_string() }
fn default_config_configmap() -> String { "pingora-config".to_string() }
#[derive(Debug, Deserialize, Clone)]
pub struct DDoSConfig {
pub model_path: String,

View File

@@ -39,6 +39,7 @@ impl DDoSDetector {
/// Record an incoming request and classify the IP.
/// Called from request_filter (before upstream).
#[allow(clippy::too_many_arguments)]
pub fn check(
&self,
ip: IpAddr,

View File

@@ -70,6 +70,10 @@ impl IpState {
self.events.len()
}
pub fn is_empty(&self) -> bool {
self.events.is_empty()
}
/// Prune events older than `window` from the logical view.
/// Returns a slice of active events (not necessarily contiguous in ring buffer,
/// so we collect into a Vec).
@@ -274,6 +278,12 @@ pub struct LogIpState {
pub suspicious_paths: Vec<bool>,
}
impl Default for LogIpState {
fn default() -> Self {
Self::new()
}
}
impl LogIpState {
pub fn new() -> Self {
Self {

View File

@@ -113,7 +113,7 @@ pub fn run(args: TrainArgs) -> Result<()> {
let ip = audit_log::strip_port(&entry.fields.client_ip).to_string();
let ts = parse_timestamp(&entry.timestamp);
let state = ip_states.entry(ip).or_insert_with(LogIpState::new);
let state = ip_states.entry(ip).or_default();
state.timestamps.push(ts);
state.methods.push(method_to_u8(&entry.fields.method));
state.path_hashes.push(fx_hash(&entry.fields.path));

View File

@@ -33,7 +33,7 @@ enum Commands {
upgrade: bool,
},
/// Replay audit logs through the DDoS detector and rate limiter
Replay {
ReplayDdos {
/// Path to audit log JSONL file
#[arg(short, long)]
input: String,
@@ -60,7 +60,7 @@ enum Commands {
rate_limit: bool,
},
/// Train a DDoS detection model from audit logs
Train {
TrainDdos {
/// Path to audit log JSONL file
#[arg(short, long)]
input: String,
@@ -103,6 +103,9 @@ enum Commands {
/// Classification threshold
#[arg(long, default_value = "0.5")]
threshold: f64,
/// Include CSIC 2010 dataset as base training data (downloaded from the configured mirror, cached locally)
#[arg(long)]
csic: bool,
},
}
@@ -110,7 +113,7 @@ fn main() -> Result<()> {
let cli = Cli::parse();
match cli.command.unwrap_or(Commands::Serve { upgrade: false }) {
Commands::Serve { upgrade } => run_serve(upgrade),
Commands::Replay {
Commands::ReplayDdos {
input,
model,
config,
@@ -129,7 +132,7 @@ fn main() -> Result<()> {
min_events,
rate_limit,
}),
Commands::Train {
Commands::TrainDdos {
input,
output,
attack_ips,
@@ -155,11 +158,13 @@ fn main() -> Result<()> {
output,
wordlists,
threshold,
csic,
} => scanner::train::run(scanner::train::TrainScannerArgs {
input,
output,
wordlists,
threshold,
csic,
}),
}
}
@@ -309,7 +314,13 @@ fn run_serve(upgrade: bool) -> Result<()> {
Ok(c) => {
if !upgrade {
if let Err(e) =
cert::fetch_and_write(&c, &cfg.tls.cert_path, &cfg.tls.key_path).await
cert::fetch_and_write(
&c,
&cfg.kubernetes.namespace,
&cfg.kubernetes.tls_secret,
&cfg.tls.cert_path,
&cfg.tls.key_path,
).await
{
tracing::warn!(error = %e, "cert fetch from K8s failed; using existing files");
}
@@ -405,6 +416,7 @@ fn run_serve(upgrade: bool) -> Result<()> {
// 6. Background K8s watchers on their own OS thread + tokio runtime.
if k8s_available {
let k8s_cfg = cfg.kubernetes.clone();
let cert_path = cfg.tls.cert_path.clone();
let key_path = cfg.tls.key_path.clone();
std::thread::spawn(move || {
@@ -421,8 +433,19 @@ fn run_serve(upgrade: bool) -> Result<()> {
}
};
tokio::join!(
acme::watch_ingresses(client.clone(), acme_routes),
watcher::run_watcher(client, cert_path, key_path),
acme::watch_ingresses(
client.clone(),
k8s_cfg.namespace.clone(),
acme_routes,
),
watcher::run_watcher(
client,
k8s_cfg.namespace,
k8s_cfg.tls_secret,
k8s_cfg.config_configmap,
cert_path,
key_path,
),
);
});
});

View File

@@ -410,7 +410,7 @@ impl ProxyHttp for SunbeamProxy {
);
metrics::SCANNER_DECISIONS
.with_label_values(&[decision, &reason])
.with_label_values(&[decision, reason])
.inc();
if decision == "block" {
@@ -447,7 +447,11 @@ impl ProxyHttp for SunbeamProxy {
path = %session.req_header().uri.path(),
client_ip = %ip,
user_agent = session.req_header().headers.get("user-agent").and_then(|v| v.to_str().ok()).unwrap_or("-"),
content_length = session.req_header().headers.get("content-length").and_then(|v| v.to_str().ok()).unwrap_or("0"),
has_cookies = cookie.is_some(),
has_referer = session.req_header().headers.get("referer").is_some(),
has_accept_language = session.req_header().headers.get("accept-language").is_some(),
accept = session.req_header().headers.get("accept").and_then(|v| v.to_str().ok()).unwrap_or("-"),
"pipeline"
);

View File

@@ -214,7 +214,7 @@ fn verify_dns(ip: IpAddr, suffixes: &[String]) -> bool {
// Step 3: forward DNS — the hostname must resolve back to our IP.
match dns_lookup::lookup_host(&hostname) {
Ok(addrs) => addrs.iter().any(|a| *a == ip),
Ok(addrs) => addrs.contains(&ip),
Err(_) => false,
}
}

411
src/scanner/csic.rs Normal file
View File

@@ -0,0 +1,411 @@
//! Fetch and convert the CSIC 2010 HTTP dataset into labeled training samples.
//!
//! The CSIC 2010 dataset contains raw HTTP/1.1 requests (normal + anomalous)
//! from a web application. When `--csic` is passed to the scanner training
//! command, this module downloads the dataset from the self-hosted mirror
//! (see `REPO_BASE`), parses the raw HTTP requests, and converts them into
//! `AuditFields` entries with ground-truth labels.
use crate::ddos::audit_log::AuditFields;
use anyhow::{Context, Result};
use std::path::PathBuf;
// Base URL of the self-hosted CSIC 2010 dataset mirror (raw-file endpoint).
const REPO_BASE: &str =
    "https://src.sunbeam.pt/studio/csic-dataset/raw/branch/mainline";

// Dataset files to fetch, each paired with its ground-truth traffic label
// ("normal" or "anomalous").
const FILES: &[(&str, &str)] = &[
    ("normalTrafficTraining.txt", "normal"),
    ("normalTrafficTest.txt", "normal"),
    ("anomalousTrafficTest.txt", "anomalous"),
];

// Subdomain prefixes used by `to_audit_fields` to synthesize plausible
// `<prefix>.sunbeam.pt` Host values for training samples.
const DEFAULT_HOSTS: &[&str] = &[
    "admin", "src", "docs", "auth", "drive", "grafana", "people", "meet", "s3", "livekit",
];
/// Directory where downloaded CSIC files are cached.
///
/// Honors `XDG_CACHE_HOME` when set; otherwise falls back to
/// `$HOME/.cache` (or `/tmp/.cache` when `HOME` is unset), then
/// appends the `sunbeam/csic` subpath.
fn cache_dir() -> PathBuf {
    let base = match std::env::var("XDG_CACHE_HOME") {
        Ok(dir) => PathBuf::from(dir),
        Err(_) => {
            let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
            PathBuf::from(home).join(".cache")
        }
    };
    base.join("sunbeam").join("csic")
}
/// Return the contents of `filename`, downloading it from the mirror on a
/// cache miss and persisting the body to `cache_dir()` for later runs.
///
/// # Errors
/// Fails on cache-read errors, HTTP transport errors, non-success HTTP
/// status, or failure to write the cache file.
fn download_or_cached(filename: &str) -> Result<String> {
    let dir = cache_dir();
    let path = dir.join(filename);
    if path.exists() {
        eprintln!(" cached: {}", path.display());
        return std::fs::read_to_string(&path)
            .with_context(|| format!("reading cached {}", path.display()));
    }
    // Bug fix: the filename was previously not interpolated into the URL,
    // so every download requested a nonexistent path on the mirror.
    let url = format!("{REPO_BASE}/{filename}");
    eprintln!(" downloading: {url}");
    let body = reqwest::blocking::get(&url)
        .with_context(|| format!("fetching {url}"))?
        .error_for_status()
        .with_context(|| format!("HTTP error for {url}"))?
        .text()
        .with_context(|| format!("reading body of {url}"))?;
    std::fs::create_dir_all(&dir)?;
    std::fs::write(&path, &body)?;
    Ok(body)
}
/// One raw HTTP request parsed out of a CSIC dataset file.
struct ParsedRequest {
    method: String,          // request method from the request line, e.g. "GET"
    path: String,            // URL path with any query string removed
    query: String,           // query string without the leading '?'; empty when absent
    user_agent: String,      // User-Agent header, or "-" when absent
    has_cookies: bool,       // true when a Cookie header was present
    content_length: u64,     // Content-Length header, or summed body-line length as fallback
    referer: String,         // Referer header, or "-" when absent
    accept_language: String, // Accept-Language header, or "-" when absent
}
/// Split a CSIC dataset file into individual parsed requests.
///
/// Requests are separated by blank lines. Each run of non-empty lines is
/// handed to `parse_single_request`; unparseable runs are silently skipped.
///
/// Bug fix: blank lines are now treated purely as separators. Previously a
/// blank line arriving while the accumulator was empty was *pushed into* the
/// accumulator, so a request preceded by a stray leading blank line started
/// with `""` and was dropped by `parse_single_request`.
fn parse_csic_content(content: &str) -> Vec<ParsedRequest> {
    let mut requests = Vec::new();
    let mut current_lines: Vec<&str> = Vec::new();
    for line in content.lines() {
        if line.is_empty() {
            // Blank line terminates the current request block, if any.
            if !current_lines.is_empty() {
                if let Some(req) = parse_single_request(&current_lines) {
                    requests.push(req);
                }
                current_lines.clear();
            }
        } else {
            current_lines.push(line);
        }
    }
    // Trailing block with no terminating blank line.
    if !current_lines.is_empty() {
        if let Some(req) = parse_single_request(&current_lines) {
            requests.push(req);
        }
    }
    requests
}
/// Parse one raw HTTP request (request line + headers + optional body) into
/// a `ParsedRequest`.
///
/// Returns `None` when `lines` is empty or the first line does not contain
/// at least a method and a URL separated by spaces.
fn parse_single_request(lines: &[&str]) -> Option<ParsedRequest> {
    if lines.is_empty() {
        return None;
    }
    // Request line: "<METHOD> <URL> <VERSION>"; splitn(3) keeps the version
    // (and anything after it) in a single third part.
    let parts: Vec<&str> = lines[0].splitn(3, ' ').collect();
    if parts.len() < 2 {
        return None;
    }
    let method = parts[0].to_string();
    let raw_url = parts[1];
    // Extract path and query — URL may be absolute (http://localhost:8080/path?q=1)
    let (path, query) = if let Some(rest) = raw_url.strip_prefix("http://") {
        // Skip host portion; an authority with no path becomes "/".
        let after_host = rest.find('/').map(|i| &rest[i..]).unwrap_or("/");
        split_path_query(after_host)
    } else if let Some(rest) = raw_url.strip_prefix("https://") {
        let after_host = rest.find('/').map(|i| &rest[i..]).unwrap_or("/");
        split_path_query(after_host)
    } else {
        split_path_query(raw_url)
    };
    // Parse headers: every "Key: Value" line until the first blank line.
    // Lines without a colon are ignored.
    let mut headers: Vec<(&str, &str)> = Vec::new();
    let mut body_start = None;
    for (i, line) in lines[1..].iter().enumerate() {
        if line.is_empty() {
            body_start = Some(i + 2); // +2 because we started from lines[1..]
            break;
        }
        if let Some(colon) = line.find(':') {
            let key = line[..colon].trim();
            let value = line[colon + 1..].trim();
            headers.push((key, value));
        }
    }
    // Case-insensitive header lookup (HTTP header names are case-insensitive).
    let get_header = |name: &str| -> Option<&str> {
        headers
            .iter()
            .find(|(k, _)| k.eq_ignore_ascii_case(name))
            .map(|(_, v)| *v)
    };
    // Sum the byte length of the body lines (newlines not counted) as a
    // fallback length when the Content-Length header is absent or invalid.
    let body_len = if let Some(start) = body_start {
        if start < lines.len() {
            lines[start..].iter().map(|l| l.len()).sum::<usize>() as u64
        } else {
            0
        }
    } else {
        0
    };
    let content_length = get_header("Content-Length")
        .and_then(|v| v.parse().ok())
        .unwrap_or(body_len);
    // Missing string-valued headers are represented as "-" downstream.
    Some(ParsedRequest {
        method,
        path,
        query,
        user_agent: get_header("User-Agent").unwrap_or("-").to_string(),
        has_cookies: get_header("Cookie").is_some(),
        content_length,
        referer: get_header("Referer").unwrap_or("-").to_string(),
        accept_language: get_header("Accept-Language").unwrap_or("-").to_string(),
    })
}
/// Split a request target into `(path, query)` at the first `'?'`.
///
/// The `'?'` itself is dropped; the query is the empty string when absent.
fn split_path_query(url: &str) -> (String, String) {
    match url.split_once('?') {
        Some((path, query)) => (path.to_string(), query.to_string()),
        None => (url.to_string(), String::new()),
    }
}
/// Simple deterministic LCG for reproducible randomness without pulling in `rand`.
struct Rng(u64);

impl Rng {
    /// Seed the generator; identical seeds yield identical sequences.
    fn new(seed: u64) -> Self {
        Rng(seed)
    }

    /// Advance the LCG one step and return the new state.
    fn next_u64(&mut self) -> u64 {
        const MUL: u64 = 6364136223846793005;
        const INC: u64 = 1442695040888963407;
        self.0 = self.0.wrapping_mul(MUL).wrapping_add(INC);
        self.0
    }

    /// Index in `[0, bound)` from the high bits of the state
    /// (modulo bias is acceptable for training-data synthesis).
    fn next_usize(&mut self, bound: usize) -> usize {
        let bits = self.next_u64() >> 33;
        bits as usize % bound
    }

    /// Float in `[0, 1)` built from the top 53 bits of the state.
    fn next_f64(&mut self) -> f64 {
        let bits = self.next_u64() >> 11;
        bits as f64 / (1u64 << 53) as f64
    }

    /// Pick one element of `items` pseudo-uniformly.
    fn choice<'a>(&mut self, items: &'a [&str]) -> &'a str {
        let idx = self.next_usize(items.len());
        items[idx]
    }
}
/// Convert one parsed CSIC request into an `AuditFields` training sample.
///
/// `label` is "normal" or "anomalous" (anything non-"normal" is treated as an
/// attack); `hosts` supplies subdomain prefixes for synthesizing Host values.
/// Status codes, client IPs, durations, and header stripping are derived from
/// `rng`, so the output is fully deterministic for a fixed seed.
fn to_audit_fields(
    req: &ParsedRequest,
    label: &str,
    hosts: &[&str],
    rng: &mut Rng,
) -> AuditFields {
    // Normal traffic: known host prefix and mostly-success statuses.
    // Anomalous traffic: ~70% unknown-looking host, error statuses.
    let (host_prefix, status) = if label == "normal" {
        let host = rng.choice(hosts).to_string();
        let statuses: &[u16] = &[200, 200, 200, 200, 301, 304];
        let status = statuses[rng.next_usize(statuses.len())];
        (host, status)
    } else {
        let host = if rng.next_f64() < 0.7 {
            let unknown: &[&str] = &["unknown", "scanner", "probe", "test"];
            rng.choice(unknown).to_string()
        } else {
            rng.choice(hosts).to_string()
        };
        let statuses: &[u16] = &[404, 404, 404, 400, 403, 500];
        let status = statuses[rng.next_usize(statuses.len())];
        (host, status)
    };
    let host = format!("{host_prefix}.sunbeam.pt");
    // For anomalous samples, simulate real scanner behavior:
    // strip cookies/referer/accept-language that CSIC attacks have from their session.
    let (has_cookies, referer, accept_language, user_agent) = if label != "normal" {
        let referer = None;
        // Keep Accept-Language only 20% of the time, and only if it was present.
        let accept_language = if rng.next_f64() < 0.8 {
            None
        } else {
            Some(req.accept_language.clone()).filter(|a| a != "-")
        };
        // Mix in empty/tool user agents (empty 15%, curl 10%, python-requests
        // 10%, Go 5%); otherwise keep the original UA.
        let r = rng.next_f64();
        let user_agent = if r < 0.15 {
            String::new()
        } else if r < 0.25 {
            "curl/7.68.0".to_string()
        } else if r < 0.35 {
            "python-requests/2.28.0".to_string()
        } else if r < 0.40 {
            "Go-http-client/1.1".to_string()
        } else {
            req.user_agent.clone()
        };
        (false, referer, accept_language, user_agent)
    } else {
        // Normal samples keep the request's own headers ("-" means absent).
        (
            req.has_cookies,
            Some(req.referer.clone()).filter(|r| r != "-"),
            Some(req.accept_language.clone()).filter(|a| a != "-"),
            req.user_agent.clone(),
        )
    };
    AuditFields {
        method: req.method.clone(),
        host,
        path: req.path.clone(),
        query: req.query.clone(),
        // Synthetic IPv4: first octet in 1..=223, last octet in 1..=254.
        client_ip: format!(
            "{}.{}.{}.{}",
            rng.next_usize(223) + 1,
            rng.next_usize(256),
            rng.next_usize(256),
            rng.next_usize(254) + 1,
        ),
        status,
        duration_ms: rng.next_usize(50) as u64 + 1,
        content_length: req.content_length,
        user_agent,
        has_cookies: Some(has_cookies),
        referer,
        accept_language,
        // Normal traffic routes to a synthetic backend; attacks get "-".
        backend: if label == "normal" {
            format!("{host_prefix}-svc:8080")
        } else {
            "-".to_string()
        },
        // Ground-truth label: "anomalous" is normalized to "attack".
        label: Some(
            if label == "normal" { "normal" } else { "attack" }.to_string(),
        ),
    }
}
/// Download (or use cached) CSIC 2010 dataset, parse raw HTTP requests,
/// and convert into labeled `AuditFields` entries ready for scanner training.
///
/// Returns `(fields, host_prefix)` pairs, deterministically shuffled
/// (seed 42) so normal and attack samples are interleaved.
///
/// # Errors
/// Propagates download / cache-read failures from `download_or_cached`.
pub fn fetch_csic_dataset() -> Result<Vec<(AuditFields, String)>> {
    eprintln!("fetching CSIC 2010 dataset...");
    let mut rng = Rng::new(42);
    let mut all_entries: Vec<(AuditFields, String)> = Vec::new();
    for (filename, label) in FILES {
        let content = download_or_cached(filename)?;
        let requests = parse_csic_content(&content);
        // Bug fix: the progress line previously printed a placeholder
        // instead of the file name; interpolate `filename`.
        eprintln!(" parsed {} {label} requests from {filename}", requests.len());
        for req in &requests {
            let fields = to_audit_fields(req, label, DEFAULT_HOSTS, &mut rng);
            let host_prefix = fields.host.split('.').next().unwrap_or("").to_string();
            all_entries.push((fields, host_prefix));
        }
    }
    // Fisher-Yates shuffle to interleave normal/attack samples deterministically.
    let n = all_entries.len();
    for i in (1..n).rev() {
        let j = rng.next_usize(i + 1);
        all_entries.swap(i, j);
    }
    eprintln!(
        "CSIC total: {} ({} normal, {} attack)",
        all_entries.len(),
        all_entries.iter().filter(|(f, _)| f.label.as_deref() == Some("normal")).count(),
        all_entries.iter().filter(|(f, _)| f.label.as_deref() == Some("attack")).count(),
    );
    Ok(all_entries)
}
/// Check whether every CSIC dataset file is already present in the cache.
pub fn csic_is_cached() -> bool {
    let dir = cache_dir();
    for (name, _) in FILES {
        if !dir.join(name).exists() {
            return false;
        }
    }
    true
}
/// Return the cache directory path (same as the internal `cache_dir`),
/// exposed so callers can show the user where files are stored.
pub fn csic_cache_path() -> PathBuf {
    cache_dir()
}
#[cfg(test)]
mod tests {
    use super::*;

    // Request line + headers parse into the expected ParsedRequest fields.
    #[test]
    fn test_parse_single_http_request() {
        let lines = vec![
            "GET /index.html HTTP/1.1",
            "Host: localhost:8080",
            "User-Agent: Mozilla/5.0",
            "Cookie: session=abc",
            "Accept: text/html",
        ];
        let req = parse_single_request(&lines).unwrap();
        assert_eq!(req.method, "GET");
        assert_eq!(req.path, "/index.html");
        assert!(req.has_cookies);
        assert_eq!(req.user_agent, "Mozilla/5.0");
    }

    // Absolute-form URLs (scheme + host) have the host stripped before
    // path/query extraction.
    #[test]
    fn test_parse_absolute_url() {
        let lines = vec!["POST http://localhost:8080/tienda1/miembros/editar.jsp?id=2 HTTP/1.1"];
        let req = parse_single_request(&lines).unwrap();
        assert_eq!(req.method, "POST");
        assert_eq!(req.path, "/tienda1/miembros/editar.jsp");
        assert_eq!(req.query, "id=2");
    }

    // Blank lines separate requests within one dataset file.
    #[test]
    fn test_parse_csic_content_multiple_requests() {
        let content = "GET /page1 HTTP/1.1\nHost: localhost\n\nPOST /page2 HTTP/1.1\nHost: localhost\n\n";
        let reqs = parse_csic_content(content);
        assert_eq!(reqs.len(), 2);
        assert_eq!(reqs[0].method, "GET");
        assert_eq!(reqs[1].method, "POST");
    }

    // Normal-labeled samples keep their session headers and get a known host.
    #[test]
    fn test_to_audit_fields_normal() {
        let req = ParsedRequest {
            method: "GET".to_string(),
            path: "/index.html".to_string(),
            query: String::new(),
            user_agent: "Mozilla/5.0".to_string(),
            has_cookies: true,
            content_length: 100,
            referer: "https://example.com".to_string(),
            accept_language: "en-US".to_string(),
        };
        let mut rng = Rng::new(42);
        let fields = to_audit_fields(&req, "normal", DEFAULT_HOSTS, &mut rng);
        assert_eq!(fields.label.as_deref(), Some("normal"));
        assert!(fields.has_cookies.unwrap_or(false));
        assert!(fields.host.ends_with(".sunbeam.pt"));
    }

    // Anomalous samples are relabeled "attack" and have cookies stripped
    // to mimic sessionless scanner traffic.
    #[test]
    fn test_to_audit_fields_anomalous_strips_cookies() {
        let req = ParsedRequest {
            method: "GET".to_string(),
            path: "/.env".to_string(),
            query: String::new(),
            user_agent: "Mozilla/5.0".to_string(),
            has_cookies: true,
            content_length: 0,
            referer: "https://example.com".to_string(),
            accept_language: "en-US".to_string(),
        };
        let mut rng = Rng::new(42);
        let fields = to_audit_fields(&req, "anomalous", DEFAULT_HOSTS, &mut rng);
        assert_eq!(fields.label.as_deref(), Some("attack"));
        assert!(!fields.has_cookies.unwrap_or(true));
    }

    // Identical seeds must produce identical sequences (training reproducibility).
    #[test]
    fn test_rng_deterministic() {
        let mut a = Rng::new(42);
        let mut b = Rng::new(42);
        for _ in 0..100 {
            assert_eq!(a.next_u64(), b.next_u64());
        }
    }
}

View File

@@ -50,6 +50,7 @@ impl ScannerDetector {
/// Returns a verdict with the action, raw score, and reason.
/// The score and reason are captured in pipeline logs so the training
/// pipeline always has unfiltered data to retrain from.
#[allow(clippy::too_many_arguments)]
pub fn check(
&self,
method: &str,
@@ -107,8 +108,8 @@ impl ScannerDetector {
// 3. Compute score = bias + dot(weights, features) + interaction terms
let mut score = self.weights[NUM_SCANNER_FEATURES + 2]; // bias (index 14)
for i in 0..NUM_SCANNER_FEATURES {
score += self.weights[i] * f[i];
for (i, &fi) in f.iter().enumerate().take(NUM_SCANNER_FEATURES) {
score += self.weights[i] * fi;
}
// Interaction: suspicious_path AND no_cookies
score += self.weights[12] * f[0] * (1.0 - f[3]);

View File

@@ -14,6 +14,7 @@ const TRAVERSAL_PATTERNS: &[&str] = &["..", "%00", "%0a", "%27", "%3c"];
/// Extract all 12 scanner features from a single request.
/// No heap allocation — all work done on references and stack buffers.
#[allow(clippy::too_many_arguments)]
pub fn extract_features(
method: &str,
path: &str,

View File

@@ -1,4 +1,5 @@
pub mod allowlist;
pub mod csic;
pub mod detector;
pub mod features;
pub mod model;

View File

@@ -14,6 +14,7 @@ pub struct TrainScannerArgs {
pub output: String,
pub wordlists: Option<String>,
pub threshold: f64,
pub csic: bool,
}
/// Default suspicious fragments — matches the DDoS feature list plus extras.
@@ -135,6 +136,54 @@ pub fn run(args: TrainScannerArgs) -> Result<()> {
}
}
// 1b. Optionally fetch CSIC 2010 dataset and add labeled entries
if args.csic {
let csic_entries = crate::scanner::csic::fetch_csic_dataset()?;
for (_, host_prefix) in &csic_entries {
log_hosts.insert(fx_hash_bytes(host_prefix.as_bytes()));
}
for (fields, host_prefix) in &csic_entries {
let has_cookies = fields.has_cookies.unwrap_or(false);
let has_referer = fields
.referer
.as_ref()
.map(|r| r != "-" && !r.is_empty())
.unwrap_or(false);
let has_accept_language = fields
.accept_language
.as_ref()
.map(|a| a != "-" && !a.is_empty())
.unwrap_or(false);
let feats = features::extract_features(
&fields.method,
&fields.path,
host_prefix,
has_cookies,
has_referer,
has_accept_language,
"-",
&fields.user_agent,
fields.content_length,
&fragment_hashes,
&extension_hashes,
&log_hosts,
);
// CSIC entries always have a ground-truth label.
let label = match fields.label.as_deref() {
Some("attack" | "anomalous") => 1.0,
Some("normal") => 0.0,
_ => continue,
};
samples.push(LabeledSample {
features: feats,
label,
});
}
}
let log_sample_count = samples.len();
let log_attack_count = samples.iter().filter(|s| s.label > 0.5).count();
eprintln!(
@@ -408,6 +457,7 @@ fn train_logistic_regression(
weights
}
#[allow(clippy::too_many_arguments)]
fn label_request(
path: &str,
has_cookies: bool,

View File

@@ -11,12 +11,12 @@ pub async fn run_tcp_proxy(listen: &str, backend: &str) {
let ipv6_addr = if listen.starts_with('[') {
listen.to_string()
} else {
format!("[::]:{}", listen.split(':').last().unwrap_or("22"))
format!("[::]:{}", listen.split(':').next_back().unwrap_or("22"))
};
let ipv4_addr = if listen.contains(':') {
// Extract port from the original address
let port = listen.split(':').last().unwrap_or("22");
let port = listen.split(':').next_back().unwrap_or("22");
format!("0.0.0.0:{}", port)
} else {
"0.0.0.0:22".to_string()

View File

@@ -3,7 +3,7 @@ use k8s_openapi::api::core::v1::{ConfigMap, Secret};
use kube::{runtime::watcher, Api, Client};
use tokio::sync::mpsc;
/// Watch `pingora-tls` and `pingora-config` in the ingress namespace.
/// Watch the TLS Secret and config ConfigMap for changes.
///
/// On cert change: write new cert bytes from the Apply event directly to the
/// configured paths (avoiding kubelet volume-sync delay), then trigger a
@@ -15,14 +15,21 @@ use tokio::sync::mpsc;
///
/// No-ops when no K8s client is available (e.g. ad-hoc local runs outside a
/// cluster) so the binary works in both environments.
pub async fn run_watcher(client: Client, cert_path: String, key_path: String) {
pub async fn run_watcher(
client: Client,
namespace: String,
tls_secret: String,
config_configmap: String,
cert_path: String,
key_path: String,
) {
let (tx, mut rx) = mpsc::channel::<()>(2);
let secret_api: Api<Secret> = Api::namespaced(client.clone(), "ingress");
let cm_api: Api<ConfigMap> = Api::namespaced(client.clone(), "ingress");
let secret_api: Api<Secret> = Api::namespaced(client.clone(), &namespace);
let cm_api: Api<ConfigMap> = Api::namespaced(client.clone(), &namespace);
tokio::spawn(watch_secret(secret_api, cert_path, key_path, tx.clone()));
tokio::spawn(watch_configmap(cm_api, tx));
tokio::spawn(watch_secret(secret_api, tls_secret, cert_path, key_path, tx.clone()));
tokio::spawn(watch_configmap(cm_api, config_configmap, tx));
if rx.recv().await.is_some() {
tracing::info!("initiating graceful upgrade");
@@ -32,11 +39,13 @@ pub async fn run_watcher(client: Client, cert_path: String, key_path: String) {
async fn watch_secret(
api: Api<Secret>,
secret_name: String,
cert_path: String,
key_path: String,
tx: mpsc::Sender<()>,
) {
let cfg = watcher::Config::default().fields("metadata.name=pingora-tls");
let field_selector = format!("metadata.name={secret_name}");
let cfg = watcher::Config::default().fields(&field_selector);
let mut stream = Box::pin(watcher(api, cfg));
let mut initialized = false;
@@ -44,14 +53,10 @@ async fn watch_secret(
match result {
Ok(watcher::Event::InitDone) => {
initialized = true;
tracing::debug!("pingora-tls watcher ready");
tracing::debug!(%secret_name, "TLS secret watcher ready");
}
// Write the new cert directly from the event object before triggering the
// upgrade. The Apply event carries the full updated Secret, so we don't
// need a separate API call and the cert files are ready before the new
// process's svc.add_tls() runs.
Ok(watcher::Event::Apply(secret)) if initialized => {
tracing::info!("pingora-tls changed — writing new cert");
tracing::info!(%secret_name, "TLS secret changed — writing new cert");
match crate::cert::write_from_secret(&secret, &cert_path, &key_path) {
Ok(()) => {
let _ = tx.send(()).await;
@@ -69,8 +74,9 @@ async fn watch_secret(
}
}
async fn watch_configmap(api: Api<ConfigMap>, tx: mpsc::Sender<()>) {
let cfg = watcher::Config::default().fields("metadata.name=pingora-config");
async fn watch_configmap(api: Api<ConfigMap>, configmap_name: String, tx: mpsc::Sender<()>) {
let field_selector = format!("metadata.name={configmap_name}");
let cfg = watcher::Config::default().fields(&field_selector);
let mut stream = Box::pin(watcher(api, cfg));
let mut initialized = false;
@@ -78,10 +84,10 @@ async fn watch_configmap(api: Api<ConfigMap>, tx: mpsc::Sender<()>) {
match result {
Ok(watcher::Event::InitDone) => {
initialized = true;
tracing::debug!("pingora-config watcher ready");
tracing::debug!(%configmap_name, "config watcher ready");
}
Ok(watcher::Event::Apply(_)) if initialized => {
tracing::info!("pingora-config changed — triggering upgrade");
tracing::info!(%configmap_name, "config changed — triggering upgrade");
let _ = tx.send(()).await;
return;
}