feat: configurable k8s resources, CSIC training pipeline, unified Dockerfile
- Make K8s namespace, TLS secret, and config ConfigMap names configurable via [kubernetes] config section (previously hardcoded to "ingress") - Add CSIC 2010 dataset converter and auto-download for scanner training - Unify Dockerfile for local and production builds (remove cross-compile path) - Bake ML models directory into container image - Update CSIC dataset URL to self-hosted mirror (src.sunbeam.pt) - Fix rate_limit pipeline log missing fields - Consolidate docs/README.md into root README.md Signed-off-by: Sienna Meridian Satterwhite <sienna@sunbeam.pt>
This commit is contained in:
17
src/acme.rs
17
src/acme.rs
@@ -6,7 +6,7 @@ use std::{collections::HashMap, sync::{Arc, RwLock}};
|
||||
/// Maps a challenge path to the backend address that can answer it.
|
||||
///
|
||||
/// Key: `/.well-known/acme-challenge/<token>`
|
||||
/// Value: `cm-acme-http-solver-<hash>.ingress.svc.cluster.local:8089`
|
||||
/// Value: `cm-acme-http-solver-<hash>.<namespace>.svc.cluster.local:8089`
|
||||
///
|
||||
/// cert-manager creates one Ingress per challenge domain with exactly this
|
||||
/// path and backend. Our proxy consults this table to route each challenge
|
||||
@@ -18,15 +18,15 @@ use std::{collections::HashMap, sync::{Arc, RwLock}};
|
||||
/// can be written from the watcher runtime without cross-runtime waker issues.
|
||||
pub type AcmeRoutes = Arc<RwLock<HashMap<String, String>>>;
|
||||
|
||||
/// Watch Ingress objects in the ingress namespace and maintain `routes`.
|
||||
/// Watch Ingress objects and maintain `routes`.
|
||||
///
|
||||
/// cert-manager creates an Ingress for each HTTP-01 challenge it manages.
|
||||
/// The Ingress contains a path rule for `/.well-known/acme-challenge/<token>`
|
||||
/// pointing to a per-challenge solver Service. We populate the route table
|
||||
/// from these rules so the proxy can forward each challenge token to the
|
||||
/// correct solver pod without the nondeterminism of a shared stable Service.
|
||||
pub async fn watch_ingresses(client: Client, routes: AcmeRoutes) {
|
||||
let api: Api<Ingress> = Api::namespaced(client, "ingress");
|
||||
pub async fn watch_ingresses(client: Client, namespace: String, routes: AcmeRoutes) {
|
||||
let api: Api<Ingress> = Api::namespaced(client, &namespace);
|
||||
|
||||
// Verify Ingress API access before entering the watch loop. A failure here
|
||||
// almost always means cert-manager is not installed or RBAC is wrong.
|
||||
@@ -43,12 +43,9 @@ pub async fn watch_ingresses(client: Client, routes: AcmeRoutes) {
|
||||
|
||||
while let Some(result) = stream.next().await {
|
||||
match result {
|
||||
// InitApply fires for each Ingress during the initial list (kube v3+).
|
||||
// Apply fires for subsequent creates/updates.
|
||||
// Both must be handled to catch Ingresses that existed before the proxy started.
|
||||
Ok(watcher::Event::InitApply(ing)) | Ok(watcher::Event::Apply(ing)) => {
|
||||
let mut map = routes.write().unwrap_or_else(|e| e.into_inner());
|
||||
upsert_routes(&ing, &mut map);
|
||||
upsert_routes(&ing, &namespace, &mut map);
|
||||
}
|
||||
Ok(watcher::Event::Delete(ing)) => {
|
||||
let mut map = routes.write().unwrap_or_else(|e| e.into_inner());
|
||||
@@ -63,7 +60,7 @@ pub async fn watch_ingresses(client: Client, routes: AcmeRoutes) {
|
||||
}
|
||||
}
|
||||
|
||||
fn upsert_routes(ingress: &Ingress, map: &mut HashMap<String, String>) {
|
||||
fn upsert_routes(ingress: &Ingress, namespace: &str, map: &mut HashMap<String, String>) {
|
||||
let Some(spec) = &ingress.spec else { return };
|
||||
for rule in spec.rules.as_deref().unwrap_or(&[]) {
|
||||
let Some(http) = &rule.http else { continue };
|
||||
@@ -75,7 +72,7 @@ fn upsert_routes(ingress: &Ingress, map: &mut HashMap<String, String>) {
|
||||
let Some(svc) = p.backend.service.as_ref() else { continue };
|
||||
let Some(port) = svc.port.as_ref().and_then(|p| p.number) else { continue };
|
||||
let backend = format!(
|
||||
"{}.ingress.svc.cluster.local:{port}",
|
||||
"{}.{namespace}.svc.cluster.local:{port}",
|
||||
svc.name
|
||||
);
|
||||
tracing::debug!(path, %backend, "added ACME challenge route");
|
||||
|
||||
23
src/cert.rs
23
src/cert.rs
@@ -2,18 +2,23 @@ use anyhow::{Context, Result};
|
||||
use k8s_openapi::api::core::v1::Secret;
|
||||
use kube::{Api, Client};
|
||||
|
||||
/// Fetch the `pingora-tls` Secret from the ingress namespace and write
|
||||
/// `tls.crt` / `tls.key` to the paths declared in config.toml.
|
||||
/// Fetch the TLS Secret and write `tls.crt` / `tls.key` to the configured paths.
|
||||
///
|
||||
/// Called at startup (non-upgrade) so the proxy never depends on kubelet
|
||||
/// volume-sync timing: the cert files are written directly from the K8s API
|
||||
/// before `svc.add_tls()` is called.
|
||||
pub async fn fetch_and_write(client: &Client, cert_path: &str, key_path: &str) -> Result<()> {
|
||||
let api: Api<Secret> = Api::namespaced(client.clone(), "ingress");
|
||||
pub async fn fetch_and_write(
|
||||
client: &Client,
|
||||
namespace: &str,
|
||||
secret_name: &str,
|
||||
cert_path: &str,
|
||||
key_path: &str,
|
||||
) -> Result<()> {
|
||||
let api: Api<Secret> = Api::namespaced(client.clone(), namespace);
|
||||
let secret = api
|
||||
.get("pingora-tls")
|
||||
.get(secret_name)
|
||||
.await
|
||||
.context("fetching pingora-tls Secret from K8s API")?;
|
||||
.with_context(|| format!("fetching {secret_name} Secret from K8s API"))?;
|
||||
write_from_secret(&secret, cert_path, key_path)
|
||||
}
|
||||
|
||||
@@ -27,14 +32,14 @@ pub fn write_from_secret(secret: &Secret, cert_path: &str, key_path: &str) -> Re
|
||||
let data = secret
|
||||
.data
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow::anyhow!("pingora-tls Secret has no data"))?;
|
||||
.ok_or_else(|| anyhow::anyhow!("TLS Secret has no data"))?;
|
||||
|
||||
let crt = data
|
||||
.get("tls.crt")
|
||||
.ok_or_else(|| anyhow::anyhow!("pingora-tls missing tls.crt"))?;
|
||||
.ok_or_else(|| anyhow::anyhow!("TLS Secret missing tls.crt"))?;
|
||||
let key = data
|
||||
.get("tls.key")
|
||||
.ok_or_else(|| anyhow::anyhow!("pingora-tls missing tls.key"))?;
|
||||
.ok_or_else(|| anyhow::anyhow!("TLS Secret missing tls.key"))?;
|
||||
|
||||
// /etc/tls is an emptyDir; create it if the pod just started.
|
||||
if let Some(parent) = std::path::Path::new(cert_path).parent() {
|
||||
|
||||
@@ -24,8 +24,38 @@ pub struct Config {
|
||||
pub rate_limit: Option<RateLimitConfig>,
|
||||
/// Optional per-request scanner detection.
|
||||
pub scanner: Option<ScannerConfig>,
|
||||
/// Kubernetes resource names and namespaces for watchers.
|
||||
#[serde(default)]
|
||||
pub kubernetes: KubernetesConfig,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct KubernetesConfig {
|
||||
/// Namespace where the proxy's resources live (Secret, ConfigMap, Ingresses).
|
||||
#[serde(default = "default_k8s_namespace")]
|
||||
pub namespace: String,
|
||||
/// Name of the TLS Secret watched for cert hot-reload.
|
||||
#[serde(default = "default_tls_secret")]
|
||||
pub tls_secret: String,
|
||||
/// Name of the ConfigMap watched for config hot-reload.
|
||||
#[serde(default = "default_config_configmap")]
|
||||
pub config_configmap: String,
|
||||
}
|
||||
|
||||
impl Default for KubernetesConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
namespace: default_k8s_namespace(),
|
||||
tls_secret: default_tls_secret(),
|
||||
config_configmap: default_config_configmap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn default_k8s_namespace() -> String { "ingress".to_string() }
|
||||
fn default_tls_secret() -> String { "pingora-tls".to_string() }
|
||||
fn default_config_configmap() -> String { "pingora-config".to_string() }
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct DDoSConfig {
|
||||
pub model_path: String,
|
||||
|
||||
@@ -39,6 +39,7 @@ impl DDoSDetector {
|
||||
|
||||
/// Record an incoming request and classify the IP.
|
||||
/// Called from request_filter (before upstream).
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn check(
|
||||
&self,
|
||||
ip: IpAddr,
|
||||
|
||||
@@ -70,6 +70,10 @@ impl IpState {
|
||||
self.events.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.events.is_empty()
|
||||
}
|
||||
|
||||
/// Prune events older than `window` from the logical view.
|
||||
/// Returns a slice of active events (not necessarily contiguous in ring buffer,
|
||||
/// so we collect into a Vec).
|
||||
@@ -274,6 +278,12 @@ pub struct LogIpState {
|
||||
pub suspicious_paths: Vec<bool>,
|
||||
}
|
||||
|
||||
impl Default for LogIpState {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl LogIpState {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
|
||||
@@ -113,7 +113,7 @@ pub fn run(args: TrainArgs) -> Result<()> {
|
||||
let ip = audit_log::strip_port(&entry.fields.client_ip).to_string();
|
||||
let ts = parse_timestamp(&entry.timestamp);
|
||||
|
||||
let state = ip_states.entry(ip).or_insert_with(LogIpState::new);
|
||||
let state = ip_states.entry(ip).or_default();
|
||||
state.timestamps.push(ts);
|
||||
state.methods.push(method_to_u8(&entry.fields.method));
|
||||
state.path_hashes.push(fx_hash(&entry.fields.path));
|
||||
|
||||
37
src/main.rs
37
src/main.rs
@@ -33,7 +33,7 @@ enum Commands {
|
||||
upgrade: bool,
|
||||
},
|
||||
/// Replay audit logs through the DDoS detector and rate limiter
|
||||
Replay {
|
||||
ReplayDdos {
|
||||
/// Path to audit log JSONL file
|
||||
#[arg(short, long)]
|
||||
input: String,
|
||||
@@ -60,7 +60,7 @@ enum Commands {
|
||||
rate_limit: bool,
|
||||
},
|
||||
/// Train a DDoS detection model from audit logs
|
||||
Train {
|
||||
TrainDdos {
|
||||
/// Path to audit log JSONL file
|
||||
#[arg(short, long)]
|
||||
input: String,
|
||||
@@ -103,6 +103,9 @@ enum Commands {
|
||||
/// Classification threshold
|
||||
#[arg(long, default_value = "0.5")]
|
||||
threshold: f64,
|
||||
/// Include CSIC 2010 dataset as base training data (downloaded from GitHub, cached locally)
|
||||
#[arg(long)]
|
||||
csic: bool,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -110,7 +113,7 @@ fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
match cli.command.unwrap_or(Commands::Serve { upgrade: false }) {
|
||||
Commands::Serve { upgrade } => run_serve(upgrade),
|
||||
Commands::Replay {
|
||||
Commands::ReplayDdos {
|
||||
input,
|
||||
model,
|
||||
config,
|
||||
@@ -129,7 +132,7 @@ fn main() -> Result<()> {
|
||||
min_events,
|
||||
rate_limit,
|
||||
}),
|
||||
Commands::Train {
|
||||
Commands::TrainDdos {
|
||||
input,
|
||||
output,
|
||||
attack_ips,
|
||||
@@ -155,11 +158,13 @@ fn main() -> Result<()> {
|
||||
output,
|
||||
wordlists,
|
||||
threshold,
|
||||
csic,
|
||||
} => scanner::train::run(scanner::train::TrainScannerArgs {
|
||||
input,
|
||||
output,
|
||||
wordlists,
|
||||
threshold,
|
||||
csic,
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -309,7 +314,13 @@ fn run_serve(upgrade: bool) -> Result<()> {
|
||||
Ok(c) => {
|
||||
if !upgrade {
|
||||
if let Err(e) =
|
||||
cert::fetch_and_write(&c, &cfg.tls.cert_path, &cfg.tls.key_path).await
|
||||
cert::fetch_and_write(
|
||||
&c,
|
||||
&cfg.kubernetes.namespace,
|
||||
&cfg.kubernetes.tls_secret,
|
||||
&cfg.tls.cert_path,
|
||||
&cfg.tls.key_path,
|
||||
).await
|
||||
{
|
||||
tracing::warn!(error = %e, "cert fetch from K8s failed; using existing files");
|
||||
}
|
||||
@@ -405,6 +416,7 @@ fn run_serve(upgrade: bool) -> Result<()> {
|
||||
|
||||
// 6. Background K8s watchers on their own OS thread + tokio runtime.
|
||||
if k8s_available {
|
||||
let k8s_cfg = cfg.kubernetes.clone();
|
||||
let cert_path = cfg.tls.cert_path.clone();
|
||||
let key_path = cfg.tls.key_path.clone();
|
||||
std::thread::spawn(move || {
|
||||
@@ -421,8 +433,19 @@ fn run_serve(upgrade: bool) -> Result<()> {
|
||||
}
|
||||
};
|
||||
tokio::join!(
|
||||
acme::watch_ingresses(client.clone(), acme_routes),
|
||||
watcher::run_watcher(client, cert_path, key_path),
|
||||
acme::watch_ingresses(
|
||||
client.clone(),
|
||||
k8s_cfg.namespace.clone(),
|
||||
acme_routes,
|
||||
),
|
||||
watcher::run_watcher(
|
||||
client,
|
||||
k8s_cfg.namespace,
|
||||
k8s_cfg.tls_secret,
|
||||
k8s_cfg.config_configmap,
|
||||
cert_path,
|
||||
key_path,
|
||||
),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -410,7 +410,7 @@ impl ProxyHttp for SunbeamProxy {
|
||||
);
|
||||
|
||||
metrics::SCANNER_DECISIONS
|
||||
.with_label_values(&[decision, &reason])
|
||||
.with_label_values(&[decision, reason])
|
||||
.inc();
|
||||
|
||||
if decision == "block" {
|
||||
@@ -447,7 +447,11 @@ impl ProxyHttp for SunbeamProxy {
|
||||
path = %session.req_header().uri.path(),
|
||||
client_ip = %ip,
|
||||
user_agent = session.req_header().headers.get("user-agent").and_then(|v| v.to_str().ok()).unwrap_or("-"),
|
||||
content_length = session.req_header().headers.get("content-length").and_then(|v| v.to_str().ok()).unwrap_or("0"),
|
||||
has_cookies = cookie.is_some(),
|
||||
has_referer = session.req_header().headers.get("referer").is_some(),
|
||||
has_accept_language = session.req_header().headers.get("accept-language").is_some(),
|
||||
accept = session.req_header().headers.get("accept").and_then(|v| v.to_str().ok()).unwrap_or("-"),
|
||||
"pipeline"
|
||||
);
|
||||
|
||||
|
||||
@@ -214,7 +214,7 @@ fn verify_dns(ip: IpAddr, suffixes: &[String]) -> bool {
|
||||
|
||||
// Step 3: forward DNS — the hostname must resolve back to our IP.
|
||||
match dns_lookup::lookup_host(&hostname) {
|
||||
Ok(addrs) => addrs.iter().any(|a| *a == ip),
|
||||
Ok(addrs) => addrs.contains(&ip),
|
||||
Err(_) => false,
|
||||
}
|
||||
}
|
||||
|
||||
411
src/scanner/csic.rs
Normal file
411
src/scanner/csic.rs
Normal file
@@ -0,0 +1,411 @@
|
||||
//! Fetch and convert the CSIC 2010 HTTP dataset into labeled training samples.
|
||||
//!
|
||||
//! The CSIC 2010 dataset contains raw HTTP/1.1 requests (normal + anomalous)
|
||||
//! from a web application. When `--csic` is passed to `train-scanner`, this
|
||||
//! module downloads the dataset from GitHub, parses the raw HTTP requests,
|
||||
//! and converts them into `AuditFields` entries with ground-truth labels.
|
||||
|
||||
use crate::ddos::audit_log::AuditFields;
|
||||
use anyhow::{Context, Result};
|
||||
use std::path::PathBuf;
|
||||
|
||||
const REPO_BASE: &str =
|
||||
"https://src.sunbeam.pt/studio/csic-dataset/raw/branch/mainline";
|
||||
|
||||
const FILES: &[(&str, &str)] = &[
|
||||
("normalTrafficTraining.txt", "normal"),
|
||||
("normalTrafficTest.txt", "normal"),
|
||||
("anomalousTrafficTest.txt", "anomalous"),
|
||||
];
|
||||
|
||||
const DEFAULT_HOSTS: &[&str] = &[
|
||||
"admin", "src", "docs", "auth", "drive", "grafana", "people", "meet", "s3", "livekit",
|
||||
];
|
||||
|
||||
fn cache_dir() -> PathBuf {
|
||||
let base = std::env::var("XDG_CACHE_HOME")
|
||||
.map(PathBuf::from)
|
||||
.unwrap_or_else(|_| {
|
||||
let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
|
||||
PathBuf::from(home).join(".cache")
|
||||
});
|
||||
base.join("sunbeam").join("csic")
|
||||
}
|
||||
|
||||
fn download_or_cached(filename: &str) -> Result<String> {
|
||||
let dir = cache_dir();
|
||||
let path = dir.join(filename);
|
||||
|
||||
if path.exists() {
|
||||
eprintln!(" cached: {}", path.display());
|
||||
return std::fs::read_to_string(&path)
|
||||
.with_context(|| format!("reading cached {}", path.display()));
|
||||
}
|
||||
|
||||
let url = format!("{REPO_BASE}/{filename}");
|
||||
eprintln!(" downloading: {url}");
|
||||
let body = reqwest::blocking::get(&url)
|
||||
.with_context(|| format!("fetching {url}"))?
|
||||
.error_for_status()
|
||||
.with_context(|| format!("HTTP error for {url}"))?
|
||||
.text()
|
||||
.with_context(|| format!("reading body of {url}"))?;
|
||||
|
||||
std::fs::create_dir_all(&dir)?;
|
||||
std::fs::write(&path, &body)?;
|
||||
Ok(body)
|
||||
}
|
||||
|
||||
struct ParsedRequest {
|
||||
method: String,
|
||||
path: String,
|
||||
query: String,
|
||||
user_agent: String,
|
||||
has_cookies: bool,
|
||||
content_length: u64,
|
||||
referer: String,
|
||||
accept_language: String,
|
||||
}
|
||||
|
||||
fn parse_csic_content(content: &str) -> Vec<ParsedRequest> {
|
||||
let mut requests = Vec::new();
|
||||
let mut current_lines: Vec<&str> = Vec::new();
|
||||
|
||||
for line in content.lines() {
|
||||
if line.is_empty() && !current_lines.is_empty() {
|
||||
if let Some(req) = parse_single_request(¤t_lines) {
|
||||
requests.push(req);
|
||||
}
|
||||
current_lines.clear();
|
||||
} else {
|
||||
current_lines.push(line);
|
||||
}
|
||||
}
|
||||
if !current_lines.is_empty() {
|
||||
if let Some(req) = parse_single_request(¤t_lines) {
|
||||
requests.push(req);
|
||||
}
|
||||
}
|
||||
requests
|
||||
}
|
||||
|
||||
fn parse_single_request(lines: &[&str]) -> Option<ParsedRequest> {
|
||||
if lines.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let parts: Vec<&str> = lines[0].splitn(3, ' ').collect();
|
||||
if parts.len() < 2 {
|
||||
return None;
|
||||
}
|
||||
let method = parts[0].to_string();
|
||||
let raw_url = parts[1];
|
||||
|
||||
// Extract path and query — URL may be absolute (http://localhost:8080/path?q=1)
|
||||
let (path, query) = if let Some(rest) = raw_url.strip_prefix("http://") {
|
||||
// Skip host portion
|
||||
let after_host = rest.find('/').map(|i| &rest[i..]).unwrap_or("/");
|
||||
split_path_query(after_host)
|
||||
} else if let Some(rest) = raw_url.strip_prefix("https://") {
|
||||
let after_host = rest.find('/').map(|i| &rest[i..]).unwrap_or("/");
|
||||
split_path_query(after_host)
|
||||
} else {
|
||||
split_path_query(raw_url)
|
||||
};
|
||||
|
||||
// Parse headers
|
||||
let mut headers: Vec<(&str, &str)> = Vec::new();
|
||||
let mut body_start = None;
|
||||
for (i, line) in lines[1..].iter().enumerate() {
|
||||
if line.is_empty() {
|
||||
body_start = Some(i + 2); // +2 because we started from lines[1..]
|
||||
break;
|
||||
}
|
||||
if let Some(colon) = line.find(':') {
|
||||
let key = line[..colon].trim();
|
||||
let value = line[colon + 1..].trim();
|
||||
headers.push((key, value));
|
||||
}
|
||||
}
|
||||
|
||||
let get_header = |name: &str| -> Option<&str> {
|
||||
headers
|
||||
.iter()
|
||||
.find(|(k, _)| k.eq_ignore_ascii_case(name))
|
||||
.map(|(_, v)| *v)
|
||||
};
|
||||
|
||||
let body_len = if let Some(start) = body_start {
|
||||
if start < lines.len() {
|
||||
lines[start..].iter().map(|l| l.len()).sum::<usize>() as u64
|
||||
} else {
|
||||
0
|
||||
}
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
let content_length = get_header("Content-Length")
|
||||
.and_then(|v| v.parse().ok())
|
||||
.unwrap_or(body_len);
|
||||
|
||||
Some(ParsedRequest {
|
||||
method,
|
||||
path,
|
||||
query,
|
||||
user_agent: get_header("User-Agent").unwrap_or("-").to_string(),
|
||||
has_cookies: get_header("Cookie").is_some(),
|
||||
content_length,
|
||||
referer: get_header("Referer").unwrap_or("-").to_string(),
|
||||
accept_language: get_header("Accept-Language").unwrap_or("-").to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
fn split_path_query(url: &str) -> (String, String) {
|
||||
if let Some(q) = url.find('?') {
|
||||
(url[..q].to_string(), url[q + 1..].to_string())
|
||||
} else {
|
||||
(url.to_string(), String::new())
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple deterministic LCG for reproducible randomness without pulling in `rand`.
|
||||
struct Rng(u64);
|
||||
|
||||
impl Rng {
|
||||
fn new(seed: u64) -> Self {
|
||||
Self(seed)
|
||||
}
|
||||
fn next_u64(&mut self) -> u64 {
|
||||
self.0 = self.0.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
|
||||
self.0
|
||||
}
|
||||
fn next_usize(&mut self, bound: usize) -> usize {
|
||||
(self.next_u64() >> 33) as usize % bound
|
||||
}
|
||||
fn next_f64(&mut self) -> f64 {
|
||||
(self.next_u64() >> 11) as f64 / (1u64 << 53) as f64
|
||||
}
|
||||
fn choice<'a>(&mut self, items: &'a [&str]) -> &'a str {
|
||||
items[self.next_usize(items.len())]
|
||||
}
|
||||
}
|
||||
|
||||
fn to_audit_fields(
|
||||
req: &ParsedRequest,
|
||||
label: &str,
|
||||
hosts: &[&str],
|
||||
rng: &mut Rng,
|
||||
) -> AuditFields {
|
||||
let (host_prefix, status) = if label == "normal" {
|
||||
let host = rng.choice(hosts).to_string();
|
||||
let statuses: &[u16] = &[200, 200, 200, 200, 301, 304];
|
||||
let status = statuses[rng.next_usize(statuses.len())];
|
||||
(host, status)
|
||||
} else {
|
||||
let host = if rng.next_f64() < 0.7 {
|
||||
let unknown: &[&str] = &["unknown", "scanner", "probe", "test"];
|
||||
rng.choice(unknown).to_string()
|
||||
} else {
|
||||
rng.choice(hosts).to_string()
|
||||
};
|
||||
let statuses: &[u16] = &[404, 404, 404, 400, 403, 500];
|
||||
let status = statuses[rng.next_usize(statuses.len())];
|
||||
(host, status)
|
||||
};
|
||||
|
||||
let host = format!("{host_prefix}.sunbeam.pt");
|
||||
|
||||
// For anomalous samples, simulate real scanner behavior:
|
||||
// strip cookies/referer/accept-language that CSIC attacks have from their session.
|
||||
let (has_cookies, referer, accept_language, user_agent) = if label != "normal" {
|
||||
let referer = None;
|
||||
let accept_language = if rng.next_f64() < 0.8 {
|
||||
None
|
||||
} else {
|
||||
Some(req.accept_language.clone()).filter(|a| a != "-")
|
||||
};
|
||||
let r = rng.next_f64();
|
||||
let user_agent = if r < 0.15 {
|
||||
String::new()
|
||||
} else if r < 0.25 {
|
||||
"curl/7.68.0".to_string()
|
||||
} else if r < 0.35 {
|
||||
"python-requests/2.28.0".to_string()
|
||||
} else if r < 0.40 {
|
||||
"Go-http-client/1.1".to_string()
|
||||
} else {
|
||||
req.user_agent.clone()
|
||||
};
|
||||
(false, referer, accept_language, user_agent)
|
||||
} else {
|
||||
(
|
||||
req.has_cookies,
|
||||
Some(req.referer.clone()).filter(|r| r != "-"),
|
||||
Some(req.accept_language.clone()).filter(|a| a != "-"),
|
||||
req.user_agent.clone(),
|
||||
)
|
||||
};
|
||||
|
||||
AuditFields {
|
||||
method: req.method.clone(),
|
||||
host,
|
||||
path: req.path.clone(),
|
||||
query: req.query.clone(),
|
||||
client_ip: format!(
|
||||
"{}.{}.{}.{}",
|
||||
rng.next_usize(223) + 1,
|
||||
rng.next_usize(256),
|
||||
rng.next_usize(256),
|
||||
rng.next_usize(254) + 1,
|
||||
),
|
||||
status,
|
||||
duration_ms: rng.next_usize(50) as u64 + 1,
|
||||
content_length: req.content_length,
|
||||
user_agent,
|
||||
has_cookies: Some(has_cookies),
|
||||
referer,
|
||||
accept_language,
|
||||
backend: if label == "normal" {
|
||||
format!("{host_prefix}-svc:8080")
|
||||
} else {
|
||||
"-".to_string()
|
||||
},
|
||||
label: Some(
|
||||
if label == "normal" { "normal" } else { "attack" }.to_string(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
/// Download (or use cached) CSIC 2010 dataset, parse raw HTTP requests,
|
||||
/// and convert into labeled `AuditFields` entries ready for scanner training.
|
||||
pub fn fetch_csic_dataset() -> Result<Vec<(AuditFields, String)>> {
|
||||
eprintln!("fetching CSIC 2010 dataset...");
|
||||
|
||||
let mut rng = Rng::new(42);
|
||||
let mut all_entries: Vec<(AuditFields, String)> = Vec::new();
|
||||
|
||||
for (filename, label) in FILES {
|
||||
let content = download_or_cached(filename)?;
|
||||
let requests = parse_csic_content(&content);
|
||||
eprintln!(" parsed {} {label} requests from {filename}", requests.len());
|
||||
|
||||
for req in &requests {
|
||||
let fields = to_audit_fields(req, label, DEFAULT_HOSTS, &mut rng);
|
||||
let host_prefix = fields.host.split('.').next().unwrap_or("").to_string();
|
||||
all_entries.push((fields, host_prefix));
|
||||
}
|
||||
}
|
||||
|
||||
// Shuffle to interleave normal/attack
|
||||
let n = all_entries.len();
|
||||
for i in (1..n).rev() {
|
||||
let j = rng.next_usize(i + 1);
|
||||
all_entries.swap(i, j);
|
||||
}
|
||||
|
||||
eprintln!(
|
||||
"CSIC total: {} ({} normal, {} attack)",
|
||||
all_entries.len(),
|
||||
all_entries.iter().filter(|(f, _)| f.label.as_deref() == Some("normal")).count(),
|
||||
all_entries.iter().filter(|(f, _)| f.label.as_deref() == Some("attack")).count(),
|
||||
);
|
||||
|
||||
Ok(all_entries)
|
||||
}
|
||||
|
||||
/// Check if cached CSIC files exist.
|
||||
pub fn csic_is_cached() -> bool {
|
||||
let dir = cache_dir();
|
||||
FILES.iter().all(|(f, _)| dir.join(f).exists())
|
||||
}
|
||||
|
||||
/// Return the cache directory path for display.
|
||||
pub fn csic_cache_path() -> PathBuf {
|
||||
cache_dir()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_parse_single_http_request() {
|
||||
let lines = vec![
|
||||
"GET /index.html HTTP/1.1",
|
||||
"Host: localhost:8080",
|
||||
"User-Agent: Mozilla/5.0",
|
||||
"Cookie: session=abc",
|
||||
"Accept: text/html",
|
||||
];
|
||||
let req = parse_single_request(&lines).unwrap();
|
||||
assert_eq!(req.method, "GET");
|
||||
assert_eq!(req.path, "/index.html");
|
||||
assert!(req.has_cookies);
|
||||
assert_eq!(req.user_agent, "Mozilla/5.0");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_absolute_url() {
|
||||
let lines = vec!["POST http://localhost:8080/tienda1/miembros/editar.jsp?id=2 HTTP/1.1"];
|
||||
let req = parse_single_request(&lines).unwrap();
|
||||
assert_eq!(req.method, "POST");
|
||||
assert_eq!(req.path, "/tienda1/miembros/editar.jsp");
|
||||
assert_eq!(req.query, "id=2");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_csic_content_multiple_requests() {
|
||||
let content = "GET /page1 HTTP/1.1\nHost: localhost\n\nPOST /page2 HTTP/1.1\nHost: localhost\n\n";
|
||||
let reqs = parse_csic_content(content);
|
||||
assert_eq!(reqs.len(), 2);
|
||||
assert_eq!(reqs[0].method, "GET");
|
||||
assert_eq!(reqs[1].method, "POST");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_audit_fields_normal() {
|
||||
let req = ParsedRequest {
|
||||
method: "GET".to_string(),
|
||||
path: "/index.html".to_string(),
|
||||
query: String::new(),
|
||||
user_agent: "Mozilla/5.0".to_string(),
|
||||
has_cookies: true,
|
||||
content_length: 100,
|
||||
referer: "https://example.com".to_string(),
|
||||
accept_language: "en-US".to_string(),
|
||||
};
|
||||
let mut rng = Rng::new(42);
|
||||
let fields = to_audit_fields(&req, "normal", DEFAULT_HOSTS, &mut rng);
|
||||
assert_eq!(fields.label.as_deref(), Some("normal"));
|
||||
assert!(fields.has_cookies.unwrap_or(false));
|
||||
assert!(fields.host.ends_with(".sunbeam.pt"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_audit_fields_anomalous_strips_cookies() {
|
||||
let req = ParsedRequest {
|
||||
method: "GET".to_string(),
|
||||
path: "/.env".to_string(),
|
||||
query: String::new(),
|
||||
user_agent: "Mozilla/5.0".to_string(),
|
||||
has_cookies: true,
|
||||
content_length: 0,
|
||||
referer: "https://example.com".to_string(),
|
||||
accept_language: "en-US".to_string(),
|
||||
};
|
||||
let mut rng = Rng::new(42);
|
||||
let fields = to_audit_fields(&req, "anomalous", DEFAULT_HOSTS, &mut rng);
|
||||
assert_eq!(fields.label.as_deref(), Some("attack"));
|
||||
assert!(!fields.has_cookies.unwrap_or(true));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_rng_deterministic() {
|
||||
let mut a = Rng::new(42);
|
||||
let mut b = Rng::new(42);
|
||||
for _ in 0..100 {
|
||||
assert_eq!(a.next_u64(), b.next_u64());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -50,6 +50,7 @@ impl ScannerDetector {
|
||||
/// Returns a verdict with the action, raw score, and reason.
|
||||
/// The score and reason are captured in pipeline logs so the training
|
||||
/// pipeline always has unfiltered data to retrain from.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn check(
|
||||
&self,
|
||||
method: &str,
|
||||
@@ -107,8 +108,8 @@ impl ScannerDetector {
|
||||
|
||||
// 3. Compute score = bias + dot(weights, features) + interaction terms
|
||||
let mut score = self.weights[NUM_SCANNER_FEATURES + 2]; // bias (index 14)
|
||||
for i in 0..NUM_SCANNER_FEATURES {
|
||||
score += self.weights[i] * f[i];
|
||||
for (i, &fi) in f.iter().enumerate().take(NUM_SCANNER_FEATURES) {
|
||||
score += self.weights[i] * fi;
|
||||
}
|
||||
// Interaction: suspicious_path AND no_cookies
|
||||
score += self.weights[12] * f[0] * (1.0 - f[3]);
|
||||
|
||||
@@ -14,6 +14,7 @@ const TRAVERSAL_PATTERNS: &[&str] = &["..", "%00", "%0a", "%27", "%3c"];
|
||||
|
||||
/// Extract all 12 scanner features from a single request.
|
||||
/// No heap allocation — all work done on references and stack buffers.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn extract_features(
|
||||
method: &str,
|
||||
path: &str,
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
pub mod allowlist;
|
||||
pub mod csic;
|
||||
pub mod detector;
|
||||
pub mod features;
|
||||
pub mod model;
|
||||
|
||||
@@ -14,6 +14,7 @@ pub struct TrainScannerArgs {
|
||||
pub output: String,
|
||||
pub wordlists: Option<String>,
|
||||
pub threshold: f64,
|
||||
pub csic: bool,
|
||||
}
|
||||
|
||||
/// Default suspicious fragments — matches the DDoS feature list plus extras.
|
||||
@@ -135,6 +136,54 @@ pub fn run(args: TrainScannerArgs) -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// 1b. Optionally fetch CSIC 2010 dataset and add labeled entries
|
||||
if args.csic {
|
||||
let csic_entries = crate::scanner::csic::fetch_csic_dataset()?;
|
||||
for (_, host_prefix) in &csic_entries {
|
||||
log_hosts.insert(fx_hash_bytes(host_prefix.as_bytes()));
|
||||
}
|
||||
for (fields, host_prefix) in &csic_entries {
|
||||
let has_cookies = fields.has_cookies.unwrap_or(false);
|
||||
let has_referer = fields
|
||||
.referer
|
||||
.as_ref()
|
||||
.map(|r| r != "-" && !r.is_empty())
|
||||
.unwrap_or(false);
|
||||
let has_accept_language = fields
|
||||
.accept_language
|
||||
.as_ref()
|
||||
.map(|a| a != "-" && !a.is_empty())
|
||||
.unwrap_or(false);
|
||||
|
||||
let feats = features::extract_features(
|
||||
&fields.method,
|
||||
&fields.path,
|
||||
host_prefix,
|
||||
has_cookies,
|
||||
has_referer,
|
||||
has_accept_language,
|
||||
"-",
|
||||
&fields.user_agent,
|
||||
fields.content_length,
|
||||
&fragment_hashes,
|
||||
&extension_hashes,
|
||||
&log_hosts,
|
||||
);
|
||||
|
||||
// CSIC entries always have a ground-truth label.
|
||||
let label = match fields.label.as_deref() {
|
||||
Some("attack" | "anomalous") => 1.0,
|
||||
Some("normal") => 0.0,
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
samples.push(LabeledSample {
|
||||
features: feats,
|
||||
label,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let log_sample_count = samples.len();
|
||||
let log_attack_count = samples.iter().filter(|s| s.label > 0.5).count();
|
||||
eprintln!(
|
||||
@@ -408,6 +457,7 @@ fn train_logistic_regression(
|
||||
weights
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn label_request(
|
||||
path: &str,
|
||||
has_cookies: bool,
|
||||
|
||||
@@ -11,12 +11,12 @@ pub async fn run_tcp_proxy(listen: &str, backend: &str) {
|
||||
let ipv6_addr = if listen.starts_with('[') {
|
||||
listen.to_string()
|
||||
} else {
|
||||
format!("[::]:{}", listen.split(':').last().unwrap_or("22"))
|
||||
format!("[::]:{}", listen.split(':').next_back().unwrap_or("22"))
|
||||
};
|
||||
|
||||
let ipv4_addr = if listen.contains(':') {
|
||||
// Extract port from the original address
|
||||
let port = listen.split(':').last().unwrap_or("22");
|
||||
let port = listen.split(':').next_back().unwrap_or("22");
|
||||
format!("0.0.0.0:{}", port)
|
||||
} else {
|
||||
"0.0.0.0:22".to_string()
|
||||
|
||||
@@ -3,7 +3,7 @@ use k8s_openapi::api::core::v1::{ConfigMap, Secret};
|
||||
use kube::{runtime::watcher, Api, Client};
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
/// Watch `pingora-tls` and `pingora-config` in the ingress namespace.
|
||||
/// Watch the TLS Secret and config ConfigMap for changes.
|
||||
///
|
||||
/// On cert change: write new cert bytes from the Apply event directly to the
|
||||
/// configured paths (avoiding kubelet volume-sync delay), then trigger a
|
||||
@@ -15,14 +15,21 @@ use tokio::sync::mpsc;
|
||||
///
|
||||
/// No-ops when no K8s client is available (e.g. ad-hoc local runs outside a
|
||||
/// cluster) so the binary works in both environments.
|
||||
pub async fn run_watcher(client: Client, cert_path: String, key_path: String) {
|
||||
pub async fn run_watcher(
|
||||
client: Client,
|
||||
namespace: String,
|
||||
tls_secret: String,
|
||||
config_configmap: String,
|
||||
cert_path: String,
|
||||
key_path: String,
|
||||
) {
|
||||
let (tx, mut rx) = mpsc::channel::<()>(2);
|
||||
|
||||
let secret_api: Api<Secret> = Api::namespaced(client.clone(), "ingress");
|
||||
let cm_api: Api<ConfigMap> = Api::namespaced(client.clone(), "ingress");
|
||||
let secret_api: Api<Secret> = Api::namespaced(client.clone(), &namespace);
|
||||
let cm_api: Api<ConfigMap> = Api::namespaced(client.clone(), &namespace);
|
||||
|
||||
tokio::spawn(watch_secret(secret_api, cert_path, key_path, tx.clone()));
|
||||
tokio::spawn(watch_configmap(cm_api, tx));
|
||||
tokio::spawn(watch_secret(secret_api, tls_secret, cert_path, key_path, tx.clone()));
|
||||
tokio::spawn(watch_configmap(cm_api, config_configmap, tx));
|
||||
|
||||
if rx.recv().await.is_some() {
|
||||
tracing::info!("initiating graceful upgrade");
|
||||
@@ -32,11 +39,13 @@ pub async fn run_watcher(client: Client, cert_path: String, key_path: String) {
|
||||
|
||||
async fn watch_secret(
|
||||
api: Api<Secret>,
|
||||
secret_name: String,
|
||||
cert_path: String,
|
||||
key_path: String,
|
||||
tx: mpsc::Sender<()>,
|
||||
) {
|
||||
let cfg = watcher::Config::default().fields("metadata.name=pingora-tls");
|
||||
let field_selector = format!("metadata.name={secret_name}");
|
||||
let cfg = watcher::Config::default().fields(&field_selector);
|
||||
let mut stream = Box::pin(watcher(api, cfg));
|
||||
let mut initialized = false;
|
||||
|
||||
@@ -44,14 +53,10 @@ async fn watch_secret(
|
||||
match result {
|
||||
Ok(watcher::Event::InitDone) => {
|
||||
initialized = true;
|
||||
tracing::debug!("pingora-tls watcher ready");
|
||||
tracing::debug!(%secret_name, "TLS secret watcher ready");
|
||||
}
|
||||
// Write the new cert directly from the event object before triggering the
|
||||
// upgrade. The Apply event carries the full updated Secret, so we don't
|
||||
// need a separate API call and the cert files are ready before the new
|
||||
// process's svc.add_tls() runs.
|
||||
Ok(watcher::Event::Apply(secret)) if initialized => {
|
||||
tracing::info!("pingora-tls changed — writing new cert");
|
||||
tracing::info!(%secret_name, "TLS secret changed — writing new cert");
|
||||
match crate::cert::write_from_secret(&secret, &cert_path, &key_path) {
|
||||
Ok(()) => {
|
||||
let _ = tx.send(()).await;
|
||||
@@ -69,8 +74,9 @@ async fn watch_secret(
|
||||
}
|
||||
}
|
||||
|
||||
async fn watch_configmap(api: Api<ConfigMap>, tx: mpsc::Sender<()>) {
|
||||
let cfg = watcher::Config::default().fields("metadata.name=pingora-config");
|
||||
async fn watch_configmap(api: Api<ConfigMap>, configmap_name: String, tx: mpsc::Sender<()>) {
|
||||
let field_selector = format!("metadata.name={configmap_name}");
|
||||
let cfg = watcher::Config::default().fields(&field_selector);
|
||||
let mut stream = Box::pin(watcher(api, cfg));
|
||||
let mut initialized = false;
|
||||
|
||||
@@ -78,10 +84,10 @@ async fn watch_configmap(api: Api<ConfigMap>, tx: mpsc::Sender<()>) {
|
||||
match result {
|
||||
Ok(watcher::Event::InitDone) => {
|
||||
initialized = true;
|
||||
tracing::debug!("pingora-config watcher ready");
|
||||
tracing::debug!(%configmap_name, "config watcher ready");
|
||||
}
|
||||
Ok(watcher::Event::Apply(_)) if initialized => {
|
||||
tracing::info!("pingora-config changed — triggering upgrade");
|
||||
tracing::info!(%configmap_name, "config changed — triggering upgrade");
|
||||
let _ = tx.send(()).await;
|
||||
return;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user