feat(bench): add Criterion benchmarks and CSIC 2010 dataset converter

8 scanner benchmarks covering allowlist fast path (7.6ns), model path
(172-445ns), and feature extraction (248ns). Python converter script
transforms CSIC 2010 raw HTTP dataset into Sunbeam audit-log JSONL
with realistic scanner feature adaptation.

Signed-off-by: Sienna Meridian Satterwhite <sienna@sunbeam.pt>
This commit is contained in:
2026-03-10 23:38:20 +00:00
parent 867b6b2489
commit 45f0751e1e
3 changed files with 540 additions and 0 deletions

5
.gitignore vendored
View File

@@ -4,3 +4,8 @@ certs/
*.pem
*.key
*.crt
# Training data and model binaries
*.bin
*.jsonl
heuristics.toml

246
benches/scanner_bench.rs Normal file
View File

@@ -0,0 +1,246 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use sunbeam_proxy::config::RouteConfig;
use sunbeam_proxy::scanner::detector::ScannerDetector;
use sunbeam_proxy::scanner::features::{
self, fx_hash_bytes, ScannerNormParams, NUM_SCANNER_FEATURES, NUM_SCANNER_WEIGHTS,
};
use sunbeam_proxy::scanner::model::ScannerModel;
/// Builds the `ScannerDetector` shared by all `scanner::check` benchmarks.
///
/// The model is assembled by hand: a sparse set of weights (described in the
/// original source as realistic trained weights from the base model), a 0.5
/// threshold, normalisation bounds of 0.0..1.0 for every feature, and a fixed
/// list of suspicious path fragments. Three routes (`admin`, `src`, `docs`)
/// are passed alongside the model to `ScannerDetector::new`.
fn make_detector() -> ScannerDetector {
    // (weight index, value). Indices that do not appear here stay at 0.0.
    const TRAINED_WEIGHTS: [(usize, f64); 12] = [
        (0, 0.155),   // suspicious_path_score
        (1, 0.039),   // path_depth
        (2, 0.328),   // has_suspicious_extension
        (3, -1.376),  // has_cookies
        (4, -0.196),  // has_referer
        (5, -0.590),  // has_accept_language
        (7, -0.254),  // ua_category
        (8, 0.023),   // method_is_unusual
        (11, 0.001),  // path_has_traversal
        (12, 0.155),  // interaction:path*no_cookies
        (13, 1.051),  // interaction:no_host*no_lang
        (14, 0.461),  // bias
    ];
    // Path fragments the model is configured with.
    const FRAGMENTS: [&str; 19] = [
        ".env", "wp-admin", "wp-login", "wp-includes", "wp-content", "xmlrpc", "phpinfo",
        "phpmyadmin", "cgi-bin", ".git", ".htaccess", ".htpasswd", "config.", "admin",
        "actuator", "telescope", "debug", "shell", "eval-stdin",
    ];

    let mut weights = [0.0f64; NUM_SCANNER_WEIGHTS];
    for &(index, value) in &TRAINED_WEIGHTS {
        weights[index] = value;
    }

    let model = ScannerModel {
        weights,
        threshold: 0.5,
        norm_params: ScannerNormParams {
            mins: [0.0; NUM_SCANNER_FEATURES],
            maxs: [1.0; NUM_SCANNER_FEATURES],
        },
        fragments: FRAGMENTS.iter().map(|fragment| (*fragment).into()).collect(),
    };

    // Every benchmark route is a plain HTTP backend with no special options.
    let route = |prefix: &str, backend: &str| RouteConfig {
        host_prefix: prefix.into(),
        backend: backend.into(),
        websocket: false,
        disable_secure_redirection: false,
        paths: vec![],
    };
    let routes = vec![
        route("admin", "http://127.0.0.1:8080"),
        route("src", "http://127.0.0.1:8081"),
        route("docs", "http://127.0.0.1:8082"),
    ];

    ScannerDetector::new(&model, &routes)
}
/// Benchmarks `ScannerDetector::check` for a browser-like GET against the
/// configured `admin` host with cookies, referer and accept-language all set.
///
/// Registered under the Criterion id `scanner::check normal_browser`.
fn bench_check_normal_browser(c: &mut Criterion) {
    let detector = make_detector();

    // Request profile, in the positional order `check` takes its arguments.
    let method = "GET";
    let path = "/blog/hello-world";
    let host = "admin";
    let (has_cookies, has_referer, has_accept_language) = (true, true, true);
    let accept = "text/html,application/xhtml+xml";
    let user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0";
    // Final numeric argument; presumably the request content length (confirm
    // against `ScannerDetector::check`).
    let content_length = 0;

    c.bench_function("scanner::check normal_browser", |b| {
        b.iter(|| {
            detector.check(
                black_box(method),
                black_box(path),
                black_box(host),
                black_box(has_cookies),
                black_box(has_referer),
                black_box(has_accept_language),
                black_box(accept),
                black_box(user_agent),
                black_box(content_length),
            )
        })
    });
}
/// Benchmarks `ScannerDetector::check` for a POST to the configured `src`
/// host that carries cookies and accept-language but no referer.
///
/// Registered under the Criterion id `scanner::check allowlist:host+cookies`.
fn bench_check_allowlist_host_cookies(c: &mut Criterion) {
    let detector = make_detector();

    // Request profile, in the positional order `check` takes its arguments.
    let method = "POST";
    let path = "/api/v1/data";
    let host = "src";
    let (has_cookies, has_referer, has_accept_language) = (true, false, true);
    let accept = "application/json";
    let user_agent = "MyApp/2.0";
    // Final numeric argument; presumably the request content length (confirm
    // against `ScannerDetector::check`).
    let content_length = 256;

    c.bench_function("scanner::check allowlist:host+cookies", |b| {
        b.iter(|| {
            detector.check(
                black_box(method),
                black_box(path),
                black_box(host),
                black_box(has_cookies),
                black_box(has_referer),
                black_box(has_accept_language),
                black_box(accept),
                black_box(user_agent),
                black_box(content_length),
            )
        })
    });
}
/// Benchmarks `ScannerDetector::check` for a `/.env` probe sent by `curl` to
/// a host that is not among the configured routes, with no cookies, referer
/// or accept-language.
///
/// Registered under the Criterion id `scanner::check env_probe (block)`.
fn bench_check_scanner_env_probe(c: &mut Criterion) {
    let detector = make_detector();

    // Request profile, in the positional order `check` takes its arguments.
    let method = "GET";
    let path = "/.env";
    let host = "unknown";
    let (has_cookies, has_referer, has_accept_language) = (false, false, false);
    let accept = "*/*";
    let user_agent = "curl/7.0";
    // Final numeric argument; presumably the request content length (confirm
    // against `ScannerDetector::check`).
    let content_length = 0;

    c.bench_function("scanner::check env_probe (block)", |b| {
        b.iter(|| {
            detector.check(
                black_box(method),
                black_box(path),
                black_box(host),
                black_box(has_cookies),
                black_box(has_referer),
                black_box(has_accept_language),
                black_box(accept),
                black_box(user_agent),
                black_box(content_length),
            )
        })
    });
}
/// Benchmarks `ScannerDetector::check` for a WordPress installer probe with
/// an empty user agent, sent to a host that is not among the configured
/// routes.
///
/// Registered under the Criterion id `scanner::check wp_scan (block)`.
fn bench_check_wordpress_scan(c: &mut Criterion) {
    let detector = make_detector();

    // Request profile, in the positional order `check` takes its arguments.
    let method = "GET";
    let path = "/wp-admin/install.php";
    let host = "random";
    let (has_cookies, has_referer, has_accept_language) = (false, false, false);
    let accept = "*/*";
    let user_agent = "";
    // Final numeric argument; presumably the request content length (confirm
    // against `ScannerDetector::check`).
    let content_length = 0;

    c.bench_function("scanner::check wp_scan (block)", |b| {
        b.iter(|| {
            detector.check(
                black_box(method),
                black_box(path),
                black_box(host),
                black_box(has_cookies),
                black_box(has_referer),
                black_box(has_accept_language),
                black_box(accept),
                black_box(user_agent),
                black_box(content_length),
            )
        })
    });
}
/// Benchmarks `ScannerDetector::check` for a `..`-laden path requested by a
/// `python-requests` client against a host that is not among the configured
/// routes.
///
/// Registered under the Criterion id `scanner::check path_traversal (block)`.
fn bench_check_path_traversal(c: &mut Criterion) {
    let detector = make_detector();

    // Request profile, in the positional order `check` takes its arguments.
    let method = "GET";
    let path = "/etc/../../../passwd";
    let host = "unknown";
    let (has_cookies, has_referer, has_accept_language) = (false, false, false);
    let accept = "*/*";
    let user_agent = "python-requests/2.28";
    // Final numeric argument; presumably the request content length (confirm
    // against `ScannerDetector::check`).
    let content_length = 0;

    c.bench_function("scanner::check path_traversal (block)", |b| {
        b.iter(|| {
            detector.check(
                black_box(method),
                black_box(path),
                black_box(host),
                black_box(has_cookies),
                black_box(has_referer),
                black_box(has_accept_language),
                black_box(accept),
                black_box(user_agent),
                black_box(content_length),
            )
        })
    });
}
/// Benchmarks `ScannerDetector::check` for a ten-segment path requested by a
/// Go HTTP client against a host that is not among the configured routes.
///
/// Registered under the Criterion id `scanner::check deep_path (10 segments)`.
fn bench_check_deep_path(c: &mut Criterion) {
    let detector = make_detector();

    // Request profile, in the positional order `check` takes its arguments.
    let method = "GET";
    let path = "/a/b/c/d/e/f/g/h/i/j";
    let host = "unknown";
    let (has_cookies, has_referer, has_accept_language) = (false, false, false);
    let accept = "*/*";
    let user_agent = "Go-http-client/1.1";
    // Final numeric argument; presumably the request content length (confirm
    // against `ScannerDetector::check`).
    let content_length = 0;

    c.bench_function("scanner::check deep_path (10 segments)", |b| {
        b.iter(|| {
            detector.check(
                black_box(method),
                black_box(path),
                black_box(host),
                black_box(has_cookies),
                black_box(has_referer),
                black_box(has_accept_language),
                black_box(accept),
                black_box(user_agent),
                black_box(content_length),
            )
        })
    });
}
/// Benchmarks `ScannerDetector::check` for a webhook-style POST to an
/// unconfigured host, so there is no allowlist shortcut for the request.
///
/// Registered under the Criterion id
/// `scanner::check api_legit (model path, allow)`.
fn bench_check_api_legitimate(c: &mut Criterion) {
    let detector = make_detector();

    // Request profile, in the positional order `check` takes its arguments.
    let method = "POST";
    let path = "/api/webhooks/github";
    let host = "unknown"; // unknown host, no allowlist shortcut
    let has_cookies = false;
    let has_referer = false;
    let has_accept_language = true; // the only browser-like header present
    let accept = "application/json";
    let user_agent = "GitHub-Hookshot/abc123";
    // Final numeric argument; presumably the request content length (confirm
    // against `ScannerDetector::check`).
    let content_length = 1024;

    c.bench_function("scanner::check api_legit (model path, allow)", |b| {
        b.iter(|| {
            detector.check(
                black_box(method),
                black_box(path),
                black_box(host),
                black_box(has_cookies),
                black_box(has_referer),
                black_box(has_accept_language),
                black_box(accept),
                black_box(user_agent),
                black_box(content_length),
            )
        })
    });
}
/// Benchmarks `features::extract_features` on its own, without going through
/// a `ScannerDetector`.
///
/// The three lookup sets (fragment, extension and configured-host hashes) are
/// built once up front with `fx_hash_bytes`, so only the extraction call sits
/// inside the timed closure. Registered under the Criterion id
/// `scanner::extract_features`.
fn bench_extract_features(c: &mut Criterion) {
    type HashSet64 = rustc_hash::FxHashSet<u64>;

    // Reduced fragment list used only by this benchmark.
    let fragments = [
        ".env", "wp-admin", "wp-login", "phpinfo", "phpmyadmin", "cgi-bin", ".git",
    ];
    let hosts = ["admin", "src", "docs"];

    let fragment_hashes: HashSet64 = fragments
        .iter()
        .map(|fragment| fx_hash_bytes(fragment.as_bytes()))
        .collect();
    let extension_hashes: HashSet64 = features::SUSPICIOUS_EXTENSIONS_LIST
        .iter()
        .map(|extension| fx_hash_bytes(extension.as_bytes()))
        .collect();
    let configured_hosts: HashSet64 = hosts
        .iter()
        .map(|host| fx_hash_bytes(host.as_bytes()))
        .collect();

    // Request profile, in the positional order `extract_features` takes it.
    let method = "GET";
    let path = "/wp-admin/install.php";
    let host = "unknown";
    let (has_cookies, has_referer, has_accept_language) = (false, false, false);
    let accept = "*/*";
    let user_agent = "Mozilla/5.0 Chrome/120";
    // Ninth argument; presumably the request content length (confirm against
    // `features::extract_features`).
    let content_length = 0;

    c.bench_function("scanner::extract_features", |b| {
        b.iter(|| {
            features::extract_features(
                black_box(method),
                black_box(path),
                black_box(host),
                black_box(has_cookies),
                black_box(has_referer),
                black_box(has_accept_language),
                black_box(accept),
                black_box(user_agent),
                black_box(content_length),
                black_box(&fragment_hashes),
                black_box(&extension_hashes),
                black_box(&configured_hosts),
            )
        })
    });
}
// Collect every benchmark function defined in this file into the `benches`
// group. A benchmark that is not listed here is never run.
criterion_group!(
benches,
bench_check_normal_browser,
bench_check_allowlist_host_cookies,
bench_check_scanner_env_probe,
bench_check_wordpress_scan,
bench_check_path_traversal,
bench_check_deep_path,
bench_check_api_legitimate,
bench_extract_features,
);
// Generate the benchmark binary's entry point for the `benches` group.
criterion_main!(benches);

289
scripts/convert_csic.py Executable file
View File

@@ -0,0 +1,289 @@
#!/usr/bin/env python3
"""
Convert CSIC 2010 HTTP dataset files into Sunbeam audit-log JSONL format.
The CSIC 2010 dataset contains raw HTTP/1.1 requests separated by blank lines.
Label is determined by which file it came from (normal vs anomalous).
Usage:
# Download the dataset first:
git clone https://github.com/msudol/Web-Application-Attack-Datasets.git /tmp/csic
# Convert all three files:
python3 scripts/convert_csic.py \
--normal /tmp/csic/OriginalDataSets/normalTrafficTraining.txt \
--normal /tmp/csic/OriginalDataSets/normalTrafficTest.txt \
--anomalous /tmp/csic/OriginalDataSets/anomalousTrafficTest.txt \
--hosts admin,src,docs,auth,drive,grafana,people,meet,s3,livekit \
--output csic_converted.jsonl
# Merge with production logs:
cat logs.jsonl csic_converted.jsonl > combined.jsonl
# Train:
cargo run -- train-scanner --input combined.jsonl --output scanner_model.bin
"""
import argparse
import json
import random
import sys
from datetime import datetime, timedelta
from urllib.parse import urlparse, unquote
def parse_csic_file(filepath):
    """Parse a CSIC 2010 raw HTTP file into individual requests.

    The file is read as UTF-8 (undecodable bytes are replaced) and split into
    chunks at blank lines. Each chunk is handed to ``parse_single_request``;
    chunks it rejects (returns a falsy value for) are dropped.

    Blank lines only ever act as separators: a leading blank line or a run of
    several consecutive blank lines never becomes part of the next chunk.
    Previously such a blank line was stored as the first line of the next
    chunk, which made the following request unparseable and silently lost.

    Args:
        filepath: Path of the raw CSIC text file.

    Returns:
        A list of the parsed request dicts, in file order.
    """
    requests = []
    current_lines = []
    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            stripped = line.rstrip("\r\n")
            if stripped:
                current_lines.append(stripped)
                continue
            # Blank line: close the pending chunk if there is one, otherwise
            # ignore it (it is just an extra separator).
            if current_lines:
                req = parse_single_request(current_lines)
                if req:
                    requests.append(req)
                current_lines = []
    # Handle the last request if the file doesn't end with a blank line.
    if current_lines:
        req = parse_single_request(current_lines)
        if req:
            requests.append(req)
    return requests
def parse_single_request(lines):
    """Parse a single HTTP request from its lines into a dict of headers/fields.

    Args:
        lines: The request as a list of strings without line terminators.
            ``lines[0]`` is the request line (``METHOD url [HTTP/x.y]``),
            followed by ``Name: value`` header lines. If an empty string is
            present it marks the end of the headers and everything after it is
            treated as the body.

    Returns:
        A dict with the keys ``method``, ``path``, ``query``, ``user_agent``,
        ``has_cookies``, ``content_length``, ``referer``, ``accept_language``,
        ``accept`` and ``host_header``; or ``None`` when the chunk is not a
        usable request: no lines, fewer than two space-separated fields on the
        request line, a method that is not a valid HTTP token, or a URL that
        ``urlparse`` rejects.
    """
    if not lines:
        return None

    # First line: METHOD url HTTP/1.1
    parts = lines[0].split(" ", 2)
    if len(parts) < 2:
        return None
    method, raw_url = parts[0], parts[1]

    # A method must be an HTTP "token". This rejects stray chunks that are not
    # requests at all, e.g. a form body such as "id=1&name=a b" whose embedded
    # space would otherwise make it look like "METHOD url".
    token_chars = (
        "!#$%&'*+-.^_`|~0123456789"
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    )
    if not method or any(ch not in token_chars for ch in method):
        return None

    # Extract path from URL (may be absolute like http://localhost:8080/path).
    # urlparse raises ValueError on malformed bracketed hosts; skip such
    # requests instead of aborting the whole conversion.
    try:
        parsed = urlparse(raw_url)
    except ValueError:
        return None
    path = parsed.path or "/"
    query = parsed.query or ""

    # Parse headers; names are lower-cased, later duplicates overwrite earlier.
    headers = {}
    body_start = None
    for i, line in enumerate(lines[1:], start=1):
        if line == "":
            body_start = i + 1
            break
        if ":" in line:
            key, _, value = line.partition(":")
            headers[key.strip().lower()] = value.strip()

    # Extract body if present
    body = ""
    if body_start and body_start < len(lines):
        body = "\n".join(lines[body_start:])

    # Prefer the declared Content-Length; fall back to the body length when
    # the header is missing or not an integer.
    content_length = 0
    if "content-length" in headers:
        try:
            content_length = int(headers["content-length"])
        except ValueError:
            content_length = len(body)
    elif body:
        content_length = len(body)

    return {
        "method": method,
        "path": path,
        "query": query,
        "user_agent": headers.get("user-agent", "-"),
        "has_cookies": "cookie" in headers,
        "content_length": content_length,
        "referer": headers.get("referer", "-"),
        "accept_language": headers.get("accept-language", "-"),
        "accept": headers.get("accept", "*/*"),
        "host_header": headers.get("host", "localhost:8080"),
    }
def to_audit_jsonl(req, label, configured_hosts, base_time, offset_secs):
    """Render one parsed request as a single audit-log JSON line.

    Args:
        req: Request dict as produced by ``parse_single_request``.
        label: ``"normal"`` for benign traffic; any other value is treated as
            an attack sample.
        configured_hosts: Host prefixes to draw configured hosts from.
        base_time: ``datetime`` of the first record.
        offset_secs: Seconds added to ``base_time`` for this record.

    Returns:
        The JSON-encoded entry (no trailing newline). Host, status, client IP
        and duration are synthesised with the module-level ``random``
        generator, so output depends on its seed and on call order.
    """
    is_normal = label == "normal"

    # Host and response status: normal traffic always hits a configured host.
    if is_normal:
        host_prefix = random.choice(configured_hosts)
        status = random.choice([200, 200, 200, 200, 301, 304])
    else:
        # 70% unknown host, 30% configured (attacks do hit real hosts).
        if random.random() < 0.7:
            decoys = ["unknown", "scanner", "probe", "test"]
            decoys.append("random-" + str(random.randint(1000, 9999)))
            host_prefix = random.choice(decoys)
        else:
            host_prefix = random.choice(configured_hosts)
        status = random.choice([404, 404, 404, 400, 403, 500])
    host = f"{host_prefix}.sunbeam.pt"

    # Synthetic client IP, one octet at a time.
    octets = [
        random.randint(1, 223),
        random.randint(0, 255),
        random.randint(0, 255),
        random.randint(1, 254),
    ]
    ip = ".".join(str(octet) for octet in octets)

    moment = base_time + timedelta(seconds=offset_secs)
    timestamp = moment.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    referer = req["referer"]
    accept_language = req["accept_language"]

    if is_normal:
        has_cookies = req["has_cookies"]
        user_agent = req["user_agent"]
    else:
        # CSIC attack samples were generated from a user session, so they all
        # carry cookies, referer and accept-language. Strip these so the
        # samples look like the scanner traffic they are meant to stand in for.
        has_cookies = False
        referer = "-"
        # 80% drop accept-language (most scanners), 20% keep it.
        if random.random() < 0.8:
            accept_language = "-"
        # 40% swap in a scanner-like UA; the rest keep the CSIC browser UA.
        roll = random.random()
        scanner_agents = (
            (0.15, ""),
            (0.25, "curl/7.68.0"),
            (0.35, "python-requests/2.28.0"),
            (0.40, "Go-http-client/1.1"),
        )
        for cutoff, agent in scanner_agents:
            if roll < cutoff:
                user_agent = agent
                break
        else:
            user_agent = req["user_agent"]

    duration_ms = random.randint(1, 50)
    fields = {
        "message": "request",
        "target": "audit",
        "method": req["method"],
        "host": host,
        "path": req["path"],
        "query": req.get("query", ""),
        "client_ip": ip,
        "status": status,
        "duration_ms": duration_ms,
        "content_length": req["content_length"],
        "user_agent": user_agent,
        "referer": referer,
        "accept_language": accept_language,
        "accept": req["accept"],
        "has_cookies": has_cookies,
        "cf_country": "-",
        "backend": f"{host_prefix}-svc:8080" if is_normal else "-",
        "label": "normal" if is_normal else "attack",
    }
    entry = {
        "timestamp": timestamp,
        "level": "INFO",
        "fields": fields,
        "target": "sunbeam_proxy::proxy",
    }
    return json.dumps(entry, ensure_ascii=False)
def main():
    """Command-line entry point: convert CSIC files to audit-log JSONL.

    Reads every ``--normal`` and ``--anomalous`` file, labels the parsed
    requests accordingly, optionally shuffles them, and writes one JSON line
    per request to ``--output`` (or stdout for ``-``). Progress and totals go
    to stderr. Exits through ``parser.error`` when no input file is given or
    ``--hosts`` contains no usable host prefix.
    """
    parser = argparse.ArgumentParser(
        description="Convert CSIC 2010 dataset to Sunbeam audit-log JSONL"
    )
    parser.add_argument(
        "--normal",
        action="append",
        default=[],
        help="Path to normal traffic file(s). Can be specified multiple times.",
    )
    parser.add_argument(
        "--anomalous",
        action="append",
        default=[],
        help="Path to anomalous traffic file(s). Can be specified multiple times.",
    )
    parser.add_argument(
        "--hosts",
        default="admin,src,docs,auth,drive,grafana,people,meet,s3,livekit",
        help="Comma-separated list of configured host prefixes (default: sunbeam.pt subdomains)",
    )
    parser.add_argument(
        "--output",
        default="-",
        help="Output JSONL file (default: stdout)",
    )
    parser.add_argument(
        "--shuffle",
        dest="shuffle",
        action="store_true",
        default=True,
        help="Shuffle output to interleave normal/attack (default: true)",
    )
    # `--shuffle` defaults to true and can only set true, so on its own it can
    # never be switched off; this flag is the way to disable shuffling.
    parser.add_argument(
        "--no-shuffle",
        dest="shuffle",
        action="store_false",
        help="Keep requests in input order instead of shuffling",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for reproducibility (default: 42)",
    )
    args = parser.parse_args()
    if not args.normal and not args.anomalous:
        parser.error("provide at least one --normal or --anomalous file")

    # Drop blank entries ("a,,b", trailing commas): an empty prefix would
    # produce ".sunbeam.pt" hosts, and an empty list would crash random.choice.
    configured_hosts = [h.strip() for h in args.hosts.split(",") if h.strip()]
    if not configured_hosts:
        parser.error("--hosts must contain at least one non-empty host prefix")

    random.seed(args.seed)
    base_time = datetime(2026, 3, 1, 0, 0, 0)

    # Parse all files
    labeled = []
    for path in args.normal:
        reqs = parse_csic_file(path)
        print(f"parsed {len(reqs)} normal requests from {path}", file=sys.stderr)
        labeled.extend((r, "normal") for r in reqs)
    for path in args.anomalous:
        reqs = parse_csic_file(path)
        print(f"parsed {len(reqs)} anomalous requests from {path}", file=sys.stderr)
        labeled.extend((r, "anomalous") for r in reqs)

    if args.shuffle:
        random.shuffle(labeled)

    normal_count = sum(1 for _, lbl in labeled if lbl == "normal")
    anomalous_count = sum(1 for _, lbl in labeled if lbl == "anomalous")
    print(
        f"total: {len(labeled)} requests "
        f"({normal_count} normal, "
        f"{anomalous_count} anomalous)",
        file=sys.stderr,
    )

    # Write output. Records are serialised with ensure_ascii=False, so the
    # file is opened as UTF-8 explicitly rather than with the locale default,
    # which may be unable to encode non-ASCII characters.
    if args.output != "-":
        out = open(args.output, "w", encoding="utf-8")
    else:
        out = sys.stdout
    try:
        # The record index doubles as the timestamp offset in seconds.
        for i, (req, label) in enumerate(labeled):
            line = to_audit_jsonl(req, label, configured_hosts, base_time, i)
            out.write(line + "\n")
    finally:
        if out is not sys.stdout:
            out.close()
    if args.output != "-":
        print(f"written to {args.output}", file=sys.stderr)
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()