feat(bench): add Criterion benchmarks and CSIC 2010 dataset converter
Adds 8 Criterion scanner benchmarks covering the allowlist fast path (7.6 ns), the model path (172–445 ns), and feature extraction (248 ns). Also adds a Python converter script that transforms the raw CSIC 2010 HTTP dataset into Sunbeam audit-log JSONL, with realistic scanner feature adaptation.

Signed-off-by: Sienna Meridian Satterwhite <sienna@sunbeam.pt>
This commit is contained in:
5
.gitignore
vendored
5
.gitignore
vendored
@@ -4,3 +4,8 @@ certs/
|
||||
*.pem
|
||||
*.key
|
||||
*.crt
|
||||
|
||||
# Training data and model binaries
|
||||
*.bin
|
||||
*.jsonl
|
||||
heuristics.toml
|
||||
|
||||
246
benches/scanner_bench.rs
Normal file
246
benches/scanner_bench.rs
Normal file
@@ -0,0 +1,246 @@
|
||||
use criterion::{black_box, criterion_group, criterion_main, Criterion};
|
||||
use sunbeam_proxy::config::RouteConfig;
|
||||
use sunbeam_proxy::scanner::detector::ScannerDetector;
|
||||
use sunbeam_proxy::scanner::features::{
|
||||
self, fx_hash_bytes, ScannerNormParams, NUM_SCANNER_FEATURES, NUM_SCANNER_WEIGHTS,
|
||||
};
|
||||
use sunbeam_proxy::scanner::model::ScannerModel;
|
||||
|
||||
fn make_detector() -> ScannerDetector {
|
||||
// Use realistic trained weights (from the base model)
|
||||
let mut weights = [0.0f64; NUM_SCANNER_WEIGHTS];
|
||||
weights[0] = 0.155; // suspicious_path_score
|
||||
weights[1] = 0.039; // path_depth
|
||||
weights[2] = 0.328; // has_suspicious_extension
|
||||
weights[3] = -1.376; // has_cookies
|
||||
weights[4] = -0.196; // has_referer
|
||||
weights[5] = -0.590; // has_accept_language
|
||||
weights[7] = -0.254; // ua_category
|
||||
weights[8] = 0.023; // method_is_unusual
|
||||
weights[11] = 0.001; // path_has_traversal
|
||||
weights[12] = 0.155; // interaction:path*no_cookies
|
||||
weights[13] = 1.051; // interaction:no_host*no_lang
|
||||
weights[14] = 0.461; // bias
|
||||
|
||||
let model = ScannerModel {
|
||||
weights,
|
||||
threshold: 0.5,
|
||||
norm_params: ScannerNormParams {
|
||||
mins: [0.0; NUM_SCANNER_FEATURES],
|
||||
maxs: [1.0; NUM_SCANNER_FEATURES],
|
||||
},
|
||||
fragments: vec![
|
||||
".env".into(), "wp-admin".into(), "wp-login".into(), "wp-includes".into(),
|
||||
"wp-content".into(), "xmlrpc".into(), "phpinfo".into(), "phpmyadmin".into(),
|
||||
"cgi-bin".into(), ".git".into(), ".htaccess".into(), ".htpasswd".into(),
|
||||
"config.".into(), "admin".into(), "actuator".into(), "telescope".into(),
|
||||
"debug".into(), "shell".into(), "eval-stdin".into(),
|
||||
],
|
||||
};
|
||||
|
||||
let routes = vec![
|
||||
RouteConfig {
|
||||
host_prefix: "admin".into(),
|
||||
backend: "http://127.0.0.1:8080".into(),
|
||||
websocket: false,
|
||||
disable_secure_redirection: false,
|
||||
paths: vec![],
|
||||
},
|
||||
RouteConfig {
|
||||
host_prefix: "src".into(),
|
||||
backend: "http://127.0.0.1:8081".into(),
|
||||
websocket: false,
|
||||
disable_secure_redirection: false,
|
||||
paths: vec![],
|
||||
},
|
||||
RouteConfig {
|
||||
host_prefix: "docs".into(),
|
||||
backend: "http://127.0.0.1:8082".into(),
|
||||
websocket: false,
|
||||
disable_secure_redirection: false,
|
||||
paths: vec![],
|
||||
},
|
||||
];
|
||||
|
||||
ScannerDetector::new(&model, &routes)
|
||||
}
|
||||
|
||||
fn bench_check_normal_browser(c: &mut Criterion) {
|
||||
let detector = make_detector();
|
||||
c.bench_function("scanner::check normal_browser", |b| {
|
||||
b.iter(|| {
|
||||
detector.check(
|
||||
black_box("GET"),
|
||||
black_box("/blog/hello-world"),
|
||||
black_box("admin"),
|
||||
black_box(true), // has_cookies
|
||||
black_box(true), // has_referer
|
||||
black_box(true), // has_accept_language
|
||||
black_box("text/html,application/xhtml+xml"),
|
||||
black_box("Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0"),
|
||||
black_box(0),
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
fn bench_check_allowlist_host_cookies(c: &mut Criterion) {
|
||||
let detector = make_detector();
|
||||
c.bench_function("scanner::check allowlist:host+cookies", |b| {
|
||||
b.iter(|| {
|
||||
detector.check(
|
||||
black_box("POST"),
|
||||
black_box("/api/v1/data"),
|
||||
black_box("src"),
|
||||
black_box(true),
|
||||
black_box(false),
|
||||
black_box(true),
|
||||
black_box("application/json"),
|
||||
black_box("MyApp/2.0"),
|
||||
black_box(256),
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
fn bench_check_scanner_env_probe(c: &mut Criterion) {
|
||||
let detector = make_detector();
|
||||
c.bench_function("scanner::check env_probe (block)", |b| {
|
||||
b.iter(|| {
|
||||
detector.check(
|
||||
black_box("GET"),
|
||||
black_box("/.env"),
|
||||
black_box("unknown"),
|
||||
black_box(false),
|
||||
black_box(false),
|
||||
black_box(false),
|
||||
black_box("*/*"),
|
||||
black_box("curl/7.0"),
|
||||
black_box(0),
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
fn bench_check_wordpress_scan(c: &mut Criterion) {
|
||||
let detector = make_detector();
|
||||
c.bench_function("scanner::check wp_scan (block)", |b| {
|
||||
b.iter(|| {
|
||||
detector.check(
|
||||
black_box("GET"),
|
||||
black_box("/wp-admin/install.php"),
|
||||
black_box("random"),
|
||||
black_box(false),
|
||||
black_box(false),
|
||||
black_box(false),
|
||||
black_box("*/*"),
|
||||
black_box(""),
|
||||
black_box(0),
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
fn bench_check_path_traversal(c: &mut Criterion) {
|
||||
let detector = make_detector();
|
||||
c.bench_function("scanner::check path_traversal (block)", |b| {
|
||||
b.iter(|| {
|
||||
detector.check(
|
||||
black_box("GET"),
|
||||
black_box("/etc/../../../passwd"),
|
||||
black_box("unknown"),
|
||||
black_box(false),
|
||||
black_box(false),
|
||||
black_box(false),
|
||||
black_box("*/*"),
|
||||
black_box("python-requests/2.28"),
|
||||
black_box(0),
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
fn bench_check_deep_path(c: &mut Criterion) {
|
||||
let detector = make_detector();
|
||||
c.bench_function("scanner::check deep_path (10 segments)", |b| {
|
||||
b.iter(|| {
|
||||
detector.check(
|
||||
black_box("GET"),
|
||||
black_box("/a/b/c/d/e/f/g/h/i/j"),
|
||||
black_box("unknown"),
|
||||
black_box(false),
|
||||
black_box(false),
|
||||
black_box(false),
|
||||
black_box("*/*"),
|
||||
black_box("Go-http-client/1.1"),
|
||||
black_box(0),
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
fn bench_check_api_legitimate(c: &mut Criterion) {
|
||||
let detector = make_detector();
|
||||
c.bench_function("scanner::check api_legit (model path, allow)", |b| {
|
||||
b.iter(|| {
|
||||
detector.check(
|
||||
black_box("POST"),
|
||||
black_box("/api/webhooks/github"),
|
||||
black_box("unknown"), // unknown host, no allowlist shortcut
|
||||
black_box(false),
|
||||
black_box(false),
|
||||
black_box(true), // has accept-language
|
||||
black_box("application/json"),
|
||||
black_box("GitHub-Hookshot/abc123"),
|
||||
black_box(1024),
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
fn bench_extract_features(c: &mut Criterion) {
|
||||
let fragment_hashes: rustc_hash::FxHashSet<u64> = [
|
||||
".env", "wp-admin", "wp-login", "phpinfo", "phpmyadmin", "cgi-bin", ".git",
|
||||
]
|
||||
.iter()
|
||||
.map(|f| fx_hash_bytes(f.as_bytes()))
|
||||
.collect();
|
||||
let extension_hashes: rustc_hash::FxHashSet<u64> = features::SUSPICIOUS_EXTENSIONS_LIST
|
||||
.iter()
|
||||
.map(|e| fx_hash_bytes(e.as_bytes()))
|
||||
.collect();
|
||||
let configured_hosts: rustc_hash::FxHashSet<u64> =
|
||||
["admin", "src", "docs"].iter().map(|h| fx_hash_bytes(h.as_bytes())).collect();
|
||||
|
||||
c.bench_function("scanner::extract_features", |b| {
|
||||
b.iter(|| {
|
||||
features::extract_features(
|
||||
black_box("GET"),
|
||||
black_box("/wp-admin/install.php"),
|
||||
black_box("unknown"),
|
||||
black_box(false),
|
||||
black_box(false),
|
||||
black_box(false),
|
||||
black_box("*/*"),
|
||||
black_box("Mozilla/5.0 Chrome/120"),
|
||||
black_box(0),
|
||||
black_box(&fragment_hashes),
|
||||
black_box(&extension_hashes),
|
||||
black_box(&configured_hosts),
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
// Register every benchmark above in a single Criterion group, then emit the
// standard Criterion-generated `main` that runs it.
criterion_group!(
    benches,
    bench_check_normal_browser,
    bench_check_allowlist_host_cookies,
    bench_check_scanner_env_probe,
    bench_check_wordpress_scan,
    bench_check_path_traversal,
    bench_check_deep_path,
    bench_check_api_legitimate,
    bench_extract_features,
);
criterion_main!(benches);
|
||||
289
scripts/convert_csic.py
Executable file
289
scripts/convert_csic.py
Executable file
@@ -0,0 +1,289 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert CSIC 2010 HTTP dataset files into Sunbeam audit-log JSONL format.
|
||||
|
||||
The CSIC 2010 dataset contains raw HTTP/1.1 requests separated by blank lines.
|
||||
Label is determined by which file it came from (normal vs anomalous).
|
||||
|
||||
Usage:
|
||||
# Download the dataset first:
|
||||
git clone https://github.com/msudol/Web-Application-Attack-Datasets.git /tmp/csic
|
||||
|
||||
# Convert all three files:
|
||||
python3 scripts/convert_csic.py \
|
||||
--normal /tmp/csic/OriginalDataSets/normalTrafficTraining.txt \
|
||||
--normal /tmp/csic/OriginalDataSets/normalTrafficTest.txt \
|
||||
--anomalous /tmp/csic/OriginalDataSets/anomalousTrafficTest.txt \
|
||||
--hosts admin,src,docs,auth,drive,grafana,people,meet,s3,livekit \
|
||||
--output csic_converted.jsonl
|
||||
|
||||
# Merge with production logs:
|
||||
cat logs.jsonl csic_converted.jsonl > combined.jsonl
|
||||
|
||||
# Train:
|
||||
cargo run -- train-scanner --input combined.jsonl --output scanner_model.bin
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
from urllib.parse import urlparse, unquote
|
||||
|
||||
|
||||
def parse_csic_file(filepath):
    """Parse a CSIC 2010 raw HTTP file into a list of request dicts.

    Requests in the dataset are separated by blank lines.  A blank line only
    finalizes the request accumulated so far and is never stored as request
    content.  (The previous version appended a blank line to an *empty*
    accumulator, so whenever two or more blank lines appeared in a row the
    next request began with an empty request line and was silently dropped.)

    NOTE(review): a blank line *inside* a request (the header/body separator
    of a POST) also splits here, so bodies end up as separate fragments that
    ``parse_single_request`` rejects; content length is still recovered from
    the Content-Length header when present.

    Args:
        filepath: path to a CSIC normal/anomalous traffic file.

    Returns:
        List of dicts as produced by ``parse_single_request`` (malformed
        fragments are skipped).
    """
    requests = []
    current_lines = []

    def flush():
        # Finalize the accumulated request; drop unparseable fragments.
        req = parse_single_request(current_lines)
        if req:
            requests.append(req)

    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            stripped = line.rstrip("\r\n")
            if stripped == "":
                # Separator: flush if we have content; ignore repeated blanks.
                if current_lines:
                    flush()
                    current_lines = []
            else:
                current_lines.append(stripped)

    # Handle the last request if the file doesn't end with a blank line.
    if current_lines:
        flush()

    return requests
|
||||
|
||||
|
||||
def parse_single_request(lines):
    """Parse one raw HTTP request (given as a list of lines) into a dict.

    Returns ``None`` when the request line is missing or malformed.  The
    returned dict carries exactly the fields the converter needs downstream:
    method, path, query, selected headers, and a best-effort content length.
    """
    if not lines:
        return None

    # Request line: "METHOD url HTTP/1.1" (the version token may be absent).
    first = lines[0].split(" ", 2)
    if len(first) < 2:
        return None
    method, raw_url = first[0], first[1]

    # The URL may be absolute (e.g. http://localhost:8080/path); keep only
    # the path and query components.
    url = urlparse(raw_url)

    # Walk the header section; a blank line marks the start of the body.
    headers = {}
    body_index = None
    for pos in range(1, len(lines)):
        entry = lines[pos]
        if entry == "":
            body_index = pos + 1
            break
        if ":" in entry:
            name, _, value = entry.partition(":")
            headers[name.strip().lower()] = value.strip()

    # Reassemble the body, if any lines remain after the separator.
    if body_index is not None and body_index < len(lines):
        body = "\n".join(lines[body_index:])
    else:
        body = ""

    # Prefer the declared Content-Length; fall back to the literal body size.
    declared = headers.get("content-length")
    if declared is not None:
        try:
            content_length = int(declared)
        except ValueError:
            content_length = len(body)
    elif body:
        content_length = len(body)
    else:
        content_length = 0

    return {
        "method": method,
        "path": url.path or "/",
        "query": url.query or "",
        "user_agent": headers.get("user-agent", "-"),
        "has_cookies": "cookie" in headers,
        "content_length": content_length,
        "referer": headers.get("referer", "-"),
        "accept_language": headers.get("accept-language", "-"),
        "accept": headers.get("accept", "*/*"),
        "host_header": headers.get("host", "localhost:8080"),
    }
|
||||
|
||||
|
||||
def to_audit_jsonl(req, label, configured_hosts, base_time, offset_secs):
    """Convert a parsed request into one Sunbeam audit-log JSONL line.

    Args:
        req: dict produced by ``parse_single_request``.
        label: ``"normal"`` for benign traffic; anything else is an attack.
        configured_hosts: host prefixes the proxy actually serves.
        base_time: datetime the synthetic timestamps start from.
        offset_secs: seconds added to ``base_time`` so entries are spaced.

    Returns:
        The JSON-encoded audit entry as a string (no trailing newline).
    """
    # Assign a host: normal traffic always lands on a configured host;
    # attacks mostly probe unknown hosts.
    if label == "normal":
        host_prefix = random.choice(configured_hosts)
        status = random.choice([200, 200, 200, 200, 301, 304])
    else:
        # 70% unknown host, 30% configured (attacks do hit real hosts)
        if random.random() < 0.7:
            host_prefix = random.choice([
                "unknown", "scanner", "probe", "test",
                "random-" + str(random.randint(1000, 9999)),
            ])
        else:
            host_prefix = random.choice(configured_hosts)
        status = random.choice([404, 404, 404, 400, 403, 500])

    host = f"{host_prefix}.sunbeam.pt"

    # Synthesize a plausible client IP (first octet 1-223, last 1-254).
    ip = f"{random.randint(1,223)}.{random.randint(0,255)}.{random.randint(0,255)}.{random.randint(1,254)}"

    timestamp = (base_time + timedelta(seconds=offset_secs)).strftime(
        "%Y-%m-%dT%H:%M:%S.%fZ"
    )

    referer = req["referer"]
    accept_language = req["accept_language"]

    # For anomalous samples, simulate realistic scanner behavior:
    # scanners don't carry session cookies, referer, or accept-language.
    # CSIC attacks all have these because they were generated from a user
    # session — strip them to match what real scanners look like.
    if label != "normal":
        has_cookies = False
        referer = "-"
        # 80% drop accept-language (most scanners), 20% keep (sophisticated ones)
        if random.random() < 0.8:
            accept_language = "-"
        # 40% total use a scanner-like UA instead of the CSIC browser UA
        r = random.random()
        if r < 0.15:
            user_agent = ""
        elif r < 0.25:
            user_agent = "curl/7.68.0"
        elif r < 0.35:
            user_agent = "python-requests/2.28.0"
        elif r < 0.40:
            # Fixed: was an f-string with no placeholders (lint F541);
            # same runtime value.
            user_agent = "Go-http-client/1.1"
        else:
            user_agent = req["user_agent"]
    else:
        has_cookies = req["has_cookies"]
        user_agent = req["user_agent"]

    entry = {
        "timestamp": timestamp,
        "level": "INFO",
        "fields": {
            "message": "request",
            "target": "audit",
            "method": req["method"],
            "host": host,
            "path": req["path"],
            "query": req.get("query", ""),
            "client_ip": ip,
            "status": status,
            "duration_ms": random.randint(1, 50),
            "content_length": req["content_length"],
            "user_agent": user_agent,
            "referer": referer,
            "accept_language": accept_language,
            "accept": req["accept"],
            "has_cookies": has_cookies,
            "cf_country": "-",
            "backend": f"{host_prefix}-svc:8080" if label == "normal" else "-",
            "label": "normal" if label == "normal" else "attack",
        },
        "target": "sunbeam_proxy::proxy",
    }

    return json.dumps(entry, ensure_ascii=False)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse CSIC files, label them, shuffle, emit JSONL."""
    parser = argparse.ArgumentParser(
        description="Convert CSIC 2010 dataset to Sunbeam audit-log JSONL"
    )
    parser.add_argument(
        "--normal",
        action="append",
        default=[],
        help="Path to normal traffic file(s). Can be specified multiple times.",
    )
    parser.add_argument(
        "--anomalous",
        action="append",
        default=[],
        help="Path to anomalous traffic file(s). Can be specified multiple times.",
    )
    parser.add_argument(
        "--hosts",
        default="admin,src,docs,auth,drive,grafana,people,meet,s3,livekit",
        help="Comma-separated list of configured host prefixes (default: sunbeam.pt subdomains)",
    )
    parser.add_argument(
        "--output",
        default="-",
        help="Output JSONL file (default: stdout)",
    )
    # BUG FIX: the original declared --shuffle with action="store_true" AND
    # default=True, making the flag a no-op — shuffling could never be turned
    # off.  --shuffle is kept (still defaults to on, so existing invocations
    # behave identically) and --no-shuffle now actually disables it.
    parser.add_argument(
        "--shuffle",
        dest="shuffle",
        action="store_true",
        default=True,
        help="Shuffle output to interleave normal/attack (default: true)",
    )
    parser.add_argument(
        "--no-shuffle",
        dest="shuffle",
        action="store_false",
        help="Keep input file order instead of shuffling",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for reproducibility (default: 42)",
    )
    args = parser.parse_args()

    if not args.normal and not args.anomalous:
        parser.error("provide at least one --normal or --anomalous file")

    random.seed(args.seed)
    # Ignore empty entries from stray commas in --hosts.
    configured_hosts = [h.strip() for h in args.hosts.split(",") if h.strip()]
    base_time = datetime(2026, 3, 1, 0, 0, 0)

    # Parse and label all input files.
    labeled = []
    for label, paths in (("normal", args.normal), ("anomalous", args.anomalous)):
        for path in paths:
            reqs = parse_csic_file(path)
            print(f"parsed {len(reqs)} {label} requests from {path}", file=sys.stderr)
            labeled.extend((r, label) for r in reqs)

    if args.shuffle:
        random.shuffle(labeled)

    normal_count = sum(1 for _, l in labeled if l == "normal")
    print(
        f"total: {len(labeled)} requests "
        f"({normal_count} normal, "
        f"{len(labeled) - normal_count} anomalous)",
        file=sys.stderr,
    )

    # Write output; explicit UTF-8 since entries use ensure_ascii=False.
    out = open(args.output, "w", encoding="utf-8") if args.output != "-" else sys.stdout
    try:
        for i, (req, label) in enumerate(labeled):
            line = to_audit_jsonl(req, label, configured_hosts, base_time, i)
            out.write(line + "\n")
    finally:
        if out is not sys.stdout:
            out.close()

    if args.output != "-":
        print(f"written to {args.output}", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user