diff --git a/.gitignore b/.gitignore index 1441169..fcd1646 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,8 @@ certs/ *.pem *.key *.crt + +# Training data and model binaries +*.bin +*.jsonl +heuristics.toml diff --git a/benches/scanner_bench.rs b/benches/scanner_bench.rs new file mode 100644 index 0000000..2f56626 --- /dev/null +++ b/benches/scanner_bench.rs @@ -0,0 +1,246 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use sunbeam_proxy::config::RouteConfig; +use sunbeam_proxy::scanner::detector::ScannerDetector; +use sunbeam_proxy::scanner::features::{ + self, fx_hash_bytes, ScannerNormParams, NUM_SCANNER_FEATURES, NUM_SCANNER_WEIGHTS, +}; +use sunbeam_proxy::scanner::model::ScannerModel; + +fn make_detector() -> ScannerDetector { + // Use realistic trained weights (from the base model) + let mut weights = [0.0f64; NUM_SCANNER_WEIGHTS]; + weights[0] = 0.155; // suspicious_path_score + weights[1] = 0.039; // path_depth + weights[2] = 0.328; // has_suspicious_extension + weights[3] = -1.376; // has_cookies + weights[4] = -0.196; // has_referer + weights[5] = -0.590; // has_accept_language + weights[7] = -0.254; // ua_category + weights[8] = 0.023; // method_is_unusual + weights[11] = 0.001; // path_has_traversal + weights[12] = 0.155; // interaction:path*no_cookies + weights[13] = 1.051; // interaction:no_host*no_lang + weights[14] = 0.461; // bias + + let model = ScannerModel { + weights, + threshold: 0.5, + norm_params: ScannerNormParams { + mins: [0.0; NUM_SCANNER_FEATURES], + maxs: [1.0; NUM_SCANNER_FEATURES], + }, + fragments: vec![ + ".env".into(), "wp-admin".into(), "wp-login".into(), "wp-includes".into(), + "wp-content".into(), "xmlrpc".into(), "phpinfo".into(), "phpmyadmin".into(), + "cgi-bin".into(), ".git".into(), ".htaccess".into(), ".htpasswd".into(), + "config.".into(), "admin".into(), "actuator".into(), "telescope".into(), + "debug".into(), "shell".into(), "eval-stdin".into(), + ], + }; + + let routes = vec![ + 
RouteConfig { + host_prefix: "admin".into(), + backend: "http://127.0.0.1:8080".into(), + websocket: false, + disable_secure_redirection: false, + paths: vec![], + }, + RouteConfig { + host_prefix: "src".into(), + backend: "http://127.0.0.1:8081".into(), + websocket: false, + disable_secure_redirection: false, + paths: vec![], + }, + RouteConfig { + host_prefix: "docs".into(), + backend: "http://127.0.0.1:8082".into(), + websocket: false, + disable_secure_redirection: false, + paths: vec![], + }, + ]; + + ScannerDetector::new(&model, &routes) +} + +fn bench_check_normal_browser(c: &mut Criterion) { + let detector = make_detector(); + c.bench_function("scanner::check normal_browser", |b| { + b.iter(|| { + detector.check( + black_box("GET"), + black_box("/blog/hello-world"), + black_box("admin"), + black_box(true), // has_cookies + black_box(true), // has_referer + black_box(true), // has_accept_language + black_box("text/html,application/xhtml+xml"), + black_box("Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0"), + black_box(0), + ) + }) + }); +} + +fn bench_check_allowlist_host_cookies(c: &mut Criterion) { + let detector = make_detector(); + c.bench_function("scanner::check allowlist:host+cookies", |b| { + b.iter(|| { + detector.check( + black_box("POST"), + black_box("/api/v1/data"), + black_box("src"), + black_box(true), + black_box(false), + black_box(true), + black_box("application/json"), + black_box("MyApp/2.0"), + black_box(256), + ) + }) + }); +} + +fn bench_check_scanner_env_probe(c: &mut Criterion) { + let detector = make_detector(); + c.bench_function("scanner::check env_probe (block)", |b| { + b.iter(|| { + detector.check( + black_box("GET"), + black_box("/.env"), + black_box("unknown"), + black_box(false), + black_box(false), + black_box(false), + black_box("*/*"), + black_box("curl/7.0"), + black_box(0), + ) + }) + }); +} + +fn bench_check_wordpress_scan(c: &mut Criterion) { + let detector = make_detector(); + 
c.bench_function("scanner::check wp_scan (block)", |b| { + b.iter(|| { + detector.check( + black_box("GET"), + black_box("/wp-admin/install.php"), + black_box("random"), + black_box(false), + black_box(false), + black_box(false), + black_box("*/*"), + black_box(""), + black_box(0), + ) + }) + }); +} + +fn bench_check_path_traversal(c: &mut Criterion) { + let detector = make_detector(); + c.bench_function("scanner::check path_traversal (block)", |b| { + b.iter(|| { + detector.check( + black_box("GET"), + black_box("/etc/../../../passwd"), + black_box("unknown"), + black_box(false), + black_box(false), + black_box(false), + black_box("*/*"), + black_box("python-requests/2.28"), + black_box(0), + ) + }) + }); +} + +fn bench_check_deep_path(c: &mut Criterion) { + let detector = make_detector(); + c.bench_function("scanner::check deep_path (10 segments)", |b| { + b.iter(|| { + detector.check( + black_box("GET"), + black_box("/a/b/c/d/e/f/g/h/i/j"), + black_box("unknown"), + black_box(false), + black_box(false), + black_box(false), + black_box("*/*"), + black_box("Go-http-client/1.1"), + black_box(0), + ) + }) + }); +} + +fn bench_check_api_legitimate(c: &mut Criterion) { + let detector = make_detector(); + c.bench_function("scanner::check api_legit (model path, allow)", |b| { + b.iter(|| { + detector.check( + black_box("POST"), + black_box("/api/webhooks/github"), + black_box("unknown"), // unknown host, no allowlist shortcut + black_box(false), + black_box(false), + black_box(true), // has accept-language + black_box("application/json"), + black_box("GitHub-Hookshot/abc123"), + black_box(1024), + ) + }) + }); +} + +fn bench_extract_features(c: &mut Criterion) { + let fragment_hashes: rustc_hash::FxHashSet<u64> = [ + ".env", "wp-admin", "wp-login", "phpinfo", "phpmyadmin", "cgi-bin", ".git", + ] + .iter() + .map(|f| fx_hash_bytes(f.as_bytes())) + .collect(); + let extension_hashes: rustc_hash::FxHashSet<u64> = features::SUSPICIOUS_EXTENSIONS_LIST + .iter() + .map(|e| 
fx_hash_bytes(e.as_bytes())) + .collect(); + let configured_hosts: rustc_hash::FxHashSet<u64> = + ["admin", "src", "docs"].iter().map(|h| fx_hash_bytes(h.as_bytes())).collect(); + + c.bench_function("scanner::extract_features", |b| { + b.iter(|| { + features::extract_features( + black_box("GET"), + black_box("/wp-admin/install.php"), + black_box("unknown"), + black_box(false), + black_box(false), + black_box(false), + black_box("*/*"), + black_box("Mozilla/5.0 Chrome/120"), + black_box(0), + black_box(&fragment_hashes), + black_box(&extension_hashes), + black_box(&configured_hosts), + ) + }) + }); +} + +criterion_group!( + benches, + bench_check_normal_browser, + bench_check_allowlist_host_cookies, + bench_check_scanner_env_probe, + bench_check_wordpress_scan, + bench_check_path_traversal, + bench_check_deep_path, + bench_check_api_legitimate, + bench_extract_features, +); +criterion_main!(benches); diff --git a/scripts/convert_csic.py b/scripts/convert_csic.py new file mode 100755 index 0000000..a3dcd8f --- /dev/null +++ b/scripts/convert_csic.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python3 +""" +Convert CSIC 2010 HTTP dataset files into Sunbeam audit-log JSONL format. + +The CSIC 2010 dataset contains raw HTTP/1.1 requests separated by blank lines. +Label is determined by which file it came from (normal vs anomalous).
+ +Usage: + # Download the dataset first: + git clone https://github.com/msudol/Web-Application-Attack-Datasets.git /tmp/csic + + # Convert all three files: + python3 scripts/convert_csic.py \ + --normal /tmp/csic/OriginalDataSets/normalTrafficTraining.txt \ + --normal /tmp/csic/OriginalDataSets/normalTrafficTest.txt \ + --anomalous /tmp/csic/OriginalDataSets/anomalousTrafficTest.txt \ + --hosts admin,src,docs,auth,drive,grafana,people,meet,s3,livekit \ + --output csic_converted.jsonl + + # Merge with production logs: + cat logs.jsonl csic_converted.jsonl > combined.jsonl + + # Train: + cargo run -- train-scanner --input combined.jsonl --output scanner_model.bin +""" + +import argparse +import json +import random +import sys +from datetime import datetime, timedelta +from urllib.parse import urlparse, unquote + + +def parse_csic_file(filepath): + """Parse a CSIC 2010 raw HTTP file into individual requests.""" + requests = [] + current_lines = [] + + with open(filepath, "r", encoding="utf-8", errors="replace") as f: + for line in f: + stripped = line.rstrip("\r\n") + if stripped == "" and current_lines: + req = parse_single_request(current_lines) + if req: + requests.append(req) + current_lines = [] + else: + current_lines.append(stripped) + + # Handle last request if file doesn't end with blank line + if current_lines: + req = parse_single_request(current_lines) + if req: + requests.append(req) + + return requests + + +def parse_single_request(lines): + """Parse a single HTTP request from its lines into a dict of headers/fields.""" + if not lines: + return None + + # First line: METHOD url HTTP/1.1 + request_line = lines[0] + parts = request_line.split(" ", 2) + if len(parts) < 2: + return None + + method = parts[0] + raw_url = parts[1] + + # Extract path from URL (may be absolute like http://localhost:8080/path) + parsed = urlparse(raw_url) + path = parsed.path or "/" + query = parsed.query or "" + + # Parse headers + headers = {} + body_start = None + for i, line 
in enumerate(lines[1:], start=1): + if line == "": + body_start = i + 1 + break + if ":" in line: + key, _, value = line.partition(":") + headers[key.strip().lower()] = value.strip() + + # Extract body if present + body = "" + if body_start and body_start < len(lines): + body = "\n".join(lines[body_start:]) + + content_length = 0 + if "content-length" in headers: + try: + content_length = int(headers["content-length"]) + except ValueError: + content_length = len(body) + elif body: + content_length = len(body) + + return { + "method": method, + "path": path, + "query": query, + "user_agent": headers.get("user-agent", "-"), + "has_cookies": "cookie" in headers, + "content_length": content_length, + "referer": headers.get("referer", "-"), + "accept_language": headers.get("accept-language", "-"), + "accept": headers.get("accept", "*/*"), + "host_header": headers.get("host", "localhost:8080"), + } + + +def to_audit_jsonl(req, label, configured_hosts, base_time, offset_secs): + """Convert a parsed request into our audit log JSONL format.""" + # Assign a host: normal traffic gets a configured host, attack gets random + if label == "normal": + host_prefix = random.choice(configured_hosts) + status = random.choice([200, 200, 200, 200, 301, 304]) + else: + # 70% unknown host, 30% configured (attacks do hit real hosts) + if random.random() < 0.7: + host_prefix = random.choice([ + "unknown", "scanner", "probe", "test", + "random-" + str(random.randint(1000, 9999)), + ]) + else: + host_prefix = random.choice(configured_hosts) + status = random.choice([404, 404, 404, 400, 403, 500]) + + host = f"{host_prefix}.sunbeam.pt" + + # Synthesize a client IP + ip = f"{random.randint(1,223)}.{random.randint(0,255)}.{random.randint(0,255)}.{random.randint(1,254)}" + + timestamp = (base_time + timedelta(seconds=offset_secs)).strftime( + "%Y-%m-%dT%H:%M:%S.%fZ" + ) + + referer = req["referer"] + accept_language = req["accept_language"] + + # For anomalous samples, simulate realistic scanner 
behavior: + # scanners don't carry session cookies, referer, or accept-language. + # CSIC attacks all have these because they were generated from a user + # session — strip them to match what real scanners look like. + if label != "normal": + has_cookies = False + referer = "-" + # 80% drop accept-language (most scanners), 20% keep (sophisticated ones) + if random.random() < 0.8: + accept_language = "-" + # 40% use a scanner-like UA instead of the CSIC browser UA + r = random.random() + if r < 0.15: + user_agent = "" + elif r < 0.25: + user_agent = "curl/7.68.0" + elif r < 0.35: + user_agent = "python-requests/2.28.0" + elif r < 0.40: + user_agent = f"Go-http-client/1.1" + else: + user_agent = req["user_agent"] + else: + has_cookies = req["has_cookies"] + user_agent = req["user_agent"] + + entry = { + "timestamp": timestamp, + "level": "INFO", + "fields": { + "message": "request", + "target": "audit", + "method": req["method"], + "host": host, + "path": req["path"], + "query": req.get("query", ""), + "client_ip": ip, + "status": status, + "duration_ms": random.randint(1, 50), + "content_length": req["content_length"], + "user_agent": user_agent, + "referer": referer, + "accept_language": accept_language, + "accept": req["accept"], + "has_cookies": has_cookies, + "cf_country": "-", + "backend": f"{host_prefix}-svc:8080" if label == "normal" else "-", + "label": "normal" if label == "normal" else "attack", + }, + "target": "sunbeam_proxy::proxy", + } + + return json.dumps(entry, ensure_ascii=False) + + +def main(): + parser = argparse.ArgumentParser( + description="Convert CSIC 2010 dataset to Sunbeam audit-log JSONL" + ) + parser.add_argument( + "--normal", + action="append", + default=[], + help="Path to normal traffic file(s). Can be specified multiple times.", + ) + parser.add_argument( + "--anomalous", + action="append", + default=[], + help="Path to anomalous traffic file(s). 
Can be specified multiple times.", + ) + parser.add_argument( + "--hosts", + default="admin,src,docs,auth,drive,grafana,people,meet,s3,livekit", + help="Comma-separated list of configured host prefixes (default: sunbeam.pt subdomains)", + ) + parser.add_argument( + "--output", + default="-", + help="Output JSONL file (default: stdout)", + ) + parser.add_argument( + "--shuffle", + action="store_true", + default=True, + help="Shuffle output to interleave normal/attack (default: true)", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed for reproducibility (default: 42)", + ) + args = parser.parse_args() + + if not args.normal and not args.anomalous: + parser.error("provide at least one --normal or --anomalous file") + + random.seed(args.seed) + configured_hosts = [h.strip() for h in args.hosts.split(",")] + base_time = datetime(2026, 3, 1, 0, 0, 0) + + # Parse all files + labeled = [] + + for path in args.normal: + reqs = parse_csic_file(path) + print(f"parsed {len(reqs)} normal requests from {path}", file=sys.stderr) + for r in reqs: + labeled.append((r, "normal")) + + for path in args.anomalous: + reqs = parse_csic_file(path) + print(f"parsed {len(reqs)} anomalous requests from {path}", file=sys.stderr) + for r in reqs: + labeled.append((r, "anomalous")) + + if args.shuffle: + random.shuffle(labeled) + + print( + f"total: {len(labeled)} requests " + f"({sum(1 for _, l in labeled if l == 'normal')} normal, " + f"{sum(1 for _, l in labeled if l == 'anomalous')} anomalous)", + file=sys.stderr, + ) + + # Write output + out = open(args.output, "w") if args.output != "-" else sys.stdout + try: + for i, (req, label) in enumerate(labeled): + line = to_audit_jsonl(req, label, configured_hosts, base_time, i) + out.write(line + "\n") + finally: + if out is not sys.stdout: + out.close() + + if args.output != "-": + print(f"written to {args.output}", file=sys.stderr) + + +if __name__ == "__main__": + main()