feat(bench): add Criterion benchmarks and CSIC 2010 dataset converter

8 scanner benchmarks covering the allowlist fast path (7.6ns), the model path (172-445ns), and feature extraction (248ns). A Python converter script transforms the CSIC 2010 raw HTTP dataset into Sunbeam audit-log JSONL, adapting the anomalous samples so their features resemble realistic scanner traffic.

Signed-off-by: Sienna Meridian Satterwhite <sienna@sunbeam.pt>
2026-03-10 23:38:20 +00:00
parent 867b6b2489
commit 45f0751e1e
3 changed files with 540 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,8 @@ certs/
 *.pem
 *.key
 *.crt
+
+# Training data and model binaries
+*.bin
+*.jsonl
+heuristics.toml
--- a/benches/scanner_bench.rs
+++ b/benches/scanner_bench.rs
@@ -0,0 +1,246 @@
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use sunbeam_proxy::config::RouteConfig;
+use sunbeam_proxy::scanner::detector::ScannerDetector;
+use sunbeam_proxy::scanner::features::{
+    self, fx_hash_bytes, ScannerNormParams, NUM_SCANNER_FEATURES, NUM_SCANNER_WEIGHTS,
+};
+use sunbeam_proxy::scanner::model::ScannerModel;
+
+/// Builds the `ScannerDetector` shared by every `scanner::check` benchmark.
+///
+/// The model is assembled in place: hand-entered weights, a 0.5 threshold,
+/// normalisation bounds of 0.0 (min) and 1.0 (max) for every feature, and a
+/// fixed list of suspicious path fragments. Three routes (`admin`, `src`,
+/// `docs`) are passed to the detector alongside the model.
+fn make_detector() -> ScannerDetector {
+    // Realistic trained weights (from the base model), as (index, value)
+    // pairs. Indices that are not listed (6, 9 and 10) stay at 0.0.
+    const TRAINED_WEIGHTS: [(usize, f64); 12] = [
+        (0, 0.155),   // suspicious_path_score
+        (1, 0.039),   // path_depth
+        (2, 0.328),   // has_suspicious_extension
+        (3, -1.376),  // has_cookies
+        (4, -0.196),  // has_referer
+        (5, -0.590),  // has_accept_language
+        (7, -0.254),  // ua_category
+        (8, 0.023),   // method_is_unusual
+        (11, 0.001),  // path_has_traversal
+        (12, 0.155),  // interaction:path*no_cookies
+        (13, 1.051),  // interaction:no_host*no_lang
+        (14, 0.461),  // bias
+    ];
+
+    let mut weights = [0.0f64; NUM_SCANNER_WEIGHTS];
+    for &(index, value) in TRAINED_WEIGHTS.iter() {
+        weights[index] = value;
+    }
+
+    let norm_params = ScannerNormParams {
+        mins: [0.0; NUM_SCANNER_FEATURES],
+        maxs: [1.0; NUM_SCANNER_FEATURES],
+    };
+
+    let model = ScannerModel {
+        weights,
+        threshold: 0.5,
+        norm_params,
+        fragments: [
+            ".env", "wp-admin", "wp-login", "wp-includes",
+            "wp-content", "xmlrpc", "phpinfo", "phpmyadmin",
+            "cgi-bin", ".git", ".htaccess", ".htpasswd",
+            "config.", "admin", "actuator", "telescope",
+            "debug", "shell", "eval-stdin",
+        ]
+        .iter()
+        .map(|fragment| (*fragment).into())
+        .collect(),
+    };
+
+    // One route per configured host prefix; only prefix and backend differ.
+    let routes: Vec<RouteConfig> = [
+        ("admin", "http://127.0.0.1:8080"),
+        ("src", "http://127.0.0.1:8081"),
+        ("docs", "http://127.0.0.1:8082"),
+    ]
+    .iter()
+    .map(|&(host_prefix, backend)| RouteConfig {
+        host_prefix: host_prefix.into(),
+        backend: backend.into(),
+        websocket: false,
+        disable_secure_redirection: false,
+        paths: vec![],
+    })
+    .collect();
+
+    ScannerDetector::new(&model, &routes)
+}
+
+/// Benchmarks `ScannerDetector::check` for a browser-style request.
+///
+/// Input: `GET /blog/hello-world` on the `admin` host (one of the routes
+/// configured in `make_detector`) with cookies, referer and accept-language
+/// all present.
+///
+/// NOTE(review): the last three arguments look like the Accept header, the
+/// User-Agent and a body length — confirm against the `check` signature.
+fn bench_check_normal_browser(c: &mut Criterion) {
+    let detector = make_detector();
+    c.bench_function("scanner::check normal_browser", |b| {
+        b.iter(|| {
+            // Every argument goes through `black_box` so the optimizer cannot
+            // constant-fold the call; the result is returned to `iter`.
+            detector.check(
+                black_box("GET"),
+                black_box("/blog/hello-world"),
+                black_box("admin"),
+                black_box(true),   // has_cookies
+                black_box(true),   // has_referer
+                black_box(true),   // has_accept_language
+                black_box("text/html,application/xhtml+xml"),
+                black_box("Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0"),
+                black_box(0),
+            )
+        })
+    });
+}
+
+/// Benchmarks `ScannerDetector::check` for a `POST /api/v1/data` request on
+/// the configured `src` host with cookies set.
+///
+/// The benchmark label names this the "allowlist:host+cookies" case.
+/// NOTE(review): whether `check` really short-circuits on this input is
+/// decided inside `ScannerDetector` and is not asserted here.
+fn bench_check_allowlist_host_cookies(c: &mut Criterion) {
+    let detector = make_detector();
+    c.bench_function("scanner::check allowlist:host+cookies", |b| {
+        b.iter(|| {
+            // Flags follow the order annotated in bench_check_normal_browser:
+            // has_cookies, has_referer, has_accept_language.
+            detector.check(
+                black_box("POST"),
+                black_box("/api/v1/data"),
+                black_box("src"),
+                black_box(true),
+                black_box(false),
+                black_box(true),
+                black_box("application/json"),
+                black_box("MyApp/2.0"),
+                black_box(256),
+            )
+        })
+    });
+}
+
+/// Benchmarks `ScannerDetector::check` for a `GET /.env` probe sent to a host
+/// (`unknown`) that is not among the routes built in `make_detector`, with no
+/// cookies, referer or accept-language.
+///
+/// The label marks the case as `(block)`, i.e. the expected verdict; the
+/// verdict itself is not asserted by this benchmark.
+fn bench_check_scanner_env_probe(c: &mut Criterion) {
+    let detector = make_detector();
+    c.bench_function("scanner::check env_probe (block)", |b| {
+        b.iter(|| {
+            // `.env` is one of the fragments listed in make_detector.
+            detector.check(
+                black_box("GET"),
+                black_box("/.env"),
+                black_box("unknown"),
+                black_box(false),
+                black_box(false),
+                black_box(false),
+                black_box("*/*"),
+                black_box("curl/7.0"),
+                black_box(0),
+            )
+        })
+    });
+}
+
+/// Benchmarks `ScannerDetector::check` for `GET /wp-admin/install.php` on an
+/// unconfigured host (`random`) with all three header flags false.
+///
+/// The eighth argument is an empty string here.
+/// NOTE(review): presumably this models a missing User-Agent — confirm
+/// against the `check` signature. The label marks the case as `(block)`;
+/// the verdict is not asserted.
+fn bench_check_wordpress_scan(c: &mut Criterion) {
+    let detector = make_detector();
+    c.bench_function("scanner::check wp_scan (block)", |b| {
+        b.iter(|| {
+            // `wp-admin` is one of the fragments listed in make_detector.
+            detector.check(
+                black_box("GET"),
+                black_box("/wp-admin/install.php"),
+                black_box("random"),
+                black_box(false),
+                black_box(false),
+                black_box(false),
+                black_box("*/*"),
+                black_box(""),
+                black_box(0),
+            )
+        })
+    });
+}
+
+/// Benchmarks `ScannerDetector::check` for a path containing `..` segments
+/// (`/etc/../../../passwd`) on an unconfigured host with all three header
+/// flags false.
+///
+/// The label marks the case as `(block)`; the verdict is not asserted.
+fn bench_check_path_traversal(c: &mut Criterion) {
+    let detector = make_detector();
+    c.bench_function("scanner::check path_traversal (block)", |b| {
+        b.iter(|| {
+            // Inputs are wrapped in `black_box` to keep the call opaque to
+            // the optimizer.
+            detector.check(
+                black_box("GET"),
+                black_box("/etc/../../../passwd"),
+                black_box("unknown"),
+                black_box(false),
+                black_box(false),
+                black_box(false),
+                black_box("*/*"),
+                black_box("python-requests/2.28"),
+                black_box(0),
+            )
+        })
+    });
+}
+
+/// Benchmarks `ScannerDetector::check` for a ten-segment path
+/// (`/a/b/c/d/e/f/g/h/i/j`) on an unconfigured host with all three header
+/// flags false.
+///
+/// None of the path segments matches a fragment listed in `make_detector`.
+fn bench_check_deep_path(c: &mut Criterion) {
+    let detector = make_detector();
+    c.bench_function("scanner::check deep_path (10 segments)", |b| {
+        b.iter(|| {
+            // Inputs are wrapped in `black_box` to keep the call opaque to
+            // the optimizer.
+            detector.check(
+                black_box("GET"),
+                black_box("/a/b/c/d/e/f/g/h/i/j"),
+                black_box("unknown"),
+                black_box(false),
+                black_box(false),
+                black_box(false),
+                black_box("*/*"),
+                black_box("Go-http-client/1.1"),
+                black_box(0),
+            )
+        })
+    });
+}
+
+/// Benchmarks `ScannerDetector::check` for a webhook-style request:
+/// `POST /api/webhooks/github` on an unconfigured host, without cookies or
+/// referer but with accept-language present.
+///
+/// The label describes the intent as "model path, allow": the host is
+/// unknown so no allowlist shortcut applies, and the request is expected to
+/// pass. The verdict is not asserted by this benchmark.
+fn bench_check_api_legitimate(c: &mut Criterion) {
+    let detector = make_detector();
+    c.bench_function("scanner::check api_legit (model path, allow)", |b| {
+        b.iter(|| {
+            // Flag order: has_cookies, has_referer, has_accept_language.
+            detector.check(
+                black_box("POST"),
+                black_box("/api/webhooks/github"),
+                black_box("unknown"),    // unknown host, no allowlist shortcut
+                black_box(false),
+                black_box(false),
+                black_box(true),         // has accept-language
+                black_box("application/json"),
+                black_box("GitHub-Hookshot/abc123"),
+                black_box(1024),
+            )
+        })
+    });
+}
+
+/// Benchmarks `features::extract_features` on its own, without a detector.
+///
+/// Three `FxHashSet<u64>` lookup sets are built once outside the timed
+/// closure, each by hashing string bytes with `fx_hash_bytes`:
+/// suspicious path fragments, suspicious extensions, and configured hosts.
+/// The timed call uses the same request shape as the WordPress-scan
+/// benchmark but with a browser-like value in the eighth position.
+fn bench_extract_features(c: &mut Criterion) {
+    // Seven fragments only: a subset of the 19 listed in make_detector.
+    let fragment_hashes: rustc_hash::FxHashSet<u64> = [
+        ".env", "wp-admin", "wp-login", "phpinfo", "phpmyadmin", "cgi-bin", ".git",
+    ]
+    .iter()
+    .map(|f| fx_hash_bytes(f.as_bytes()))
+    .collect();
+    // Extension set comes from the crate's own SUSPICIOUS_EXTENSIONS_LIST.
+    let extension_hashes: rustc_hash::FxHashSet<u64> = features::SUSPICIOUS_EXTENSIONS_LIST
+        .iter()
+        .map(|e| fx_hash_bytes(e.as_bytes()))
+        .collect();
+    // Same host prefixes as the routes built in make_detector.
+    let configured_hosts: rustc_hash::FxHashSet<u64> =
+        ["admin", "src", "docs"].iter().map(|h| fx_hash_bytes(h.as_bytes())).collect();
+
+    c.bench_function("scanner::extract_features", |b| {
+        b.iter(|| {
+            // The first nine arguments mirror those passed to
+            // `detector.check` in the other benchmarks; the three hash sets
+            // follow.
+            features::extract_features(
+                black_box("GET"),
+                black_box("/wp-admin/install.php"),
+                black_box("unknown"),
+                black_box(false),
+                black_box(false),
+                black_box(false),
+                black_box("*/*"),
+                black_box("Mozilla/5.0 Chrome/120"),
+                black_box(0),
+                black_box(&fragment_hashes),
+                black_box(&extension_hashes),
+                black_box(&configured_hosts),
+            )
+        })
+    });
+}
+
+// Collect all eight benchmark functions into the `benches` group and hand the
+// group to `criterion_main!`. A new benchmark must be added to this list or
+// it will not be part of the group.
+criterion_group!(
+    benches,
+    bench_check_normal_browser,
+    bench_check_allowlist_host_cookies,
+    bench_check_scanner_env_probe,
+    bench_check_wordpress_scan,
+    bench_check_path_traversal,
+    bench_check_deep_path,
+    bench_check_api_legitimate,
+    bench_extract_features,
+);
+criterion_main!(benches);
--- a/scripts/convert_csic.py
+++ b/scripts/convert_csic.py
@@ -0,0 +1,289 @@
+#!/usr/bin/env python3
+"""
+Convert CSIC 2010 HTTP dataset files into Sunbeam audit-log JSONL format.
+
+The CSIC 2010 dataset contains raw HTTP/1.1 requests separated by blank lines.
+Label is determined by which file it came from (normal vs anomalous).
+
+Usage:
+    # Download the dataset first:
+    git clone https://github.com/msudol/Web-Application-Attack-Datasets.git /tmp/csic
+
+    # Convert all three files:
+    python3 scripts/convert_csic.py \
+        --normal /tmp/csic/OriginalDataSets/normalTrafficTraining.txt \
+        --normal /tmp/csic/OriginalDataSets/normalTrafficTest.txt \
+        --anomalous /tmp/csic/OriginalDataSets/anomalousTrafficTest.txt \
+        --hosts admin,src,docs,auth,drive,grafana,people,meet,s3,livekit \
+        --output csic_converted.jsonl
+
+    # Merge with production logs:
+    cat logs.jsonl csic_converted.jsonl > combined.jsonl
+
+    # Train:
+    cargo run -- train-scanner --input combined.jsonl --output scanner_model.bin
+"""
+
+import argparse
+import json
+import random
+import sys
+from datetime import datetime, timedelta
+from urllib.parse import urlparse, unquote
+
+
+def parse_csic_file(filepath):
+    """Parse a CSIC 2010 raw HTTP file into individual requests."""
+    requests = []
+    current_lines = []
+
+    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
+        for line in f:
+            stripped = line.rstrip("\r\n")
+            if stripped == "" and current_lines:
+                req = parse_single_request(current_lines)
+                if req:
+                    requests.append(req)
+                current_lines = []
+            else:
+                current_lines.append(stripped)
+
+    # Handle last request if file doesn't end with blank line
+    if current_lines:
+        req = parse_single_request(current_lines)
+        if req:
+            requests.append(req)
+
+    return requests
+
+
+def parse_single_request(lines):
+    """Parse a single HTTP request from its lines into a dict of headers/fields."""
+    if not lines:
+        return None
+
+    # First line: METHOD url HTTP/1.1
+    request_line = lines[0]
+    parts = request_line.split(" ", 2)
+    if len(parts) < 2:
+        return None
+
+    method = parts[0]
+    raw_url = parts[1]
+
+    # Extract path from URL (may be absolute like http://localhost:8080/path)
+    parsed = urlparse(raw_url)
+    path = parsed.path or "/"
+    query = parsed.query or ""
+
+    # Parse headers
+    headers = {}
+    body_start = None
+    for i, line in enumerate(lines[1:], start=1):
+        if line == "":
+            body_start = i + 1
+            break
+        if ":" in line:
+            key, _, value = line.partition(":")
+            headers[key.strip().lower()] = value.strip()
+
+    # Extract body if present
+    body = ""
+    if body_start and body_start < len(lines):
+        body = "\n".join(lines[body_start:])
+
+    content_length = 0
+    if "content-length" in headers:
+        try:
+            content_length = int(headers["content-length"])
+        except ValueError:
+            content_length = len(body)
+    elif body:
+        content_length = len(body)
+
+    return {
+        "method": method,
+        "path": path,
+        "query": query,
+        "user_agent": headers.get("user-agent", "-"),
+        "has_cookies": "cookie" in headers,
+        "content_length": content_length,
+        "referer": headers.get("referer", "-"),
+        "accept_language": headers.get("accept-language", "-"),
+        "accept": headers.get("accept", "*/*"),
+        "host_header": headers.get("host", "localhost:8080"),
+    }
+
+
+def to_audit_jsonl(req, label, configured_hosts, base_time, offset_secs):
+    """Convert a parsed request into our audit log JSONL format."""
+    # Assign a host: normal traffic gets a configured host, attack gets random
+    if label == "normal":
+        host_prefix = random.choice(configured_hosts)
+        status = random.choice([200, 200, 200, 200, 301, 304])
+    else:
+        # 70% unknown host, 30% configured (attacks do hit real hosts)
+        if random.random() < 0.7:
+            host_prefix = random.choice([
+                "unknown", "scanner", "probe", "test",
+                "random-" + str(random.randint(1000, 9999)),
+            ])
+        else:
+            host_prefix = random.choice(configured_hosts)
+        status = random.choice([404, 404, 404, 400, 403, 500])
+
+    host = f"{host_prefix}.sunbeam.pt"
+
+    # Synthesize a client IP
+    ip = f"{random.randint(1,223)}.{random.randint(0,255)}.{random.randint(0,255)}.{random.randint(1,254)}"
+
+    timestamp = (base_time + timedelta(seconds=offset_secs)).strftime(
+        "%Y-%m-%dT%H:%M:%S.%fZ"
+    )
+
+    referer = req["referer"]
+    accept_language = req["accept_language"]
+
+    # For anomalous samples, simulate realistic scanner behavior:
+    # scanners don't carry session cookies, referer, or accept-language.
+    # CSIC attacks all have these because they were generated from a user
+    # session — strip them to match what real scanners look like.
+    if label != "normal":
+        has_cookies = False
+        referer = "-"
+        # 80% drop accept-language (most scanners), 20% keep (sophisticated ones)
+        if random.random() < 0.8:
+            accept_language = "-"
+        # 40% use a scanner-like UA instead of the CSIC browser UA
+        r = random.random()
+        if r < 0.15:
+            user_agent = ""
+        elif r < 0.25:
+            user_agent = "curl/7.68.0"
+        elif r < 0.35:
+            user_agent = "python-requests/2.28.0"
+        elif r < 0.40:
+            user_agent = f"Go-http-client/1.1"
+        else:
+            user_agent = req["user_agent"]
+    else:
+        has_cookies = req["has_cookies"]
+        user_agent = req["user_agent"]
+
+    entry = {
+        "timestamp": timestamp,
+        "level": "INFO",
+        "fields": {
+            "message": "request",
+            "target": "audit",
+            "method": req["method"],
+            "host": host,
+            "path": req["path"],
+            "query": req.get("query", ""),
+            "client_ip": ip,
+            "status": status,
+            "duration_ms": random.randint(1, 50),
+            "content_length": req["content_length"],
+            "user_agent": user_agent,
+            "referer": referer,
+            "accept_language": accept_language,
+            "accept": req["accept"],
+            "has_cookies": has_cookies,
+            "cf_country": "-",
+            "backend": f"{host_prefix}-svc:8080" if label == "normal" else "-",
+            "label": "normal" if label == "normal" else "attack",
+        },
+        "target": "sunbeam_proxy::proxy",
+    }
+
+    return json.dumps(entry, ensure_ascii=False)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert CSIC 2010 dataset to Sunbeam audit-log JSONL"
+    )
+    parser.add_argument(
+        "--normal",
+        action="append",
+        default=[],
+        help="Path to normal traffic file(s). Can be specified multiple times.",
+    )
+    parser.add_argument(
+        "--anomalous",
+        action="append",
+        default=[],
+        help="Path to anomalous traffic file(s). Can be specified multiple times.",
+    )
+    parser.add_argument(
+        "--hosts",
+        default="admin,src,docs,auth,drive,grafana,people,meet,s3,livekit",
+        help="Comma-separated list of configured host prefixes (default: sunbeam.pt subdomains)",
+    )
+    parser.add_argument(
+        "--output",
+        default="-",
+        help="Output JSONL file (default: stdout)",
+    )
+    parser.add_argument(
+        "--shuffle",
+        action="store_true",
+        default=True,
+        help="Shuffle output to interleave normal/attack (default: true)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="Random seed for reproducibility (default: 42)",
+    )
+    args = parser.parse_args()
+
+    if not args.normal and not args.anomalous:
+        parser.error("provide at least one --normal or --anomalous file")
+
+    random.seed(args.seed)
+    configured_hosts = [h.strip() for h in args.hosts.split(",")]
+    base_time = datetime(2026, 3, 1, 0, 0, 0)
+
+    # Parse all files
+    labeled = []
+
+    for path in args.normal:
+        reqs = parse_csic_file(path)
+        print(f"parsed {len(reqs)} normal requests from {path}", file=sys.stderr)
+        for r in reqs:
+            labeled.append((r, "normal"))
+
+    for path in args.anomalous:
+        reqs = parse_csic_file(path)
+        print(f"parsed {len(reqs)} anomalous requests from {path}", file=sys.stderr)
+        for r in reqs:
+            labeled.append((r, "anomalous"))
+
+    if args.shuffle:
+        random.shuffle(labeled)
+
+    print(
+        f"total: {len(labeled)} requests "
+        f"({sum(1 for _, l in labeled if l == 'normal')} normal, "
+        f"{sum(1 for _, l in labeled if l == 'anomalous')} anomalous)",
+        file=sys.stderr,
+    )
+
+    # Write output
+    out = open(args.output, "w") if args.output != "-" else sys.stdout
+    try:
+        for i, (req, label) in enumerate(labeled):
+            line = to_audit_jsonl(req, label, configured_hosts, base_time, i)
+            out.write(line + "\n")
+    finally:
+        if out is not sys.stdout:
+            out.close()
+
+    if args.output != "-":
+        print(f"written to {args.output}", file=sys.stderr)
+
+
+# Run the converter only when this file is executed as a script, not when it
+# is imported as a module.
+if __name__ == "__main__":
+    main()