From e9bac0a8fe87eac2db4fdfe221dcbad648971b76 Mon Sep 17 00:00:00 2001 From: Sienna Meridian Satterwhite Date: Tue, 10 Mar 2026 23:38:21 +0000 Subject: [PATCH] chore: remove legacy deps (fnntw, rayon) and unused files - Remove fnntw (KNN) and rayon dependencies, no longer needed with ensemble architecture - Update burn features to include wgpu and train backends - Remove dev.toml, models/.gitkeep, scripts/convert_csic.py, and pingora-headless.yaml (superseded by cluster gossip discovery) - Add .DS_Store to .gitignore Signed-off-by: Sienna Meridian Satterwhite --- .gitignore | 3 + Cargo.lock | 107 +------ Cargo.toml | 6 +- dev.toml | 67 ---- .../base/ingress/pingora-headless.yaml | 16 - models/.gitkeep | 0 scripts/convert_csic.py | 290 ------------------ 7 files changed, 7 insertions(+), 482 deletions(-) delete mode 100644 dev.toml delete mode 100644 infrastructure/base/ingress/pingora-headless.yaml delete mode 100644 models/.gitkeep delete mode 100755 scripts/convert_csic.py diff --git a/.gitignore b/.gitignore index fcd1646..402d994 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,11 @@ certs/ *.pem *.key *.crt +*.tmp +.DS_Store # Training data and model binaries *.bin *.jsonl heuristics.toml +**/*artifacts diff --git a/Cargo.lock b/Cargo.lock index 8cccf71..7b3062f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,12 +2,6 @@ # It is not intended for manual editing. version = 4 -[[package]] -name = "Inflector" -version = "0.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" - [[package]] name = "addr2line" version = "0.25.1" @@ -3187,23 +3181,6 @@ dependencies = [ "rand_distr", ] -[[package]] -name = "fnntw" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8466be0d69b615cc756992651fe2eb11bfbb2cbf945b21a7746844b8293cbfe8" -dependencies = [ - "crossbeam-channel", - "likely_stable", - "num-format", - "ordered-float 3.9.2", - "ouroboros 0.15.6", - "permutation", - "rayon", - "sync-unsafe-cell", - "thiserror 1.0.69", -] - [[package]] name = "fnv" version = "1.0.7" @@ -5076,15 +5053,6 @@ dependencies = [ "libc", ] -[[package]] -name = "likely_stable" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61f7017d8abea1fc23ff7f01a8147b2656dea3aeb24d519aab6e2177eaf671c" -dependencies = [ - "rustc_version", -] - [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -5796,16 +5764,6 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "num-format" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" -dependencies = [ - "arrayvec", - "itoa", -] - [[package]] name = "num-integer" version = "0.1.46" @@ -6140,16 +6098,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "ouroboros" -version = "0.15.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1358bd1558bd2a083fed428ffeda486fbfb323e698cdda7794259d592ca72db" -dependencies = [ - "aliasable", - "ouroboros_macro 0.15.6", -] - [[package]] name = "ouroboros" version = "0.18.5" @@ -6157,23 +6105,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e0f050db9c44b97a94723127e6be766ac5c340c48f2c4bb3ffa11713744be59" dependencies = [ "aliasable", - "ouroboros_macro 0.18.5", + "ouroboros_macro", "static_assertions", ] -[[package]] -name = "ouroboros_macro" -version = "0.15.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f7d21ccd03305a674437ee1248f3ab5d4b1db095cf1caf49f1713ddf61956b7" -dependencies = [ - "Inflector", - "proc-macro-error", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "ouroboros_macro" version = "0.18.5" @@ -6286,12 +6221,6 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" -[[package]] -name = "permutation" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df202b0b0f5b8e389955afd5f27b007b00fb948162953f1db9c70d2c7e3157d7" - [[package]] name = "pest" version = "2.8.6" @@ -6475,7 +6404,7 @@ dependencies = [ "nix", "once_cell", "openssl-probe 0.1.6", - "ouroboros 0.18.5", + "ouroboros", "parking_lot", "percent-encoding", "pingora-error", @@ -6887,30 +6816,6 @@ dependencies = [ "toml_edit 0.25.4+spec-1.1.0", ] -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn 1.0.109", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", -] - [[package]] name = "proc-macro-error-attr2" version = "2.0.0" @@ -8463,7 +8368,6 @@ dependencies = [ "criterion", "csv", "dns-lookup", - "fnntw", "futures", "hex", "http", @@ -8484,7 +8388,6 @@ dependencies = [ "prometheus", "proptest", "rand 0.9.2", - "rayon", "regex", "reqwest 0.12.28", "rustc-hash 2.1.1", @@ -8522,12 +8425,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "sync-unsafe-cell" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8deaecba5382c095cb432cd1e21068dadb144208f057b13720e76bf89749beb4" - [[package]] name = "sync_wrapper" version = "1.0.2" diff --git a/Cargo.toml b/Cargo.toml index 58b64e2..176a6e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,9 +38,8 @@ opentelemetry-otlp = { version = "0.27", features = ["http-proto", "reqwest-c serde_json = "1" anyhow = "1" -# DDoS detection (KNN classifier) +# CLI + serialization clap = { version = "4", features = ["derive"] } -fnntw = "0.4" bincode = "1" rustc-hash = "2" @@ -77,14 +76,13 @@ iroh-gossip = { version = "0.96", features = ["net"] } blake3 = "1" hex = "0.4" rand = "0.9" -rayon = "1" tempfile = "3" # Dataset ingestion (CIC-IDS2017 CSV parsing) csv = "1" # burn-rs ML framework (training only, behind `training` feature) -burn = { version = "0.20", features = ["ndarray", "autodiff"], optional = true } +burn = { version = "0.20", features = ["wgpu", "autodiff", "train"], optional = true } [features] training = ["burn"] diff --git a/dev.toml b/dev.toml deleted file mode 100644 index 7e8e947..0000000 --- a/dev.toml +++ /dev/null @@ -1,67 +0,0 @@ -# Local dev config for running sunbeam-proxy directly on macOS. -# -# Uses non-privileged ports (8080/8443) and a mkcert cert for localhost. -# Certs are generated once with: -# mkcert -cert-file certs/tls.crt -key-file certs/tls.key localhost 127.0.0.1 -# -# Run with: -# SUNBEAM_CONFIG=dev.toml RUST_LOG=info cargo run -# -# Then test: -# curl -v http://localhost:8080/ # → 301 to https -# curl -vk https://localhost:8443/ -H "Host: docs.localhost" # → 502 (backend unreachable, routing works) -# curl -vk https://localhost:8443/.well-known/acme-challenge/test # → 404 (no active challenge) - -[listen] -http = "0.0.0.0:8080" -https = "0.0.0.0:8443" - -[tls] -cert_path = "certs/tls.crt" -key_path = "certs/tls.key" - -[telemetry] -otlp_endpoint = "" - -# Dummy routes that mirror production — backends won't be reachable locally -# but routing, TLS termination, and redirect logic are fully exercised. - -[[routes]] -host_prefix = "docs" -backend = "http://127.0.0.1:9001" -websocket = true - -[[routes]] -host_prefix = "meet" -backend = "http://127.0.0.1:9002" -websocket = true - -[[routes]] -host_prefix = "drive" -backend = "http://127.0.0.1:9003" - -[[routes]] -host_prefix = "mail" -backend = "http://127.0.0.1:9004" - -[[routes]] -host_prefix = "chat" -backend = "http://127.0.0.1:9005" -websocket = true - -[[routes]] -host_prefix = "people" -backend = "http://127.0.0.1:9006" - -[[routes]] -host_prefix = "src" -backend = "http://127.0.0.1:9007" -websocket = true - -[[routes]] -host_prefix = "auth" -backend = "http://127.0.0.1:9008" - -[[routes]] -host_prefix = "s3" -backend = "http://127.0.0.1:9009" diff --git a/infrastructure/base/ingress/pingora-headless.yaml b/infrastructure/base/ingress/pingora-headless.yaml deleted file mode 100644 index fe7a66e..0000000 --- a/infrastructure/base/ingress/pingora-headless.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: pingora-headless - namespace: ingress - labels: - app: pingora -spec: - clusterIP: None - selector: - app: pingora - ports: - - name: gossip-udp - port: 11204 - targetPort: 11204 - protocol: UDP diff --git a/models/.gitkeep b/models/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/convert_csic.py b/scripts/convert_csic.py deleted file mode 100755 index 278d7a3..0000000 --- a/scripts/convert_csic.py +++ /dev/null @@ -1,290 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert CSIC 2010 HTTP dataset files into Sunbeam audit-log JSONL format. - -The CSIC 2010 dataset contains raw HTTP/1.1 requests separated by blank lines. -Label is determined by which file it came from (normal vs anomalous). - -Usage: - # Download the dataset first: - git clone https://src.sunbeam.pt/studio/csic-dataset.git /tmp/csic - - # Convert all three files: - python3 scripts/convert_csic.py \ - --normal /tmp/csic/OriginalDataSets/normalTrafficTraining.txt \ - --normal /tmp/csic/OriginalDataSets/normalTrafficTest.txt \ - --anomalous /tmp/csic/OriginalDataSets/anomalousTrafficTest.txt \ - --hosts admin,src,docs,auth,drive,grafana,people,meet,s3,livekit \ - --output csic_converted.jsonl - - # Merge with production logs: - cat logs.jsonl csic_converted.jsonl > combined.jsonl - - # Train (or just use --csic flag which does this automatically): - cargo run -- train-scanner --input combined.jsonl --output scanner_model.bin - # Simpler: cargo run -- train-scanner --input logs.jsonl --output scanner_model.bin --csic -""" - -import argparse -import json -import random -import sys -from datetime import datetime, timedelta -from urllib.parse import urlparse, unquote - - -def parse_csic_file(filepath): - """Parse a CSIC 2010 raw HTTP file into individual requests.""" - requests = [] - current_lines = [] - - with open(filepath, "r", encoding="utf-8", errors="replace") as f: - for line in f: - stripped = line.rstrip("\r\n") - if stripped == "" and current_lines: - req = parse_single_request(current_lines) - if req: - requests.append(req) - current_lines = [] - else: - current_lines.append(stripped) - - # Handle last request if file doesn't end with blank line - if current_lines: - req = parse_single_request(current_lines) - if req: - requests.append(req) - - return requests - - -def parse_single_request(lines): - """Parse a single HTTP request from its lines into a dict of headers/fields.""" - if not lines: - return None - - # First line: METHOD url HTTP/1.1 - request_line = lines[0] - parts = request_line.split(" ", 2) - if len(parts) < 2: - return None - - method = parts[0] - raw_url = parts[1] - - # Extract path from URL (may be absolute like http://localhost:8080/path) - parsed = urlparse(raw_url) - path = parsed.path or "/" - query = parsed.query or "" - - # Parse headers - headers = {} - body_start = None - for i, line in enumerate(lines[1:], start=1): - if line == "": - body_start = i + 1 - break - if ":" in line: - key, _, value = line.partition(":") - headers[key.strip().lower()] = value.strip() - - # Extract body if present - body = "" - if body_start and body_start < len(lines): - body = "\n".join(lines[body_start:]) - - content_length = 0 - if "content-length" in headers: - try: - content_length = int(headers["content-length"]) - except ValueError: - content_length = len(body) - elif body: - content_length = len(body) - - return { - "method": method, - "path": path, - "query": query, - "user_agent": headers.get("user-agent", "-"), - "has_cookies": "cookie" in headers, - "content_length": content_length, - "referer": headers.get("referer", "-"), - "accept_language": headers.get("accept-language", "-"), - "accept": headers.get("accept", "*/*"), - "host_header": headers.get("host", "localhost:8080"), - } - - -def to_audit_jsonl(req, label, configured_hosts, base_time, offset_secs): - """Convert a parsed request into our audit log JSONL format.""" - # Assign a host: normal traffic gets a configured host, attack gets random - if label == "normal": - host_prefix = random.choice(configured_hosts) - status = random.choice([200, 200, 200, 200, 301, 304]) - else: - # 70% unknown host, 30% configured (attacks do hit real hosts) - if random.random() < 0.7: - host_prefix = random.choice([ - "unknown", "scanner", "probe", "test", - "random-" + str(random.randint(1000, 9999)), - ]) - else: - host_prefix = random.choice(configured_hosts) - status = random.choice([404, 404, 404, 400, 403, 500]) - - host = f"{host_prefix}.sunbeam.pt" - - # Synthesize a client IP - ip = f"{random.randint(1,223)}.{random.randint(0,255)}.{random.randint(0,255)}.{random.randint(1,254)}" - - timestamp = (base_time + timedelta(seconds=offset_secs)).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ" - ) - - referer = req["referer"] - accept_language = req["accept_language"] - - # For anomalous samples, simulate realistic scanner behavior: - # scanners don't carry session cookies, referer, or accept-language. - # CSIC attacks all have these because they were generated from a user - # session — strip them to match what real scanners look like. - if label != "normal": - has_cookies = False - referer = "-" - # 80% drop accept-language (most scanners), 20% keep (sophisticated ones) - if random.random() < 0.8: - accept_language = "-" - # 40% use a scanner-like UA instead of the CSIC browser UA - r = random.random() - if r < 0.15: - user_agent = "" - elif r < 0.25: - user_agent = "curl/7.68.0" - elif r < 0.35: - user_agent = "python-requests/2.28.0" - elif r < 0.40: - user_agent = f"Go-http-client/1.1" - else: - user_agent = req["user_agent"] - else: - has_cookies = req["has_cookies"] - user_agent = req["user_agent"] - - entry = { - "timestamp": timestamp, - "level": "INFO", - "fields": { - "message": "request", - "target": "audit", - "method": req["method"], - "host": host, - "path": req["path"], - "query": req.get("query", ""), - "client_ip": ip, - "status": status, - "duration_ms": random.randint(1, 50), - "content_length": req["content_length"], - "user_agent": user_agent, - "referer": referer, - "accept_language": accept_language, - "accept": req["accept"], - "has_cookies": has_cookies, - "cf_country": "-", - "backend": f"{host_prefix}-svc:8080" if label == "normal" else "-", - "label": "normal" if label == "normal" else "attack", - }, - "target": "sunbeam_proxy::proxy", - } - - return json.dumps(entry, ensure_ascii=False) - - -def main(): - parser = argparse.ArgumentParser( - description="Convert CSIC 2010 dataset to Sunbeam audit-log JSONL" - ) - parser.add_argument( - "--normal", - action="append", - default=[], - help="Path to normal traffic file(s). Can be specified multiple times.", - ) - parser.add_argument( - "--anomalous", - action="append", - default=[], - help="Path to anomalous traffic file(s). Can be specified multiple times.", - ) - parser.add_argument( - "--hosts", - default="admin,src,docs,auth,drive,grafana,people,meet,s3,livekit", - help="Comma-separated list of configured host prefixes (default: sunbeam.pt subdomains)", - ) - parser.add_argument( - "--output", - default="-", - help="Output JSONL file (default: stdout)", - ) - parser.add_argument( - "--shuffle", - action="store_true", - default=True, - help="Shuffle output to interleave normal/attack (default: true)", - ) - parser.add_argument( - "--seed", - type=int, - default=42, - help="Random seed for reproducibility (default: 42)", - ) - args = parser.parse_args() - - if not args.normal and not args.anomalous: - parser.error("provide at least one --normal or --anomalous file") - - random.seed(args.seed) - configured_hosts = [h.strip() for h in args.hosts.split(",")] - base_time = datetime(2026, 3, 1, 0, 0, 0) - - # Parse all files - labeled = [] - - for path in args.normal: - reqs = parse_csic_file(path) - print(f"parsed {len(reqs)} normal requests from {path}", file=sys.stderr) - for r in reqs: - labeled.append((r, "normal")) - - for path in args.anomalous: - reqs = parse_csic_file(path) - print(f"parsed {len(reqs)} anomalous requests from {path}", file=sys.stderr) - for r in reqs: - labeled.append((r, "anomalous")) - - if args.shuffle: - random.shuffle(labeled) - - print( - f"total: {len(labeled)} requests " - f"({sum(1 for _, l in labeled if l == 'normal')} normal, " - f"{sum(1 for _, l in labeled if l == 'anomalous')} anomalous)", - file=sys.stderr, - ) - - # Write output - out = open(args.output, "w") if args.output != "-" else sys.stdout - try: - for i, (req, label) in enumerate(labeled): - line = to_audit_jsonl(req, label, configured_hosts, base_time, i) - out.write(line + "\n") - finally: - if out is not sys.stdout: - out.close() - - if args.output != "-": - print(f"written to {args.output}", file=sys.stderr) - - -if __name__ == "__main__": - main()