chore: remove legacy deps (fnntw, rayon) and unused files

- Remove fnntw (KNN) and rayon dependencies, no longer needed with
  ensemble architecture
- Update burn features to include wgpu and train backends
- Remove dev.toml, models/.gitkeep, scripts/convert_csic.py, and
  pingora-headless.yaml (superseded by cluster gossip discovery)
- Add .DS_Store to .gitignore

Signed-off-by: Sienna Meridian Satterwhite <sienna@sunbeam.pt>
This commit is contained in:
2026-03-10 23:38:21 +00:00
parent 385e9d4c59
commit e9bac0a8fe
7 changed files with 7 additions and 482 deletions

3
.gitignore vendored
View File

@@ -4,8 +4,11 @@ certs/
*.pem
*.key
*.crt
*.tmp
.DS_Store
# Training data and model binaries
*.bin
*.jsonl
heuristics.toml
**/*artifacts

107
Cargo.lock generated
View File

@@ -2,12 +2,6 @@
# It is not intended for manual editing.
version = 4
[[package]]
name = "Inflector"
version = "0.11.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3"
[[package]]
name = "addr2line"
version = "0.25.1"
@@ -3187,23 +3181,6 @@ dependencies = [
"rand_distr",
]
[[package]]
name = "fnntw"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8466be0d69b615cc756992651fe2eb11bfbb2cbf945b21a7746844b8293cbfe8"
dependencies = [
"crossbeam-channel",
"likely_stable",
"num-format",
"ordered-float 3.9.2",
"ouroboros 0.15.6",
"permutation",
"rayon",
"sync-unsafe-cell",
"thiserror 1.0.69",
]
[[package]]
name = "fnv"
version = "1.0.7"
@@ -5076,15 +5053,6 @@ dependencies = [
"libc",
]
[[package]]
name = "likely_stable"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d61f7017d8abea1fc23ff7f01a8147b2656dea3aeb24d519aab6e2177eaf671c"
dependencies = [
"rustc_version",
]
[[package]]
name = "linux-raw-sys"
version = "0.4.15"
@@ -5796,16 +5764,6 @@ dependencies = [
"syn 2.0.117",
]
[[package]]
name = "num-format"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3"
dependencies = [
"arrayvec",
"itoa",
]
[[package]]
name = "num-integer"
version = "0.1.46"
@@ -6140,16 +6098,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "ouroboros"
version = "0.15.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1358bd1558bd2a083fed428ffeda486fbfb323e698cdda7794259d592ca72db"
dependencies = [
"aliasable",
"ouroboros_macro 0.15.6",
]
[[package]]
name = "ouroboros"
version = "0.18.5"
@@ -6157,23 +6105,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e0f050db9c44b97a94723127e6be766ac5c340c48f2c4bb3ffa11713744be59"
dependencies = [
"aliasable",
"ouroboros_macro 0.18.5",
"ouroboros_macro",
"static_assertions",
]
[[package]]
name = "ouroboros_macro"
version = "0.15.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f7d21ccd03305a674437ee1248f3ab5d4b1db095cf1caf49f1713ddf61956b7"
dependencies = [
"Inflector",
"proc-macro-error",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "ouroboros_macro"
version = "0.18.5"
@@ -6286,12 +6221,6 @@ version = "2.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
[[package]]
name = "permutation"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df202b0b0f5b8e389955afd5f27b007b00fb948162953f1db9c70d2c7e3157d7"
[[package]]
name = "pest"
version = "2.8.6"
@@ -6475,7 +6404,7 @@ dependencies = [
"nix",
"once_cell",
"openssl-probe 0.1.6",
"ouroboros 0.18.5",
"ouroboros",
"parking_lot",
"percent-encoding",
"pingora-error",
@@ -6887,30 +6816,6 @@ dependencies = [
"toml_edit 0.25.4+spec-1.1.0",
]
[[package]]
name = "proc-macro-error"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
dependencies = [
"proc-macro-error-attr",
"proc-macro2",
"quote",
"syn 1.0.109",
"version_check",
]
[[package]]
name = "proc-macro-error-attr"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
dependencies = [
"proc-macro2",
"quote",
"version_check",
]
[[package]]
name = "proc-macro-error-attr2"
version = "2.0.0"
@@ -8463,7 +8368,6 @@ dependencies = [
"criterion",
"csv",
"dns-lookup",
"fnntw",
"futures",
"hex",
"http",
@@ -8484,7 +8388,6 @@ dependencies = [
"prometheus",
"proptest",
"rand 0.9.2",
"rayon",
"regex",
"reqwest 0.12.28",
"rustc-hash 2.1.1",
@@ -8522,12 +8425,6 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "sync-unsafe-cell"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8deaecba5382c095cb432cd1e21068dadb144208f057b13720e76bf89749beb4"
[[package]]
name = "sync_wrapper"
version = "1.0.2"

View File

@@ -38,9 +38,8 @@ opentelemetry-otlp = { version = "0.27", features = ["http-proto", "reqwest-c
serde_json = "1"
anyhow = "1"
# DDoS detection (KNN classifier)
# CLI + serialization
clap = { version = "4", features = ["derive"] }
fnntw = "0.4"
bincode = "1"
rustc-hash = "2"
@@ -77,14 +76,13 @@ iroh-gossip = { version = "0.96", features = ["net"] }
blake3 = "1"
hex = "0.4"
rand = "0.9"
rayon = "1"
tempfile = "3"
# Dataset ingestion (CIC-IDS2017 CSV parsing)
csv = "1"
# burn-rs ML framework (training only, behind `training` feature)
burn = { version = "0.20", features = ["ndarray", "autodiff"], optional = true }
burn = { version = "0.20", features = ["wgpu", "autodiff", "train"], optional = true }
[features]
training = ["burn"]

View File

@@ -1,67 +0,0 @@
# Local dev config for running sunbeam-proxy directly on macOS.
#
# Uses non-privileged ports (8080/8443) and a mkcert cert for localhost.
# Certs are generated once with:
# mkcert -cert-file certs/tls.crt -key-file certs/tls.key localhost 127.0.0.1
#
# Run with:
# SUNBEAM_CONFIG=dev.toml RUST_LOG=info cargo run
#
# Then test:
# curl -v http://localhost:8080/ # → 301 to https
# curl -vk https://localhost:8443/ -H "Host: docs.localhost" # → 502 (backend unreachable, routing works)
# curl -vk https://localhost:8443/.well-known/acme-challenge/test # → 404 (no active challenge)
[listen]
http = "0.0.0.0:8080"
https = "0.0.0.0:8443"
[tls]
cert_path = "certs/tls.crt"
key_path = "certs/tls.key"
[telemetry]
otlp_endpoint = ""
# Dummy routes that mirror production — backends won't be reachable locally
# but routing, TLS termination, and redirect logic are fully exercised.
[[routes]]
host_prefix = "docs"
backend = "http://127.0.0.1:9001"
websocket = true
[[routes]]
host_prefix = "meet"
backend = "http://127.0.0.1:9002"
websocket = true
[[routes]]
host_prefix = "drive"
backend = "http://127.0.0.1:9003"
[[routes]]
host_prefix = "mail"
backend = "http://127.0.0.1:9004"
[[routes]]
host_prefix = "chat"
backend = "http://127.0.0.1:9005"
websocket = true
[[routes]]
host_prefix = "people"
backend = "http://127.0.0.1:9006"
[[routes]]
host_prefix = "src"
backend = "http://127.0.0.1:9007"
websocket = true
[[routes]]
host_prefix = "auth"
backend = "http://127.0.0.1:9008"
[[routes]]
host_prefix = "s3"
backend = "http://127.0.0.1:9009"

View File

@@ -1,16 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: pingora-headless
namespace: ingress
labels:
app: pingora
spec:
clusterIP: None
selector:
app: pingora
ports:
- name: gossip-udp
port: 11204
targetPort: 11204
protocol: UDP

View File

View File

@@ -1,290 +0,0 @@
#!/usr/bin/env python3
"""
Convert CSIC 2010 HTTP dataset files into Sunbeam audit-log JSONL format.
The CSIC 2010 dataset contains raw HTTP/1.1 requests separated by blank lines.
Label is determined by which file it came from (normal vs anomalous).
Usage:
# Download the dataset first:
git clone https://src.sunbeam.pt/studio/csic-dataset.git /tmp/csic
# Convert all three files:
python3 scripts/convert_csic.py \
--normal /tmp/csic/OriginalDataSets/normalTrafficTraining.txt \
--normal /tmp/csic/OriginalDataSets/normalTrafficTest.txt \
--anomalous /tmp/csic/OriginalDataSets/anomalousTrafficTest.txt \
--hosts admin,src,docs,auth,drive,grafana,people,meet,s3,livekit \
--output csic_converted.jsonl
# Merge with production logs:
cat logs.jsonl csic_converted.jsonl > combined.jsonl
# Train (or just use --csic flag which does this automatically):
cargo run -- train-scanner --input combined.jsonl --output scanner_model.bin
# Simpler: cargo run -- train-scanner --input logs.jsonl --output scanner_model.bin --csic
"""
import argparse
import json
import random
import sys
from datetime import datetime, timedelta
from urllib.parse import urlparse, unquote
def parse_csic_file(filepath):
    """Parse a CSIC 2010 raw HTTP file into a list of request dicts.

    Requests in the dataset are separated by blank lines, but a POST
    request *also* contains a blank line between its headers and body.
    The original splitter cut every request at the first blank line,
    losing bodies and misparsing body lines as new requests.  We now
    watch for a Content-Length header announcing a non-empty body and
    keep the header/body separator (and the body lines after it) with
    the same request.

    Returns a list of dicts as produced by parse_single_request().
    """
    requests = []
    current_lines = []
    expecting_body = False  # saw Content-Length > 0, body not yet attached
    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            stripped = line.rstrip("\r\n")
            if stripped == "":
                if not current_lines:
                    continue  # collapse runs of separator blank lines
                if expecting_body:
                    # Header/body separator: keep an empty line so
                    # parse_single_request can locate the body.
                    current_lines.append("")
                    expecting_body = False
                    continue
                req = parse_single_request(current_lines)
                if req:
                    requests.append(req)
                current_lines = []
            else:
                current_lines.append(stripped)
                if stripped.lower().startswith("content-length:"):
                    # int() tolerates the leading space after the colon.
                    try:
                        expecting_body = int(stripped.split(":", 1)[1]) > 0
                    except ValueError:
                        expecting_body = False
    # Handle last request if file doesn't end with a blank line
    if current_lines:
        req = parse_single_request(current_lines)
        if req:
            requests.append(req)
    return requests


def parse_single_request(lines):
    """Parse one raw HTTP request (list of stripped lines) into a dict.

    Returns None for an empty or malformed request line.
    """
    if not lines:
        return None
    # First line: METHOD url HTTP/1.1
    parts = lines[0].split(" ", 2)
    if len(parts) < 2:
        return None
    method = parts[0]
    raw_url = parts[1]
    # URL may be absolute (http://localhost:8080/path); keep path + query.
    parsed = urlparse(raw_url)
    path = parsed.path or "/"
    query = parsed.query or ""
    # Headers run until the first blank line; the body (if any) follows it.
    headers = {}
    body_start = None
    for i, line in enumerate(lines[1:], start=1):
        if line == "":
            body_start = i + 1
            break
        if ":" in line:
            key, _, value = line.partition(":")
            headers[key.strip().lower()] = value.strip()
    # Extract body if present
    body = ""
    if body_start and body_start < len(lines):
        body = "\n".join(lines[body_start:])
    # Prefer the declared Content-Length; fall back to the observed body.
    content_length = 0
    if "content-length" in headers:
        try:
            content_length = int(headers["content-length"])
        except ValueError:
            content_length = len(body)
    elif body:
        content_length = len(body)
    return {
        "method": method,
        "path": path,
        "query": query,
        "user_agent": headers.get("user-agent", "-"),
        "has_cookies": "cookie" in headers,
        "content_length": content_length,
        "referer": headers.get("referer", "-"),
        "accept_language": headers.get("accept-language", "-"),
        "accept": headers.get("accept", "*/*"),
        "host_header": headers.get("host", "localhost:8080"),
    }
def to_audit_jsonl(req, label, configured_hosts, base_time, offset_secs):
    """Convert a parsed request into one line of Sunbeam audit-log JSONL.

    req              -- dict from parse_single_request()
    label            -- "normal" or anything else (treated as attack)
    configured_hosts -- host prefixes to assign to normal traffic
    base_time        -- datetime the synthetic log stream starts at
    offset_secs      -- seconds after base_time for this entry

    Uses the module-level `random` state; callers seed it for
    reproducibility.  Returns a JSON string (no trailing newline).
    """
    # Assign a host: normal traffic gets a configured host, attack gets random
    if label == "normal":
        host_prefix = random.choice(configured_hosts)
        status = random.choice([200, 200, 200, 200, 301, 304])
    else:
        # 70% unknown host, 30% configured (attacks do hit real hosts)
        if random.random() < 0.7:
            host_prefix = random.choice([
                "unknown", "scanner", "probe", "test",
                "random-" + str(random.randint(1000, 9999)),
            ])
        else:
            host_prefix = random.choice(configured_hosts)
        status = random.choice([404, 404, 404, 400, 403, 500])
    host = f"{host_prefix}.sunbeam.pt"
    # Synthesize a client IP
    ip = f"{random.randint(1,223)}.{random.randint(0,255)}.{random.randint(0,255)}.{random.randint(1,254)}"
    timestamp = (base_time + timedelta(seconds=offset_secs)).strftime(
        "%Y-%m-%dT%H:%M:%S.%fZ"
    )
    referer = req["referer"]
    accept_language = req["accept_language"]
    # For anomalous samples, simulate realistic scanner behavior:
    # scanners don't carry session cookies, referer, or accept-language.
    # CSIC attacks all have these because they were generated from a user
    # session — strip them to match what real scanners look like.
    if label != "normal":
        has_cookies = False
        referer = "-"
        # 80% drop accept-language (most scanners), 20% keep (sophisticated ones)
        if random.random() < 0.8:
            accept_language = "-"
        # 40% use a scanner-like UA instead of the CSIC browser UA
        r = random.random()
        if r < 0.15:
            user_agent = ""
        elif r < 0.25:
            user_agent = "curl/7.68.0"
        elif r < 0.35:
            user_agent = "python-requests/2.28.0"
        elif r < 0.40:
            user_agent = "Go-http-client/1.1"
        else:
            user_agent = req["user_agent"]
    else:
        has_cookies = req["has_cookies"]
        user_agent = req["user_agent"]
    entry = {
        "timestamp": timestamp,
        "level": "INFO",
        "fields": {
            "message": "request",
            "target": "audit",
            "method": req["method"],
            "host": host,
            "path": req["path"],
            "query": req.get("query", ""),
            "client_ip": ip,
            "status": status,
            "duration_ms": random.randint(1, 50),
            "content_length": req["content_length"],
            "user_agent": user_agent,
            "referer": referer,
            "accept_language": accept_language,
            "accept": req["accept"],
            "has_cookies": has_cookies,
            "cf_country": "-",
            # Attacks get no backend: they were (nominally) not routed.
            "backend": f"{host_prefix}-svc:8080" if label == "normal" else "-",
            "label": "normal" if label == "normal" else "attack",
        },
        "target": "sunbeam_proxy::proxy",
    }
    return json.dumps(entry, ensure_ascii=False)
def main():
    """CLI entry point: parse CSIC files and emit audit-log JSONL.

    Reads --normal/--anomalous raw HTTP files, labels and (optionally)
    shuffles the requests, and writes one JSON object per line to
    --output (or stdout).  Progress goes to stderr so stdout stays
    clean for piping.
    """
    parser = argparse.ArgumentParser(
        description="Convert CSIC 2010 dataset to Sunbeam audit-log JSONL"
    )
    parser.add_argument(
        "--normal",
        action="append",
        default=[],
        help="Path to normal traffic file(s). Can be specified multiple times.",
    )
    parser.add_argument(
        "--anomalous",
        action="append",
        default=[],
        help="Path to anomalous traffic file(s). Can be specified multiple times.",
    )
    parser.add_argument(
        "--hosts",
        default="admin,src,docs,auth,drive,grafana,people,meet,s3,livekit",
        help="Comma-separated list of configured host prefixes (default: sunbeam.pt subdomains)",
    )
    parser.add_argument(
        "--output",
        default="-",
        help="Output JSONL file (default: stdout)",
    )
    # Shuffling defaults to ON.  The old `--shuffle` was declared with
    # action="store_true", default=True — a no-op with no way to turn
    # shuffling off.  Keep --shuffle for compatibility and add
    # --no-shuffle to actually disable it.
    parser.add_argument(
        "--shuffle",
        action="store_true",
        default=True,
        help="Shuffle output to interleave normal/attack (default: true)",
    )
    parser.add_argument(
        "--no-shuffle",
        dest="shuffle",
        action="store_false",
        help="Disable shuffling; keep file order",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for reproducibility (default: 42)",
    )
    args = parser.parse_args()
    if not args.normal and not args.anomalous:
        parser.error("provide at least one --normal or --anomalous file")
    random.seed(args.seed)
    configured_hosts = [h.strip() for h in args.hosts.split(",")]
    base_time = datetime(2026, 3, 1, 0, 0, 0)
    # Parse all files, tagging each request with its source label.
    labeled = []
    for path in args.normal:
        reqs = parse_csic_file(path)
        print(f"parsed {len(reqs)} normal requests from {path}", file=sys.stderr)
        labeled.extend((r, "normal") for r in reqs)
    for path in args.anomalous:
        reqs = parse_csic_file(path)
        print(f"parsed {len(reqs)} anomalous requests from {path}", file=sys.stderr)
        labeled.extend((r, "anomalous") for r in reqs)
    if args.shuffle:
        random.shuffle(labeled)
    print(
        f"total: {len(labeled)} requests "
        f"({sum(1 for _, l in labeled if l == 'normal')} normal, "
        f"{sum(1 for _, l in labeled if l == 'anomalous')} anomalous)",
        file=sys.stderr,
    )
    # Write output.  Explicit utf-8: entries are serialized with
    # ensure_ascii=False, which would crash under a non-UTF-8 locale.
    out = open(args.output, "w", encoding="utf-8") if args.output != "-" else sys.stdout
    try:
        for i, (req, label) in enumerate(labeled):
            line = to_audit_jsonl(req, label, configured_hosts, base_time, i)
            out.write(line + "\n")
    finally:
        if out is not sys.stdout:
            out.close()
    if args.output != "-":
        print(f"written to {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()