chore: remove legacy deps (fnntw, rayon) and unused files
- Remove fnntw (KNN) and rayon dependencies, no longer needed with ensemble architecture
- Update burn features to include wgpu and train backends
- Remove dev.toml, models/.gitkeep, scripts/convert_csic.py, and pingora-headless.yaml (superseded by cluster gossip discovery)
- Add .DS_Store to .gitignore

Signed-off-by: Sienna Meridian Satterwhite <sienna@sunbeam.pt>
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -4,8 +4,11 @@ certs/
|
|||||||
*.pem
|
*.pem
|
||||||
*.key
|
*.key
|
||||||
*.crt
|
*.crt
|
||||||
|
*.tmp
|
||||||
|
.DS_Store
|
||||||
|
|
||||||
# Training data and model binaries
|
# Training data and model binaries
|
||||||
*.bin
|
*.bin
|
||||||
*.jsonl
|
*.jsonl
|
||||||
heuristics.toml
|
heuristics.toml
|
||||||
|
**/*artifacts
|
||||||
|
|||||||
107
Cargo.lock
generated
107
Cargo.lock
generated
@@ -2,12 +2,6 @@
|
|||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
version = 4
|
version = 4
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "Inflector"
|
|
||||||
version = "0.11.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "addr2line"
|
name = "addr2line"
|
||||||
version = "0.25.1"
|
version = "0.25.1"
|
||||||
@@ -3187,23 +3181,6 @@ dependencies = [
|
|||||||
"rand_distr",
|
"rand_distr",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "fnntw"
|
|
||||||
version = "0.4.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "8466be0d69b615cc756992651fe2eb11bfbb2cbf945b21a7746844b8293cbfe8"
|
|
||||||
dependencies = [
|
|
||||||
"crossbeam-channel",
|
|
||||||
"likely_stable",
|
|
||||||
"num-format",
|
|
||||||
"ordered-float 3.9.2",
|
|
||||||
"ouroboros 0.15.6",
|
|
||||||
"permutation",
|
|
||||||
"rayon",
|
|
||||||
"sync-unsafe-cell",
|
|
||||||
"thiserror 1.0.69",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fnv"
|
name = "fnv"
|
||||||
version = "1.0.7"
|
version = "1.0.7"
|
||||||
@@ -5076,15 +5053,6 @@ dependencies = [
|
|||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "likely_stable"
|
|
||||||
version = "0.1.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "d61f7017d8abea1fc23ff7f01a8147b2656dea3aeb24d519aab6e2177eaf671c"
|
|
||||||
dependencies = [
|
|
||||||
"rustc_version",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "linux-raw-sys"
|
name = "linux-raw-sys"
|
||||||
version = "0.4.15"
|
version = "0.4.15"
|
||||||
@@ -5796,16 +5764,6 @@ dependencies = [
|
|||||||
"syn 2.0.117",
|
"syn 2.0.117",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "num-format"
|
|
||||||
version = "0.4.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3"
|
|
||||||
dependencies = [
|
|
||||||
"arrayvec",
|
|
||||||
"itoa",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "num-integer"
|
name = "num-integer"
|
||||||
version = "0.1.46"
|
version = "0.1.46"
|
||||||
@@ -6140,16 +6098,6 @@ dependencies = [
|
|||||||
"num-traits",
|
"num-traits",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "ouroboros"
|
|
||||||
version = "0.15.6"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "e1358bd1558bd2a083fed428ffeda486fbfb323e698cdda7794259d592ca72db"
|
|
||||||
dependencies = [
|
|
||||||
"aliasable",
|
|
||||||
"ouroboros_macro 0.15.6",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ouroboros"
|
name = "ouroboros"
|
||||||
version = "0.18.5"
|
version = "0.18.5"
|
||||||
@@ -6157,23 +6105,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "1e0f050db9c44b97a94723127e6be766ac5c340c48f2c4bb3ffa11713744be59"
|
checksum = "1e0f050db9c44b97a94723127e6be766ac5c340c48f2c4bb3ffa11713744be59"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aliasable",
|
"aliasable",
|
||||||
"ouroboros_macro 0.18.5",
|
"ouroboros_macro",
|
||||||
"static_assertions",
|
"static_assertions",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "ouroboros_macro"
|
|
||||||
version = "0.15.6"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "5f7d21ccd03305a674437ee1248f3ab5d4b1db095cf1caf49f1713ddf61956b7"
|
|
||||||
dependencies = [
|
|
||||||
"Inflector",
|
|
||||||
"proc-macro-error",
|
|
||||||
"proc-macro2",
|
|
||||||
"quote",
|
|
||||||
"syn 1.0.109",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ouroboros_macro"
|
name = "ouroboros_macro"
|
||||||
version = "0.18.5"
|
version = "0.18.5"
|
||||||
@@ -6286,12 +6221,6 @@ version = "2.3.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
|
checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "permutation"
|
|
||||||
version = "0.4.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "df202b0b0f5b8e389955afd5f27b007b00fb948162953f1db9c70d2c7e3157d7"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pest"
|
name = "pest"
|
||||||
version = "2.8.6"
|
version = "2.8.6"
|
||||||
@@ -6475,7 +6404,7 @@ dependencies = [
|
|||||||
"nix",
|
"nix",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"openssl-probe 0.1.6",
|
"openssl-probe 0.1.6",
|
||||||
"ouroboros 0.18.5",
|
"ouroboros",
|
||||||
"parking_lot",
|
"parking_lot",
|
||||||
"percent-encoding",
|
"percent-encoding",
|
||||||
"pingora-error",
|
"pingora-error",
|
||||||
@@ -6887,30 +6816,6 @@ dependencies = [
|
|||||||
"toml_edit 0.25.4+spec-1.1.0",
|
"toml_edit 0.25.4+spec-1.1.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "proc-macro-error"
|
|
||||||
version = "1.0.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
|
|
||||||
dependencies = [
|
|
||||||
"proc-macro-error-attr",
|
|
||||||
"proc-macro2",
|
|
||||||
"quote",
|
|
||||||
"syn 1.0.109",
|
|
||||||
"version_check",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "proc-macro-error-attr"
|
|
||||||
version = "1.0.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
|
|
||||||
dependencies = [
|
|
||||||
"proc-macro2",
|
|
||||||
"quote",
|
|
||||||
"version_check",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro-error-attr2"
|
name = "proc-macro-error-attr2"
|
||||||
version = "2.0.0"
|
version = "2.0.0"
|
||||||
@@ -8463,7 +8368,6 @@ dependencies = [
|
|||||||
"criterion",
|
"criterion",
|
||||||
"csv",
|
"csv",
|
||||||
"dns-lookup",
|
"dns-lookup",
|
||||||
"fnntw",
|
|
||||||
"futures",
|
"futures",
|
||||||
"hex",
|
"hex",
|
||||||
"http",
|
"http",
|
||||||
@@ -8484,7 +8388,6 @@ dependencies = [
|
|||||||
"prometheus",
|
"prometheus",
|
||||||
"proptest",
|
"proptest",
|
||||||
"rand 0.9.2",
|
"rand 0.9.2",
|
||||||
"rayon",
|
|
||||||
"regex",
|
"regex",
|
||||||
"reqwest 0.12.28",
|
"reqwest 0.12.28",
|
||||||
"rustc-hash 2.1.1",
|
"rustc-hash 2.1.1",
|
||||||
@@ -8522,12 +8425,6 @@ dependencies = [
|
|||||||
"unicode-ident",
|
"unicode-ident",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "sync-unsafe-cell"
|
|
||||||
version = "0.1.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "8deaecba5382c095cb432cd1e21068dadb144208f057b13720e76bf89749beb4"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sync_wrapper"
|
name = "sync_wrapper"
|
||||||
version = "1.0.2"
|
version = "1.0.2"
|
||||||
|
|||||||
@@ -38,9 +38,8 @@ opentelemetry-otlp = { version = "0.27", features = ["http-proto", "reqwest-c
|
|||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
anyhow = "1"
|
anyhow = "1"
|
||||||
|
|
||||||
# DDoS detection (KNN classifier)
|
# CLI + serialization
|
||||||
clap = { version = "4", features = ["derive"] }
|
clap = { version = "4", features = ["derive"] }
|
||||||
fnntw = "0.4"
|
|
||||||
bincode = "1"
|
bincode = "1"
|
||||||
rustc-hash = "2"
|
rustc-hash = "2"
|
||||||
|
|
||||||
@@ -77,14 +76,13 @@ iroh-gossip = { version = "0.96", features = ["net"] }
|
|||||||
blake3 = "1"
|
blake3 = "1"
|
||||||
hex = "0.4"
|
hex = "0.4"
|
||||||
rand = "0.9"
|
rand = "0.9"
|
||||||
rayon = "1"
|
|
||||||
tempfile = "3"
|
tempfile = "3"
|
||||||
|
|
||||||
# Dataset ingestion (CIC-IDS2017 CSV parsing)
|
# Dataset ingestion (CIC-IDS2017 CSV parsing)
|
||||||
csv = "1"
|
csv = "1"
|
||||||
|
|
||||||
# burn-rs ML framework (training only, behind `training` feature)
|
# burn-rs ML framework (training only, behind `training` feature)
|
||||||
burn = { version = "0.20", features = ["ndarray", "autodiff"], optional = true }
|
burn = { version = "0.20", features = ["wgpu", "autodiff", "train"], optional = true }
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
training = ["burn"]
|
training = ["burn"]
|
||||||
|
|||||||
67
dev.toml
67
dev.toml
@@ -1,67 +0,0 @@
|
|||||||
# Local dev config for running sunbeam-proxy directly on macOS.
|
|
||||||
#
|
|
||||||
# Uses non-privileged ports (8080/8443) and a mkcert cert for localhost.
|
|
||||||
# Certs are generated once with:
|
|
||||||
# mkcert -cert-file certs/tls.crt -key-file certs/tls.key localhost 127.0.0.1
|
|
||||||
#
|
|
||||||
# Run with:
|
|
||||||
# SUNBEAM_CONFIG=dev.toml RUST_LOG=info cargo run
|
|
||||||
#
|
|
||||||
# Then test:
|
|
||||||
# curl -v http://localhost:8080/ # → 301 to https
|
|
||||||
# curl -vk https://localhost:8443/ -H "Host: docs.localhost" # → 502 (backend unreachable, routing works)
|
|
||||||
# curl -vk https://localhost:8443/.well-known/acme-challenge/test # → 404 (no active challenge)
|
|
||||||
|
|
||||||
[listen]
|
|
||||||
http = "0.0.0.0:8080"
|
|
||||||
https = "0.0.0.0:8443"
|
|
||||||
|
|
||||||
[tls]
|
|
||||||
cert_path = "certs/tls.crt"
|
|
||||||
key_path = "certs/tls.key"
|
|
||||||
|
|
||||||
[telemetry]
|
|
||||||
otlp_endpoint = ""
|
|
||||||
|
|
||||||
# Dummy routes that mirror production — backends won't be reachable locally
|
|
||||||
# but routing, TLS termination, and redirect logic are fully exercised.
|
|
||||||
|
|
||||||
[[routes]]
|
|
||||||
host_prefix = "docs"
|
|
||||||
backend = "http://127.0.0.1:9001"
|
|
||||||
websocket = true
|
|
||||||
|
|
||||||
[[routes]]
|
|
||||||
host_prefix = "meet"
|
|
||||||
backend = "http://127.0.0.1:9002"
|
|
||||||
websocket = true
|
|
||||||
|
|
||||||
[[routes]]
|
|
||||||
host_prefix = "drive"
|
|
||||||
backend = "http://127.0.0.1:9003"
|
|
||||||
|
|
||||||
[[routes]]
|
|
||||||
host_prefix = "mail"
|
|
||||||
backend = "http://127.0.0.1:9004"
|
|
||||||
|
|
||||||
[[routes]]
|
|
||||||
host_prefix = "chat"
|
|
||||||
backend = "http://127.0.0.1:9005"
|
|
||||||
websocket = true
|
|
||||||
|
|
||||||
[[routes]]
|
|
||||||
host_prefix = "people"
|
|
||||||
backend = "http://127.0.0.1:9006"
|
|
||||||
|
|
||||||
[[routes]]
|
|
||||||
host_prefix = "src"
|
|
||||||
backend = "http://127.0.0.1:9007"
|
|
||||||
websocket = true
|
|
||||||
|
|
||||||
[[routes]]
|
|
||||||
host_prefix = "auth"
|
|
||||||
backend = "http://127.0.0.1:9008"
|
|
||||||
|
|
||||||
[[routes]]
|
|
||||||
host_prefix = "s3"
|
|
||||||
backend = "http://127.0.0.1:9009"
|
|
||||||
@@ -1,16 +0,0 @@
|
|||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: pingora-headless
|
|
||||||
namespace: ingress
|
|
||||||
labels:
|
|
||||||
app: pingora
|
|
||||||
spec:
|
|
||||||
clusterIP: None
|
|
||||||
selector:
|
|
||||||
app: pingora
|
|
||||||
ports:
|
|
||||||
- name: gossip-udp
|
|
||||||
port: 11204
|
|
||||||
targetPort: 11204
|
|
||||||
protocol: UDP
|
|
||||||
@@ -1,290 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Convert CSIC 2010 HTTP dataset files into Sunbeam audit-log JSONL format.
|
|
||||||
|
|
||||||
The CSIC 2010 dataset contains raw HTTP/1.1 requests separated by blank lines.
|
|
||||||
Label is determined by which file it came from (normal vs anomalous).
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
# Download the dataset first:
|
|
||||||
git clone https://src.sunbeam.pt/studio/csic-dataset.git /tmp/csic
|
|
||||||
|
|
||||||
# Convert all three files:
|
|
||||||
python3 scripts/convert_csic.py \
|
|
||||||
--normal /tmp/csic/OriginalDataSets/normalTrafficTraining.txt \
|
|
||||||
--normal /tmp/csic/OriginalDataSets/normalTrafficTest.txt \
|
|
||||||
--anomalous /tmp/csic/OriginalDataSets/anomalousTrafficTest.txt \
|
|
||||||
--hosts admin,src,docs,auth,drive,grafana,people,meet,s3,livekit \
|
|
||||||
--output csic_converted.jsonl
|
|
||||||
|
|
||||||
# Merge with production logs:
|
|
||||||
cat logs.jsonl csic_converted.jsonl > combined.jsonl
|
|
||||||
|
|
||||||
# Train (or just use --csic flag which does this automatically):
|
|
||||||
cargo run -- train-scanner --input combined.jsonl --output scanner_model.bin
|
|
||||||
# Simpler: cargo run -- train-scanner --input logs.jsonl --output scanner_model.bin --csic
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import random
|
|
||||||
import sys
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
from urllib.parse import urlparse, unquote
|
|
||||||
|
|
||||||
|
|
||||||
def parse_csic_file(filepath):
    """Parse a CSIC 2010 raw HTTP file into individual requests.

    The file is read as UTF-8 (undecodable bytes are replaced) and split into
    chunks of consecutive non-blank lines. Each chunk is handed to
    ``parse_single_request``; chunks it rejects (falsy result) are skipped.

    Args:
        filepath: Path of the raw traffic file to read.

    Returns:
        A list of the parsed request dicts, in file order.
    """
    requests = []
    current_lines = []

    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            # Only the line terminator is removed; other whitespace is kept.
            stripped = line.rstrip("\r\n")
            if stripped:
                current_lines.append(stripped)
                continue

            if not current_lines:
                # Leading or repeated blank line: nothing is buffered, so
                # there is nothing to flush. Buffering the blank line would
                # make it the first line of the next request, which
                # parse_single_request rejects, silently dropping a request.
                continue

            req = parse_single_request(current_lines)
            if req:
                requests.append(req)
            current_lines = []

    # Handle the last request if the file doesn't end with a blank line.
    if current_lines:
        req = parse_single_request(current_lines)
        if req:
            requests.append(req)

    return requests
|
|
||||||
|
|
||||||
|
|
||||||
def parse_single_request(lines):
    """Parse a single HTTP request from its lines into a dict of fields.

    Args:
        lines: The request as a list of strings without line terminators.
            The first non-empty line must be the request line
            (``METHOD url HTTP/1.1``); header lines follow, and an empty
            line, if present, separates the headers from the body.

    Returns:
        A dict with the keys ``method``, ``path``, ``query``, ``user_agent``,
        ``has_cookies``, ``content_length``, ``referer``,
        ``accept_language``, ``accept`` and ``host_header``, or ``None`` when
        the input is empty, has no usable request line, or carries a URL
        that ``urlparse`` refuses to parse.
    """
    if not lines:
        return None

    # Tolerate leading empty lines (e.g. left over from a blank-line
    # separator) so they do not hide an otherwise valid request line.
    first = 0
    while first < len(lines) and lines[first] == "":
        first += 1
    lines = lines[first:]
    if not lines:
        return None

    # First line: METHOD url HTTP/1.1
    parts = lines[0].split(" ", 2)
    if len(parts) < 2:
        return None
    method, raw_url = parts[0], parts[1]

    # Extract path from URL (may be absolute like http://localhost:8080/path).
    # urlparse raises ValueError for malformed authorities (for instance an
    # unbalanced "["); treat such a request as unparseable instead of
    # letting a single bad sample abort the whole conversion.
    try:
        parsed = urlparse(raw_url)
    except ValueError:
        return None
    path = parsed.path or "/"
    query = parsed.query or ""

    # Parse headers; names are lower-cased, later duplicates win.
    headers = {}
    body_start = None
    for i, line in enumerate(lines[1:], start=1):
        if line == "":
            body_start = i + 1
            break
        if ":" in line:
            key, _, value = line.partition(":")
            headers[key.strip().lower()] = value.strip()

    # Extract body if present.
    body = ""
    if body_start and body_start < len(lines):
        body = "\n".join(lines[body_start:])

    # Prefer the declared Content-Length; fall back to the body length when
    # the header is missing or not an integer.
    content_length = 0
    if "content-length" in headers:
        try:
            content_length = int(headers["content-length"])
        except ValueError:
            content_length = len(body)
    elif body:
        content_length = len(body)

    return {
        "method": method,
        "path": path,
        "query": query,
        "user_agent": headers.get("user-agent", "-"),
        "has_cookies": "cookie" in headers,
        "content_length": content_length,
        "referer": headers.get("referer", "-"),
        "accept_language": headers.get("accept-language", "-"),
        "accept": headers.get("accept", "*/*"),
        "host_header": headers.get("host", "localhost:8080"),
    }
|
|
||||||
|
|
||||||
|
|
||||||
def to_audit_jsonl(req, label, configured_hosts, base_time, offset_secs):
    """Render one parsed request as a single audit-log JSON line.

    Args:
        req: Parsed request dict (as produced by ``parse_single_request``).
        label: ``"normal"`` for benign traffic; any other value is treated
            as an attack sample.
        configured_hosts: Host prefixes to pick from for configured hosts.
        base_time: ``datetime`` the synthetic timestamps start from.
        offset_secs: Seconds added to ``base_time`` for this entry.

    Returns:
        The JSON-encoded entry as a string (no trailing newline).
    """
    is_normal = label == "normal"

    # The sequence of draws from ``random`` below is kept stable so a fixed
    # seed keeps producing the same output.
    if is_normal:
        # Normal traffic always targets a configured host.
        host_prefix = random.choice(configured_hosts)
        status_pool = [200, 200, 200, 200, 301, 304]
    elif random.random() < 0.7:
        # 70% of attacks hit a host that is not configured.
        decoy_hosts = [
            "unknown",
            "scanner",
            "probe",
            "test",
            "random-" + str(random.randint(1000, 9999)),
        ]
        host_prefix = random.choice(decoy_hosts)
        status_pool = [404, 404, 404, 400, 403, 500]
    else:
        # The remaining 30% do hit real hosts.
        host_prefix = random.choice(configured_hosts)
        status_pool = [404, 404, 404, 400, 403, 500]
    status = random.choice(status_pool)

    host = f"{host_prefix}.sunbeam.pt"

    # Synthesize a client IP.
    octets = (
        random.randint(1, 223),
        random.randint(0, 255),
        random.randint(0, 255),
        random.randint(1, 254),
    )
    ip = ".".join(str(octet) for octet in octets)

    moment = base_time + timedelta(seconds=offset_secs)
    timestamp = moment.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    referer = req["referer"]
    accept_language = req["accept_language"]

    if is_normal:
        has_cookies = req["has_cookies"]
        user_agent = req["user_agent"]
    else:
        # Attack samples are reshaped to look like scanners: no session
        # cookie, no referer, and usually no accept-language. The CSIC
        # attack requests carry these because they came from a user session.
        has_cookies = False
        referer = "-"
        # 80% drop accept-language, 20% keep it.
        if random.random() < 0.8:
            accept_language = "-"
        # 40% get a scanner-like UA instead of the CSIC browser UA.
        roll = random.random()
        scanner_agents = (
            (0.15, ""),
            (0.25, "curl/7.68.0"),
            (0.35, "python-requests/2.28.0"),
            (0.40, "Go-http-client/1.1"),
        )
        for upper_bound, agent in scanner_agents:
            if roll < upper_bound:
                user_agent = agent
                break
        else:
            user_agent = req["user_agent"]

    fields = {
        "message": "request",
        "target": "audit",
        "method": req["method"],
        "host": host,
        "path": req["path"],
        "query": req.get("query", ""),
        "client_ip": ip,
        "status": status,
        "duration_ms": random.randint(1, 50),
        "content_length": req["content_length"],
        "user_agent": user_agent,
        "referer": referer,
        "accept_language": accept_language,
        "accept": req["accept"],
        "has_cookies": has_cookies,
        "cf_country": "-",
        "backend": f"{host_prefix}-svc:8080" if is_normal else "-",
        "label": "normal" if is_normal else "attack",
    }
    entry = {
        "timestamp": timestamp,
        "level": "INFO",
        "fields": fields,
        "target": "sunbeam_proxy::proxy",
    }

    return json.dumps(entry, ensure_ascii=False)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Command-line entry point: convert CSIC 2010 files to audit-log JSONL.

    Parses the ``--normal`` / ``--anomalous`` input files, optionally
    shuffles the labeled requests, and writes one JSON line per request to
    ``--output`` (stdout when it is ``-``). Progress messages go to stderr.
    The ``random`` module is seeded from ``--seed`` before any request is
    converted.
    """
    parser = argparse.ArgumentParser(
        description="Convert CSIC 2010 dataset to Sunbeam audit-log JSONL"
    )
    parser.add_argument(
        "--normal",
        action="append",
        default=[],
        help="Path to normal traffic file(s). Can be specified multiple times.",
    )
    parser.add_argument(
        "--anomalous",
        action="append",
        default=[],
        help="Path to anomalous traffic file(s). Can be specified multiple times.",
    )
    parser.add_argument(
        "--hosts",
        default="admin,src,docs,auth,drive,grafana,people,meet,s3,livekit",
        help="Comma-separated list of configured host prefixes (default: sunbeam.pt subdomains)",
    )
    parser.add_argument(
        "--output",
        default="-",
        help="Output JSONL file (default: stdout)",
    )
    parser.add_argument(
        "--shuffle",
        dest="shuffle",
        action="store_true",
        default=True,
        help="Shuffle output to interleave normal/attack (default: true)",
    )
    # ``--shuffle`` alone is store_true with default True, so shuffling could
    # never be switched off; this companion flag makes the option usable
    # while keeping the existing default and flag.
    parser.add_argument(
        "--no-shuffle",
        dest="shuffle",
        action="store_false",
        help="Keep requests in input order instead of shuffling",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for reproducibility (default: 42)",
    )
    args = parser.parse_args()

    if not args.normal and not args.anomalous:
        parser.error("provide at least one --normal or --anomalous file")

    random.seed(args.seed)
    configured_hosts = [h.strip() for h in args.hosts.split(",")]
    base_time = datetime(2026, 3, 1, 0, 0, 0)

    # Parse all files: normal inputs first, then anomalous ones.
    labeled = []
    for label, paths in (("normal", args.normal), ("anomalous", args.anomalous)):
        for path in paths:
            reqs = parse_csic_file(path)
            print(f"parsed {len(reqs)} {label} requests from {path}", file=sys.stderr)
            labeled.extend((r, label) for r in reqs)

    if args.shuffle:
        random.shuffle(labeled)

    normal_count = sum(1 for _, l in labeled if l == "normal")
    anomalous_count = sum(1 for _, l in labeled if l == "anomalous")
    print(
        f"total: {len(labeled)} requests "
        f"({normal_count} normal, "
        f"{anomalous_count} anomalous)",
        file=sys.stderr,
    )

    # Write output. Entries are serialized with ensure_ascii=False, so the
    # file is opened explicitly as UTF-8 rather than in the locale encoding,
    # which may be unable to encode non-ASCII characters.
    if args.output != "-":
        out = open(args.output, "w", encoding="utf-8")
    else:
        out = sys.stdout
    try:
        # The entry index doubles as the timestamp offset in seconds.
        for i, (req, label) in enumerate(labeled):
            line = to_audit_jsonl(req, label, configured_hosts, base_time, i)
            out.write(line + "\n")
    finally:
        if out is not sys.stdout:
            out.close()

    if args.output != "-":
        print(f"written to {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()
|
|
||||||
Reference in New Issue
Block a user