Files
proxy/src/dataset/download.rs
Sienna Meridian Satterwhite 1f4366566d feat(dataset): add dataset preparation with auto-download and heuristic labeling
Unified prepare-dataset pipeline that automatically downloads and caches
upstream datasets (CSIC 2010, CIC-IDS2017), applies heuristic auto-labeling
to unlabeled production logs, generates synthetic samples for both models,
and serializes everything as a bincode DatasetManifest. Includes OWASP
ModSec parser, CIC-IDS2017 timing profile extractor, and synthetic data
generators with configurable distributions.

Signed-off-by: Sienna Meridian Satterwhite <sienna@sunbeam.pt>
2026-03-10 23:38:21 +00:00

120 lines
3.5 KiB
Rust

//! Download and cache upstream datasets for training.
//!
//! Cached under `~/.cache/sunbeam/<dataset>/`. Files are only downloaded
//! once; subsequent runs reuse the cached copy.
use anyhow::{Context, Result};
use std::path::PathBuf;
/// Base cache directory for all sunbeam datasets.
/// Base cache directory for all sunbeam datasets.
///
/// Resolution order: `$XDG_CACHE_HOME` if set, otherwise `$HOME/.cache`,
/// falling back to `/tmp/.cache` when neither variable is available.
fn cache_base() -> PathBuf {
    let cache_root = match std::env::var("XDG_CACHE_HOME") {
        Ok(xdg) => PathBuf::from(xdg),
        Err(_) => {
            // No XDG override: use the conventional ~/.cache location.
            let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
            PathBuf::from(home).join(".cache")
        }
    };
    cache_root.join("sunbeam")
}
// --- CIC-IDS2017 ---
/// Only the Friday DDoS file — contains DDoS Hulk, Slowloris, slowhttptest, GoldenEye.
/// (The full CIC-IDS2017 set has one CSV per capture day; we need just this one.)
const CICIDS_FILE: &str = "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv";
/// Hugging Face mirror (public, no auth required).
/// `resolve/main` serves the raw file bytes rather than the HTML viewer page.
const CICIDS_BASE_URL: &str =
"https://huggingface.co/datasets/c01dsnap/CIC-IDS2017/resolve/main";
/// Cache directory holding the CIC-IDS2017 CSV file.
fn cicids_cache_dir() -> PathBuf {
    let mut dir = cache_base();
    dir.push("cicids");
    dir
}
/// Return the path to the cached CIC-IDS2017 DDoS CSV, or `None` if not downloaded.
pub fn cicids_cached_path() -> Option<PathBuf> {
    let candidate = cicids_cache_dir().join(CICIDS_FILE);
    // Yield the path only when the file is actually present on disk.
    candidate.exists().then(|| candidate)
}
/// Download the CIC-IDS2017 Friday DDoS CSV to cache. Returns the cached path.
pub fn download_cicids() -> Result<PathBuf> {
let dir = cicids_cache_dir();
let path = dir.join(CICIDS_FILE);
if path.exists() {
eprintln!(" cached: {}", path.display());
return Ok(path);
}
let url = format!("{CICIDS_BASE_URL}/{CICIDS_FILE}");
eprintln!(" downloading: {url}");
eprintln!(" (this is ~170 MB, may take a minute)");
std::fs::create_dir_all(&dir)?;
// Stream to file to avoid holding 170MB in memory.
let resp = reqwest::blocking::Client::builder()
.timeout(std::time::Duration::from_secs(600))
.build()?
.get(&url)
.send()
.with_context(|| format!("fetching {url}"))?
.error_for_status()
.with_context(|| format!("HTTP error for {url}"))?;
let mut file = std::fs::File::create(&path)
.with_context(|| format!("creating {}", path.display()))?;
let bytes = resp.bytes().with_context(|| "reading response body")?;
std::io::Write::write_all(&mut file, &bytes)?;
eprintln!(" saved: {}", path.display());
Ok(path)
}
// --- CSIC 2010 ---
/// Download CSIC 2010 dataset files to cache (delegates to scanner::csic).
pub fn download_csic() -> Result<()> {
if crate::scanner::csic::csic_is_cached() {
eprintln!(" cached: {}", crate::scanner::csic::csic_cache_path().display());
return Ok(());
}
// fetch_csic_dataset downloads, caches, and parses — we only need the download side-effect.
crate::scanner::csic::fetch_csic_dataset()?;
Ok(())
}
/// Download all upstream datasets.
///
/// Fetches (or reuses from cache) CSIC 2010 and the CIC-IDS2017 Friday
/// DDoS CSV, printing progress to stderr. Fails fast on the first error.
pub fn download_all() -> Result<()> {
    eprintln!("downloading upstream datasets...\n");
    eprintln!("[1/2] CSIC 2010 (scanner training data)");
    download_csic()?;
    eprintln!();
    eprintln!("[2/2] CIC-IDS2017 DDoS timing profiles");
    let cicids_path = download_cicids()?;
    eprintln!(" ok: {}\n", cicids_path.display());
    eprintln!("all datasets cached.");
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Both cache directories must live under the expected path segments.
    #[test]
    fn test_cache_paths() {
        let base_display = cache_base().display().to_string();
        assert!(base_display.contains("sunbeam"));

        let cicids_display = cicids_cache_dir().display().to_string();
        assert!(cicids_display.contains("cicids"));
    }
}