Add spawn/delete commands, fix session state and entity broadcast

- marathonctl now supports spawn/delete entity commands
- Fixed session state bug (was transitioning to Left every 5s)
- Fixed entity broadcast to detect Added<NetworkedEntity>
- Added AppCommandQueue pattern for app-level control commands

References: #131, #132
2025-12-24 12:53:50 +00:00
parent a0c13be6d6
commit 8ca02fd492
12 changed files with 1736 additions and 22 deletions
--- a/crates/app/src/bin/marathonctl.rs
+++ b/crates/app/src/bin/marathonctl.rs
@@ -19,7 +19,7 @@ use clap::{Parser, Subcommand};
 use std::io::{Read, Write};
 use std::os::unix::net::UnixStream;

-use libmarathon::networking::{ControlCommand, ControlResponse, SessionId};
+use libmarathon::networking::{ControlCommand, ControlResponse};

 /// Marathon control CLI
 #[derive(Parser, Debug)]
@@ -51,6 +51,25 @@ enum Commands {
    },
    /// Broadcast a ping message
    Ping,
+    /// Spawn an entity
+    Spawn {
+        /// Entity type (e.g., "cube")
+        entity_type: String,
+        /// X position
+        #[arg(short, long, default_value = "0.0")]
+        x: f32,
+        /// Y position
+        #[arg(short, long, default_value = "0.0")]
+        y: f32,
+        /// Z position
+        #[arg(short, long, default_value = "0.0")]
+        z: f32,
+    },
+    /// Delete an entity by UUID
+    Delete {
+        /// Entity UUID
+        entity_id: String,
+    },
 }

 fn main() {
@@ -75,6 +94,22 @@ fn main() {
                },
            }
        }
+        Commands::Spawn { entity_type, x, y, z } => {
+            ControlCommand::SpawnEntity {
+                entity_type,
+                position: [x, y, z],
+            }
+        }
+        Commands::Delete { entity_id } => {
+            use uuid::Uuid;
+            match Uuid::parse_str(&entity_id) {
+                Ok(uuid) => ControlCommand::DeleteEntity { entity_id: uuid },
+                Err(e) => {
+                    eprintln!("Invalid UUID '{}': {}", entity_id, e);
+                    std::process::exit(1);
+                }
+            }
+        }
    };

    // Connect to Unix socket
@@ -135,8 +170,6 @@ fn receive_response(stream: &mut UnixStream) -> Result<ControlResponse, Box<dyn
 }

 fn print_response(response: ControlResponse) {
-    use libmarathon::networking::{SessionInfo, PeerInfo};
-
    match response {
        ControlResponse::Status {
            node_id,
--- a/crates/app/src/control.rs
+++ b/crates/app/src/control.rs
@@ -6,27 +6,90 @@

 use anyhow::Result;
 use bevy::prelude::*;
+use crossbeam_channel::{Receiver, Sender, unbounded};
 use libmarathon::{
    engine::{EngineBridge, EngineCommand},
    networking::{ControlCommand, ControlResponse, SessionId},
 };
+use uuid::Uuid;

 /// Resource holding the control socket path
 #[derive(Resource)]
 pub struct ControlSocketPath(pub String);

+/// System that removes the control socket file when the app exits.
+///
+/// For every `AppExit` message read by this system, and only when a
+/// [`ControlSocketPath`] resource is present, the file at that path is deleted.
+/// Removal stays best-effort: a missing file is ignored, any other failure is
+/// logged as a warning instead of being silently dropped.
+pub fn cleanup_control_socket(
+    mut exit_events: MessageReader<bevy::app::AppExit>,
+    socket_path: Option<Res<ControlSocketPath>>,
+) {
+    for _ in exit_events.read() {
+        let Some(ref path) = socket_path else { continue };
+
+        info!("Cleaning up control socket at {}", path.0);
+        match std::fs::remove_file(&path.0) {
+            Ok(()) => {}
+            // The socket may never have been bound, or an earlier exit message
+            // handled in this same loop already removed it.
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => warn!("Failed to remove control socket at {}: {}", path.0, e),
+        }
+    }
+}
+
+/// Commands that can be sent from the control socket to the app
+#[derive(Debug, Clone)]
+pub enum AppCommand {
+    /// Spawn an entity of the given type at `position`.
+    SpawnEntity {
+        /// Name of the entity type to spawn.
+        entity_type: String,
+        /// Where to place the new entity.
+        position: Vec3,
+    },
+    /// Delete the entity identified by `entity_id`.
+    DeleteEntity {
+        /// UUID of the entity to delete.
+        entity_id: Uuid,
+    },
+}
+
+/// Queue for app-level commands from control socket
+///
+/// Holds both halves of one unbounded crossbeam channel, so a clone of the
+/// queue can be handed to the socket task while the original is stored as a
+/// Bevy resource.
+#[derive(Resource, Clone)]
+pub struct AppCommandQueue {
+    sender: Sender<AppCommand>,
+    receiver: Receiver<AppCommand>,
+}
+
+impl AppCommandQueue {
+    /// Creates an empty queue backed by a fresh unbounded channel.
+    pub fn new() -> Self {
+        let (tx, rx) = unbounded();
+        Self {
+            sender: tx,
+            receiver: rx,
+        }
+    }
+
+    /// Enqueues `command`; a send error, if any, is discarded.
+    pub fn send(&self, command: AppCommand) {
+        drop(self.sender.send(command));
+    }
+
+    /// Takes the next pending command without blocking.
+    ///
+    /// Returns `None` whenever the underlying `try_recv` reports an error.
+    pub fn try_recv(&self) -> Option<AppCommand> {
+        match self.receiver.try_recv() {
+            Ok(command) => Some(command),
+            Err(_) => None,
+        }
+    }
+}
+
+impl Default for AppCommandQueue {
+    /// Equivalent to [`AppCommandQueue::new`].
+    fn default() -> Self {
+        AppCommandQueue::new()
+    }
+}
+
 /// Startup system to launch the control socket server
 #[cfg(not(target_os = "ios"))]
 #[cfg(debug_assertions)]
-pub fn start_control_socket_system(socket_path_res: Res<ControlSocketPath>, bridge: Res<EngineBridge>) {
-    use tokio::io::{AsyncReadExt, AsyncWriteExt};
+pub fn start_control_socket_system(
+    mut commands: Commands,
+    socket_path_res: Res<ControlSocketPath>,
+    bridge: Res<EngineBridge>,
+) {
+    use tokio::io::AsyncReadExt;
    use tokio::net::UnixListener;

    let socket_path = socket_path_res.0.clone();
    info!("Starting control socket at {}", socket_path);

-    // Clone bridge for the async task
+    // Create app command queue
+    let app_queue = AppCommandQueue::new();
+    commands.insert_resource(app_queue.clone());
+
+    // Clone bridge and queue for the async task
    let bridge = bridge.clone();
+    let queue = app_queue;

    // Spawn tokio runtime in background thread
    std::thread::spawn(move || {
@@ -52,6 +115,7 @@ pub fn start_control_socket_system(socket_path_res: Res<ControlSocketPath>, brid
                    Ok((mut stream, _addr)) => {
                        let bridge = bridge.clone();

+                        let queue_clone = queue.clone();
                        tokio::spawn(async move {
                            // Read command length
                            let mut len_buf = [0u8; 4];
@@ -84,7 +148,7 @@ pub fn start_control_socket_system(socket_path_res: Res<ControlSocketPath>, brid
                            info!("Received control command: {:?}", command);

                            // Handle command
-                            let response = handle_command(command, &bridge).await;
+                            let response = handle_command(command, &bridge, &queue_clone).await;

                            // Send response
                            if let Err(e) = send_response(&mut stream, response).await {
@@ -104,7 +168,11 @@ pub fn start_control_socket_system(socket_path_res: Res<ControlSocketPath>, brid
 /// Handle a control command and generate a response
 #[cfg(not(target_os = "ios"))]
 #[cfg(debug_assertions)]
-async fn handle_command(command: ControlCommand, bridge: &EngineBridge) -> ControlResponse {
+async fn handle_command(
+    command: ControlCommand,
+    bridge: &EngineBridge,
+    app_queue: &AppCommandQueue,
+) -> ControlResponse {
    match command {
        ControlCommand::JoinSession { session_code } => {
            match SessionId::from_code(&session_code) {
@@ -129,12 +197,58 @@ async fn handle_command(command: ControlCommand, bridge: &EngineBridge) -> Contr
            }
        }

+        ControlCommand::SpawnEntity { entity_type, position } => {
+            app_queue.send(AppCommand::SpawnEntity {
+                entity_type,
+                position: Vec3::from_array(position),
+            });
+            ControlResponse::Ok {
+                message: "Entity spawn command queued".to_string(),
+            }
+        }
+
+        ControlCommand::DeleteEntity { entity_id } => {
+            app_queue.send(AppCommand::DeleteEntity { entity_id });
+            ControlResponse::Ok {
+                message: format!("Entity delete command queued for {}", entity_id),
+            }
+        }
+
        _ => ControlResponse::Error {
            error: format!("Command {:?} not yet implemented", command),
        },
    }
 }

+/// System that drains the [`AppCommandQueue`] and turns each command into a cube message.
+///
+/// Does nothing when the queue resource has not been inserted. `SpawnEntity`
+/// is honoured only for the `"cube"` entity type; any other type is logged
+/// and dropped. `DeleteEntity` is always forwarded as a `DeleteCubeEvent`.
+pub fn process_app_commands(
+    queue: Option<Res<AppCommandQueue>>,
+    mut spawn_cube_writer: MessageWriter<crate::cube::SpawnCubeEvent>,
+    mut delete_cube_writer: MessageWriter<crate::cube::DeleteCubeEvent>,
+) {
+    let Some(queue) = queue else { return };
+
+    // Pull commands until the queue reports nothing more to deliver.
+    for command in std::iter::from_fn(|| queue.try_recv()) {
+        match command {
+            AppCommand::SpawnEntity { entity_type, position } if entity_type == "cube" => {
+                info!("Spawning cube at {:?}", position);
+                spawn_cube_writer.write(crate::cube::SpawnCubeEvent { position });
+            }
+            AppCommand::SpawnEntity { entity_type, .. } => {
+                warn!("Unknown entity type: {}", entity_type);
+            }
+            AppCommand::DeleteEntity { entity_id } => {
+                info!("Deleting entity {}", entity_id);
+                delete_cube_writer.write(crate::cube::DeleteCubeEvent { entity_id });
+            }
+        }
+    }
+}
+
 /// Send a response back through the Unix socket
 #[cfg(not(target_os = "ios"))]
 #[cfg(debug_assertions)]
--- a/crates/app/src/main.rs
+++ b/crates/app/src/main.rs
@@ -205,6 +205,7 @@ fn main() {
    // Insert control socket path as resource
    app.insert_resource(control::ControlSocketPath(args.control_socket.clone()));
    app.add_systems(Startup, control::start_control_socket_system);
+    app.add_systems(Update, (control::process_app_commands, control::cleanup_control_socket));

    // Rendering-only plugins
    #[cfg(not(feature = "headless"))]
--- a/crates/app/src/setup/control_socket.rs
+++ b/crates/app/src/setup/control_socket.rs
@@ -0,0 +1,253 @@
+//! Unix domain socket control server for remote engine control
+//!
+//! This module provides a Unix socket server for controlling the engine
+//! programmatically without needing screen access or network ports.
+//!
+//! # Security
+//!
+//! Currently debug-only. See issue #135 for production security requirements.
+
+use anyhow::Result;
+use bevy::prelude::*;
+use libmarathon::networking::{ControlCommand, ControlResponse, GossipBridge, SessionId};
+use uuid::Uuid;
+
+/// Spawn Unix domain socket control server for remote engine control
+///
+/// This spawns a tokio task that listens on a Unix socket for control commands.
+/// The socket path is `/tmp/marathon-{session_id}.sock`. Because it calls
+/// `tokio::spawn`, it must be invoked from within a Tokio runtime.
+///
+/// Each connection carries exactly one request: a 4-byte little-endian length
+/// prefix followed by that many bytes of serialized [`ControlCommand`]. One
+/// [`ControlResponse`] is written back and the connection task then ends.
+/// Requests whose declared length exceeds the internal `MAX_COMMAND_LEN` limit
+/// are answered with [`ControlResponse::Error`] before any payload buffer is
+/// allocated.
+///
+/// **Security Note**: This is currently debug-only. See issue #135 for production
+/// security requirements (authentication, rate limiting, etc.).
+///
+/// # Platform Support
+///
+/// This implementation is compiled only for non-iOS debug builds.
+#[cfg(not(target_os = "ios"))]
+#[cfg(debug_assertions)]
+pub fn spawn_control_socket(session_id: SessionId, bridge: GossipBridge, node_id: Uuid) {
+    use tokio::io::AsyncReadExt;
+    use tokio::net::UnixListener;
+
+    /// Largest command payload accepted from a client. The length prefix is
+    /// client-controlled, so it must be bounded before it sizes an allocation.
+    const MAX_COMMAND_LEN: usize = 16 * 1024 * 1024;
+
+    let socket_path = format!("/tmp/marathon-{}.sock", session_id);
+
+    tokio::spawn(async move {
+        // Clean up any existing socket
+        let _ = std::fs::remove_file(&socket_path);
+
+        let listener = match UnixListener::bind(&socket_path) {
+            Ok(l) => {
+                info!("Control socket listening at {}", socket_path);
+                l
+            }
+            Err(e) => {
+                error!("Failed to bind control socket at {}: {}", socket_path, e);
+                return;
+            }
+        };
+
+        // Accept connections in a loop
+        loop {
+            match listener.accept().await {
+                Ok((mut stream, _addr)) => {
+                    let bridge = bridge.clone();
+                    let session_id = session_id.clone();
+
+                    // Spawn a task to handle this connection
+                    tokio::spawn(async move {
+                        // Read command length (4 bytes)
+                        let mut len_buf = [0u8; 4];
+                        if let Err(e) = stream.read_exact(&mut len_buf).await {
+                            error!("Failed to read command length: {}", e);
+                            return;
+                        }
+                        let len = u32::from_le_bytes(len_buf) as usize;
+
+                        // Refuse oversized requests up front instead of allocating
+                        // whatever size the client claims.
+                        if len > MAX_COMMAND_LEN {
+                            error!(
+                                "Control command of {} bytes exceeds limit of {} bytes",
+                                len, MAX_COMMAND_LEN
+                            );
+                            let response = ControlResponse::Error {
+                                error: format!(
+                                    "Command too large: {} bytes (limit {} bytes)",
+                                    len, MAX_COMMAND_LEN
+                                ),
+                            };
+                            let _ = send_response(&mut stream, response).await;
+                            return;
+                        }
+
+                        // Read command bytes
+                        let mut cmd_buf = vec![0u8; len];
+                        if let Err(e) = stream.read_exact(&mut cmd_buf).await {
+                            error!("Failed to read command: {}", e);
+                            return;
+                        }
+
+                        // Deserialize command
+                        let command = match ControlCommand::from_bytes(&cmd_buf) {
+                            Ok(cmd) => cmd,
+                            Err(e) => {
+                                error!("Failed to deserialize command: {}", e);
+                                let response = ControlResponse::Error {
+                                    error: format!("Failed to deserialize command: {}", e),
+                                };
+                                let _ = send_response(&mut stream, response).await;
+                                return;
+                            }
+                        };
+
+                        info!("Received control command: {:?}", command);
+
+                        // Execute command
+                        let response = handle_control_command(command, &bridge, session_id, node_id).await;
+
+                        // Send response
+                        if let Err(e) = send_response(&mut stream, response).await {
+                            error!("Failed to send response: {}", e);
+                        }
+                    });
+                }
+                Err(e) => {
+                    error!("Failed to accept control socket connection: {}", e);
+                }
+            }
+        }
+    });
+}
+
+/// Handle a control command and return a response
+///
+/// Only `GetStatus`, `SendTestMessage`, `InjectMessage` and `BroadcastMessage`
+/// touch the bridge. `Shutdown` is acknowledged without doing anything, and the
+/// session-lifecycle and entity commands return a [`ControlResponse::Error`].
+///
+/// * `command` - the decoded request.
+/// * `bridge` - gossip bridge used to send or inject messages.
+/// * `session_id` / `node_id` - echoed back in the `Status` response; `node_id`
+///   is also used as the sender of the test `SyncRequest`.
+#[cfg(not(target_os = "ios"))]
+#[cfg(debug_assertions)]
+async fn handle_control_command(
+    command: ControlCommand,
+    bridge: &GossipBridge,
+    session_id: SessionId,
+    node_id: Uuid,
+) -> ControlResponse {
+    match command {
+        ControlCommand::GetStatus => {
+            // Get queue sizes from bridge
+            // NOTE(review): this takes at most one message off the outgoing queue and
+            // hands it back through `send`, so the reported size is only ever 0 or 1,
+            // and the message may end up re-queued in a different position — confirm
+            // against `GossipBridge`.
+            let outgoing_size = bridge.try_recv_outgoing().map(|msg| {
+                // Put it back
+                let _ = bridge.send(msg);
+                1
+            }).unwrap_or(0);
+
+            ControlResponse::Status {
+                node_id,
+                session_id,
+                outgoing_queue_size: outgoing_size,
+                incoming_queue_size: 0, // We'd need to peek without consuming
+                connected_peers: None, // Not easily available from bridge
+            }
+        }
+        ControlCommand::SendTestMessage { content } => {
+            use libmarathon::networking::{VersionedMessage, VectorClock, SyncMessage};
+
+            // Send a SyncRequest as a test message (lightweight ping-like message)
+            // `content` is not part of the message sent; it is only echoed in the reply.
+            let message = SyncMessage::SyncRequest {
+                node_id,
+                vector_clock: VectorClock::new(),
+            };
+            let versioned = VersionedMessage::new(message);
+
+            match bridge.send(versioned) {
+                Ok(_) => ControlResponse::Ok {
+                    message: format!("Sent test message: {}", content),
+                },
+                Err(e) => ControlResponse::Error {
+                    error: format!("Failed to send: {}", e),
+                },
+            }
+        }
+        ControlCommand::InjectMessage { message } => {
+            // The already-versioned message is pushed as-is onto the incoming side.
+            match bridge.push_incoming(message) {
+                Ok(_) => ControlResponse::Ok {
+                    message: "Message injected into incoming queue".to_string(),
+                },
+                Err(e) => ControlResponse::Error {
+                    error: format!("Failed to inject message: {}", e),
+                },
+            }
+        }
+        ControlCommand::BroadcastMessage { message } => {
+            use libmarathon::networking::VersionedMessage;
+
+            // Wrap the raw sync message before handing it to the bridge.
+            let versioned = VersionedMessage::new(message);
+            match bridge.send(versioned) {
+                Ok(_) => ControlResponse::Ok {
+                    message: "Message broadcast".to_string(),
+                },
+                Err(e) => ControlResponse::Error {
+                    error: format!("Failed to broadcast: {}", e),
+                },
+            }
+        }
+        ControlCommand::Shutdown => {
+            // Logged and acknowledged only; nothing is actually shut down here.
+            warn!("Shutdown command received via control socket");
+            ControlResponse::Ok {
+                message: "Shutdown not yet implemented".to_string(),
+            }
+        }
+
+        // Session lifecycle commands (TODO: implement these properly)
+        ControlCommand::JoinSession { session_code } => {
+            ControlResponse::Error {
+                error: format!("JoinSession not yet implemented (requested: {})", session_code),
+            }
+        }
+        ControlCommand::LeaveSession => {
+            ControlResponse::Error {
+                error: "LeaveSession not yet implemented".to_string(),
+            }
+        }
+        ControlCommand::GetSessionInfo => {
+            ControlResponse::Error {
+                error: "GetSessionInfo not yet implemented".to_string(),
+            }
+        }
+        ControlCommand::ListSessions => {
+            ControlResponse::Error {
+                error: "ListSessions not yet implemented".to_string(),
+            }
+        }
+        ControlCommand::DeleteSession { session_code } => {
+            ControlResponse::Error {
+                error: format!("DeleteSession not yet implemented (requested: {})", session_code),
+            }
+        }
+        ControlCommand::ListPeers => {
+            ControlResponse::Error {
+                error: "ListPeers not yet implemented".to_string(),
+            }
+        }
+        // Entity commands are rejected on this socket; the error text points
+        // callers at the app-level socket instead.
+        ControlCommand::SpawnEntity { .. } => {
+            ControlResponse::Error {
+                error: "SpawnEntity not available on session-level socket. Use app-level socket.".to_string(),
+            }
+        }
+        ControlCommand::DeleteEntity { .. } => {
+            ControlResponse::Error {
+                error: "DeleteEntity not available on session-level socket. Use app-level socket.".to_string(),
+            }
+        }
+    }
+}
+
+/// Send a response back through the Unix socket
+///
+/// The response is serialized with `ControlResponse::to_bytes` and written as a
+/// 4-byte little-endian length prefix followed by the payload, then flushed.
+///
+/// # Errors
+///
+/// Returns an error if serialization fails, if the serialized response does
+/// not fit in the `u32` length prefix, or if any write or flush on `stream`
+/// fails.
+#[cfg(not(target_os = "ios"))]
+#[cfg(debug_assertions)]
+async fn send_response(
+    stream: &mut tokio::net::UnixStream,
+    response: ControlResponse,
+) -> Result<()> {
+    use tokio::io::AsyncWriteExt;
+
+    let bytes = response.to_bytes()?;
+    // A plain `as u32` cast would silently truncate an oversized payload and
+    // send a prefix that disagrees with the bytes that follow.
+    let len = u32::try_from(bytes.len()).map_err(|_| {
+        anyhow::anyhow!(
+            "response of {} bytes does not fit in the u32 length prefix",
+            bytes.len()
+        )
+    })?;
+
+    // Write length prefix
+    stream.write_all(&len.to_le_bytes()).await?;
+    // Write response bytes
+    stream.write_all(&bytes).await?;
+    stream.flush().await?;
+
+    Ok(())
+}
+
+/// No-op stub for iOS builds: accepts the same arguments as the real
+/// implementation and does nothing with them.
+#[cfg(target_os = "ios")]
+pub fn spawn_control_socket(_session_id: SessionId, _bridge: GossipBridge, _node_id: Uuid) {}
+
+/// No-op stub for release builds (non-iOS, `debug_assertions` off): accepts
+/// the same arguments as the debug implementation and does nothing with them.
+#[cfg(all(not(target_os = "ios"), not(debug_assertions)))]
+pub fn spawn_control_socket(_session_id: SessionId, _bridge: GossipBridge, _node_id: Uuid) {
+    // TODO(#135): Implement secure control socket for release builds with authentication
+}
--- a/crates/app/src/setup/mod.rs
+++ b/crates/app/src/setup/mod.rs
@@ -47,11 +47,15 @@
 //! 2. **Tokio → Bevy**: GossipBridge's internal queue (push_incoming)
 //! 3. **Thread handoff**: crossbeam_channel (one-time GossipBridge transfer)

+mod control_socket;
+
 use anyhow::Result;
 use bevy::prelude::*;
 use libmarathon::networking::{GossipBridge, SessionId};
 use uuid::Uuid;

+use control_socket::spawn_control_socket;
+
 /// Session ID to use for network initialization
 ///
 /// This resource must be inserted before setup_gossip_networking runs.
@@ -222,11 +226,12 @@ async fn init_gossip(session_id: SessionId) -> Result<GossipBridge> {
    let (sender, mut receiver) = subscribe_handle.split();

    // Wait for join (with timeout since we might be the first node)
-    info!("Waiting for gossip join...");
-    match tokio::time::timeout(std::time::Duration::from_secs(2), receiver.joined()).await {
-        | Ok(Ok(())) => info!("Joined gossip swarm"),
+    // Increased timeout to 10s to allow mDNS discovery to work
+    info!("Waiting for gossip join (10s timeout for mDNS discovery)...");
+    match tokio::time::timeout(std::time::Duration::from_secs(10), receiver.joined()).await {
+        | Ok(Ok(())) => info!("Joined gossip swarm successfully"),
        | Ok(Err(e)) => warn!("Join error: {} (proceeding anyway)", e),
-        | Err(_) => info!("Join timeout (first node in swarm)"),
+        | Err(_) => info!("Join timeout - likely first node in swarm (proceeding anyway)"),
    }

    // Create bridge
@@ -236,6 +241,9 @@ async fn init_gossip(session_id: SessionId) -> Result<GossipBridge> {
    // Spawn forwarding tasks - pass endpoint, router, gossip to keep them alive
    spawn_bridge_tasks(sender, receiver, bridge.clone(), endpoint, router, gossip);

+    // Spawn control socket server for remote control (debug only)
+    spawn_control_socket(session_id, bridge.clone(), node_id);
+
    Ok(bridge)
 }

@@ -301,7 +309,9 @@ fn spawn_bridge_tasks(
        loop {
            match tokio::time::timeout(Duration::from_millis(100), receiver.next()).await {
                | Ok(Some(Ok(event))) => {
-                    if let iroh_gossip::api::Event::Received(msg) = event {
+                    match event {
+                        | iroh_gossip::api::Event::Received(msg) => {
+                            info!("[Node {}] Received message from gossip", node_id);
                            if let Ok(versioned_msg) =
                                rkyv::from_bytes::<VersionedMessage, rkyv::rancor::Failure>(&msg.content)
                            {
@@ -309,6 +319,16 @@ fn spawn_bridge_tasks(
                                    error!("[Node {}] Push incoming failed: {}", node_id, e);
                                }
                            }
+                        },
+                        | iroh_gossip::api::Event::NeighborUp(peer_id) => {
+                            info!("[Node {}] Peer connected: {}", node_id, peer_id);
+                        },
+                        | iroh_gossip::api::Event::NeighborDown(peer_id) => {
+                            warn!("[Node {}] Peer disconnected: {}", node_id, peer_id);
+                        },
+                        | iroh_gossip::api::Event::Lagged => {
+                            warn!("[Node {}] Event stream lagged - some events may have been missed", node_id);
+                        },
                    }
                },
                | Ok(Some(Err(e))) => error!("[Node {}] Receiver error: {}", node_id, e),
--- a/crates/libmarathon/src/networking/control.rs
+++ b/crates/libmarathon/src/networking/control.rs
@@ -0,0 +1,170 @@
+//! Control socket protocol for remote engine control
+//!
+//! This module defines the message protocol for controlling the engine via
+//! Unix domain sockets without exposing network ports. Used for testing,
+//! validation, and programmatic control of sessions.
+//!
+//! # Security
+//!
+//! Currently debug-only. See issue #135 for production security requirements.
+
+use uuid::Uuid;
+
+use crate::networking::{
+    SessionId,
+    SessionState,
+    SyncMessage,
+    VersionedMessage,
+};
+
+/// Control command sent to the engine
+///
+/// Serialized with rkyv (see `to_bytes` / `from_bytes`) and carried over the
+/// control socket.
+#[derive(Debug, Clone, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
+pub enum ControlCommand {
+    /// Get current session status
+    GetStatus,
+
+    /// Send a test message through gossip
+    SendTestMessage { content: String },
+
+    /// Inject a message directly into the incoming queue (for testing)
+    InjectMessage { message: VersionedMessage },
+
+    /// Broadcast a full sync message through gossip
+    BroadcastMessage { message: SyncMessage },
+
+    /// Request graceful shutdown
+    Shutdown,
+
+    // Session lifecycle commands
+
+    /// Join a specific session by code
+    JoinSession { session_code: String },
+
+    /// Leave the current session gracefully
+    LeaveSession,
+
+    /// Get detailed current session information
+    GetSessionInfo,
+
+    /// List all sessions in the database
+    ListSessions,
+
+    /// Delete a session from the database
+    DeleteSession { session_code: String },
+
+    /// Get list of connected peers in current session
+    ListPeers,
+
+    // Entity commands
+
+    /// Spawn an entity with a given type and position
+    SpawnEntity {
+        /// Name of the entity type to spawn.
+        entity_type: String,
+        /// Position as `[x, y, z]`.
+        position: [f32; 3],
+    },
+
+    /// Delete an entity by its UUID
+    DeleteEntity { entity_id: Uuid },
+}
+
+/// Detailed session information
+#[derive(Debug, Clone, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
+pub struct SessionInfo {
+    /// Identifier of the session being described.
+    pub session_id: SessionId,
+    /// Optional human-readable session name.
+    pub session_name: Option<String>,
+    /// Lifecycle state of the session.
+    pub state: SessionState,
+    /// Creation time. NOTE(review): presumably a Unix timestamp — confirm the unit.
+    pub created_at: i64,
+    /// Last-activity time. NOTE(review): presumably same unit as `created_at` — confirm.
+    pub last_active: i64,
+    /// Number of entities recorded for the session.
+    pub entity_count: usize,
+}
+
+/// Peer information
+#[derive(Debug, Clone, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
+pub struct PeerInfo {
+    /// Node identifier of the peer.
+    pub node_id: Uuid,
+    /// When the peer connected, if known.
+    /// NOTE(review): presumably a Unix timestamp — confirm the unit.
+    pub connected_since: Option<i64>,
+}
+
+/// Response from the engine to a control command
+///
+/// Serialized with rkyv (see `to_bytes` / `from_bytes`) and carried over the
+/// control socket.
+#[derive(Debug, Clone, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
+pub enum ControlResponse {
+    /// Session status information
+    Status {
+        /// Identifier of the responding node.
+        node_id: Uuid,
+        /// Session the node is currently associated with.
+        session_id: SessionId,
+        /// Reported number of messages in the outgoing queue.
+        outgoing_queue_size: usize,
+        /// Reported number of messages in the incoming queue.
+        incoming_queue_size: usize,
+        /// Number of connected peers (if available from gossip)
+        connected_peers: Option<usize>,
+    },
+
+    /// Detailed session information
+    SessionInfo(SessionInfo),
+
+    /// List of sessions
+    Sessions(Vec<SessionInfo>),
+
+    /// List of connected peers
+    Peers(Vec<PeerInfo>),
+
+    /// Acknowledgment of command execution
+    Ok { message: String },
+
+    /// Error occurred during command execution
+    Error { error: String },
+}
+
+impl ControlCommand {
+    /// Encodes this command with rkyv and copies the result into a plain `Vec<u8>`.
+    pub fn to_bytes(&self) -> Result<Vec<u8>, rkyv::rancor::Error> {
+        let archived = rkyv::to_bytes::<rkyv::rancor::Error>(self)?;
+        Ok(archived.to_vec())
+    }
+
+    /// Decodes a command from rkyv-encoded `bytes`.
+    ///
+    /// NOTE(review): rkyv may require `bytes` to be suitably aligned — confirm
+    /// for buffers read straight off a socket.
+    pub fn from_bytes(bytes: &[u8]) -> Result<Self, rkyv::rancor::Error> {
+        let command = rkyv::from_bytes::<Self, rkyv::rancor::Error>(bytes)?;
+        Ok(command)
+    }
+}
+
+impl ControlResponse {
+    /// Encodes this response with rkyv and copies the result into a plain `Vec<u8>`.
+    pub fn to_bytes(&self) -> Result<Vec<u8>, rkyv::rancor::Error> {
+        let archived = rkyv::to_bytes::<rkyv::rancor::Error>(self)?;
+        Ok(archived.to_vec())
+    }
+
+    /// Decodes a response from rkyv-encoded `bytes`.
+    ///
+    /// NOTE(review): rkyv may require `bytes` to be suitably aligned — confirm
+    /// for buffers read straight off a socket.
+    pub fn from_bytes(bytes: &[u8]) -> Result<Self, rkyv::rancor::Error> {
+        let response = rkyv::from_bytes::<Self, rkyv::rancor::Error>(bytes)?;
+        Ok(response)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// `GetStatus` survives a serialize/deserialize cycle.
+    #[test]
+    fn test_command_roundtrip() {
+        let encoded = ControlCommand::GetStatus.to_bytes().unwrap();
+        let decoded = ControlCommand::from_bytes(&encoded).unwrap();
+
+        assert!(
+            matches!(decoded, ControlCommand::GetStatus),
+            "Failed to decode GetStatus"
+        );
+    }
+
+    /// An `Ok` response keeps its message across a serialize/deserialize cycle.
+    #[test]
+    fn test_response_roundtrip() {
+        let original = ControlResponse::Ok {
+            message: "Test".to_string(),
+        };
+        let encoded = original.to_bytes().unwrap();
+        let decoded = ControlResponse::from_bytes(&encoded).unwrap();
+
+        let ControlResponse::Ok { message } = decoded else {
+            panic!("Failed to decode Ok response");
+        };
+        assert_eq!(message, "Test");
+    }
+}
--- a/crates/libmarathon/src/networking/delta_generation.rs
+++ b/crates/libmarathon/src/networking/delta_generation.rs
@@ -52,7 +52,7 @@ impl NodeVectorClock {
 /// System to generate and broadcast EntityDelta messages
 ///
 /// This system:
-/// 1. Queries for Changed<NetworkedEntity>
+/// 1. Queries for Added<NetworkedEntity> or Changed<NetworkedEntity>
 /// 2. Serializes all components on those entities
 /// 3. Builds EntityDelta messages
 /// 4. Broadcasts via GossipBridge
@@ -73,7 +73,7 @@ pub fn generate_delta_system(world: &mut World) {

    let changed_entities: Vec<(Entity, uuid::Uuid, uuid::Uuid)> = {
        let mut query =
-            world.query_filtered::<(Entity, &NetworkedEntity), Changed<NetworkedEntity>>();
+            world.query_filtered::<(Entity, &NetworkedEntity), Or<(Added<NetworkedEntity>, Changed<NetworkedEntity>)>>();
        query
            .iter(world)
            .map(|(entity, networked)| (entity, networked.network_id, networked.owner_node_id))
--- a/crates/libmarathon/src/networking/mod.rs
+++ b/crates/libmarathon/src/networking/mod.rs
@@ -36,6 +36,7 @@ mod auth;
 mod blob_support;
 mod change_detection;
 mod components;
+mod control;
 mod delta_generation;
 mod entity_map;
 mod error;
@@ -62,6 +63,7 @@ pub use auth::*;
 pub use blob_support::*;
 pub use change_detection::*;
 pub use components::*;
+pub use control::*;
 pub use delta_generation::*;
 pub use entity_map::*;
 pub use error::*;
--- a/crates/libmarathon/src/networking/session_lifecycle.rs
+++ b/crates/libmarathon/src/networking/session_lifecycle.rs
@@ -168,7 +168,8 @@ pub fn save_session_on_shutdown_system(world: &mut World) {

    // Update session metadata
    session.touch();
-    session.transition_to(SessionState::Left);
+    // Note: We don't transition to Left here - that only happens on actual shutdown
+    // This periodic save just persists the current state

    // Count entities in the world
    let entity_count = world
--- a/docs/agent-simulation-breakdown.md
+++ b/docs/agent-simulation-breakdown.md
@@ -0,0 +1,433 @@
+# Agent Simulation Framework - Task Breakdown
+
+**Epic:** Agent Simulation Framework (#5)
+**Overall Size:** XXXL (171 points across 12 phases)
+**Priority:** P0 (Critical - core differentiator for Marathon)
+
+This document breaks down the 12 layers of the agent simulation framework into specific, sized tasks for prioritization and scheduling.
+
+**Philosophy:** Generic composition over special cases. Emergence over authorship. Consequences without failure. Existence is cheap; cognition is expensive.
+
+**Approach:** Incremental implementation. Each layer should be validated in isolation before adding the next. Start with foundation, add one layer at a time, continuously test performance and emergent behavior.
+
+---
+
+## Phase 1: Foundation (Entity Composition)
+
+**Phase Goal:** Implement entity composition system with core component categories
+**Phase Size:** 16 points
+**Dependencies:** Bevy ECS
+**Risk:** Low (building on solid ECS foundation)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|-----------|----------|----------|
+| 1.1 | Define entity component categories and marker traits | S | 2 | Create type hierarchy for components (Lifecycle, PhysicalPresence, Needs, etc.) | P0 |
+| 1.2 | Implement Lifecycle state machine for all entity types | M | 4 | States: Unborn → Infant → Child → Adolescent → Adult → Elder → Deceased | P0 |
+| 1.3 | Implement PhysicalPresence component with position and movement | M | 4 | Position, velocity, pathfinding integration, spatial indexing | P0 |
+| 1.4 | Implement Needs system (biological and psychological drives) | M | 4 | Hunger, thirst, sleep, social, safety needs with decay and satisfaction | P0 |
+| 1.5 | Implement Instincts system (species-typical behaviors) | S | 2 | Basic instincts (flee danger, seek food, seek shelter) | P0 |
+
+**Phase 1 Total:** 16 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we use existing ECS patterns? YES - leverage Bevy's component model
+- **Amplify Learning:** What will we learn? How to compose complex entities from simple components
+- **Deliver Fast:** Can we implement incrementally? YES - add components one at a time
+- **Build Quality In:** Risk of component coupling? YES - enforce clean boundaries between categories
+
+### Phase 1 Recommendations
+1. **Start with marker traits** - define component categories clearly
+2. **Validate Lifecycle separately** - ensure state machine is solid before adding other components
+3. **Test composition patterns** - verify entities can have any combination of components
+
+---
+
+## Phase 2: Life Arc System
+
+**Phase Goal:** Implement life arc state machines and autonomy gradient
+**Phase Size:** 12 points
+**Dependencies:** Phase 1 complete
+**Risk:** Medium (complex state transitions, schedule authority handoff)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|-----------|----------|----------|
+| 2.1 | Implement life arc state machines (Child → Adult → Elder) | L | 8 | Major state transitions with capability changes and schedule authority shifts | P0 |
+| 2.2 | Implement autonomy gradient (None → Partial → Full) | S | 2 | Track who controls entity schedule (parent, self, institution) | P0 |
+| 2.3 | Implement life arc transition triggers | S | 2 | Age-based, event-based, and condition-based transitions | P1 |
+
+**Phase 2 Total:** 12 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we hardcode ages? NO - emergent transitions are key to feeling alive
+- **Amplify Learning:** What will we learn? How autonomy affects entity behavior
+- **Deliver Fast:** Can we implement incrementally? YES - start with simple age transitions
+- **Build Quality In:** Risk of broken transitions? YES - comprehensive state transition testing
+
+### Phase 2 Recommendations
+1. **Start with simple age-based transitions** - Child(0-12) → Adolescent(13-17) → Adult(18+)
+2. **Test authority handoff carefully** - ensure schedule control transfers correctly
+3. **Validate edge cases** - what happens when parent dies while child is dependent?
+
+---
+
+## Phase 3: Schedule System
+
+**Phase Goal:** Implement daily schedule generation from roles and personality
+**Phase Size:** 14 points
+**Dependencies:** Phase 2 complete
+**Risk:** Medium (schedule conflicts, authority model complexity)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|-----------|----------|----------|
+| 3.1 | Implement daily schedule generation from roles | M | 4 | Generate time blocks (work, sleep, leisure) based on role and personality | P0 |
+| 3.2 | Implement schedule authority model (autonomous, external, partial) | M | 4 | Track who can modify entity schedule (self, parent, employer) | P0 |
+| 3.3 | Implement shared commitments between related entities | M | 4 | Couples, families, work teams share schedule blocks | P1 |
+| 3.4 | Implement schedule regeneration on condition changes | S | 2 | Rebuild schedule when sick, pregnant, grieving, etc. | P1 |
+
+**Phase 3 Total:** 14 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we use fixed schedules? NO - dynamic schedules enable emergent behavior
+- **Amplify Learning:** What will we learn? How schedule conflicts get resolved
+- **Deliver Fast:** Can we implement incrementally? YES - basic schedules first, then shared commitments
+- **Build Quality In:** Risk of schedule conflicts? YES - need clear resolution rules
+
+### Phase 3 Recommendations
+1. **Start with single-entity schedules** - validate basic time blocking works
+2. **Add authority later** - ensure autonomous schedules work before external control
+3. **Test regeneration** - verify schedules adapt to condition changes
+
+---
+
+## Phase 4: Behavior Tree System
+
+**Phase Goal:** Implement behavior tree framework and need-driven prioritization
+**Phase Size:** 16 points
+**Dependencies:** Phase 3 complete
+**Risk:** High (performance critical, complex evaluation logic)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|-----------|----------|----------|
+| 4.1 | Implement behavior tree framework and evaluation engine | L | 8 | Tree structure, node types (sequence, selector, action), tick evaluation | P0 |
+| 4.2 | Implement activity-specific behavior trees | M | 4 | Work, Leisure, Social, Maintenance activities with contextual behaviors | P0 |
+| 4.3 | Implement need-driven behavior prioritization | M | 4 | Urgent needs interrupt schedule, utility-based selection | P0 |
+
+**Phase 4 Total:** 16 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we use FSM instead? NO - behavior trees compose better
+- **Amplify Learning:** What will we learn? How needs affect behavior selection
+- **Deliver Fast:** Can we implement incrementally? YES - simple trees first, add complexity
+- **Build Quality In:** Risk of infinite loops? YES - add evaluation limits and cycle detection
+
+### Phase 4 Recommendations
+1. **Start with simple trees** - single activity type, no interruption
+2. **Profile evaluation performance** - this is frame-rate critical for nearby entities
+3. **Test need interruption** - verify urgent hunger can interrupt work
+
+---
+
+## Phase 5: World Substrate
+
+**Phase Goal:** Implement substrate layer types and environmental effects
+**Phase Size:** 14 points
+**Dependencies:** Phase 1 complete (can parallelize with Phases 2-4)
+**Risk:** Medium (spatial queries performance, substance simulation)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|-----------|----------|----------|
+| 5.1 | Implement substrate layer types (substances, temperature, light, sound, air quality, contagion) | L | 8 | Spatial grids or continuous fields for environmental state | P1 |
+| 5.2 | Implement trigger volume system for spatial events | S | 2 | Zones that affect entities (fire, poison gas, music) | P1 |
+| 5.3 | Implement generic substance transfer and consumption | M | 4 | Entities consume/produce substances, transfer between locations | P2 |
+
+**Phase 5 Total:** 14 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we skip substrate entirely? NO - enables fire, disease, pollution
+- **Amplify Learning:** What will we learn? How spatial state affects entity behavior
+- **Deliver Fast:** Can we implement incrementally? YES - start with simple trigger volumes
+- **Build Quality In:** Risk of performance issues? YES - spatial queries must be efficient
+
+### Phase 5 Recommendations
+1. **Start with trigger volumes** - simpler than continuous fields
+2. **Use spatial indexing** - KD-tree or grid for efficient queries
+3. **Test with 500 entities** - verify performance scales
+
+---
+
+## Phase 6: Institutions
+
+**Phase Goal:** Implement institution entity type with coordination blackboard
+**Phase Size:** 16 points
+**Dependencies:** Phase 3 complete
+**Risk:** High (cross-entity coordination, emergent organizational behavior)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|-----------|----------|----------|
+| 6.1 | Implement Institution entity type with Governance, Resources, Reputation | M | 4 | Special entity type that manages members and shared state | P0 |
+| 6.2 | Implement shared blackboard state for coordination | L | 8 | Shared memory for work assignments, inventory, schedules | P0 |
+| 6.3 | Implement worker schedule template generation | M | 4 | Institutions create schedule templates for roles (baker, guard, teacher) | P1 |
+
+**Phase 6 Total:** 16 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we hardcode workplace behavior? NO - emergent coordination is key
+- **Amplify Learning:** What will we learn? How entities coordinate without communication
+- **Deliver Fast:** Can we implement incrementally? YES - simple bakery first, then generalize
+- **Build Quality In:** Risk of coordination deadlocks? YES - need clear state machine
+
+### Phase 6 Recommendations
+1. **Start with single institution** - bakery with 1 baker, 1 apprentice
+2. **Validate blackboard pattern** - ensure shared state enables coordination
+3. **Test with multiple institutions** - verify they don't interfere
+
+---
+
+## Phase 7: Relationships
+
+**Phase Goal:** Implement relationship state and coordination levels
+**Phase Size:** 14 points
+**Dependencies:** Phase 3 complete
+**Risk:** Medium (relationship evolution, coordination complexity)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|-----------|----------|----------|
+| 7.1 | Implement Relationship state and evolution | M | 4 | Bond strength, interaction history, sentiment tracking | P1 |
+| 7.2 | Implement bond types (Romantic, Familial, Friendship, Professional, Caretaking) | M | 4 | Different bond types enable different interactions | P1 |
+| 7.3 | Implement coordination levels (None → AdHoc → Recurring → Cohabiting → Dependent) | M | 4 | Stronger bonds enable tighter schedule coordination | P1 |
+| 7.4 | Implement relationship effects on all simulation layers | S | 2 | Relationships affect needs, schedules, behaviors, conditions | P2 |
+
+**Phase 7 Total:** 14 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we skip relationships? NO - core to "living village" feel
+- **Amplify Learning:** What will we learn? How bonds enable coordination
+- **Deliver Fast:** Can we implement incrementally? YES - family bonds first, then friends
+- **Build Quality In:** Risk of relationship graph bugs? YES - need clear formation/dissolution rules
+
+### Phase 7 Recommendations
+1. **Start with familial bonds** - parent-child relationships are simplest
+2. **Test coordination levels** - verify cohabiting couples share schedules
+3. **Validate evolution** - bonds should strengthen/weaken from interactions
+
+---
+
+## Phase 8: Conditions
+
+**Phase Goal:** Implement condition system with effects on all layers
+**Phase Size:** 12 points
+**Dependencies:** Phases 1-4 complete
+**Risk:** Medium (condition interactions, contagion simulation)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|-----------|----------|----------|
+| 8.1 | Implement Condition system with severity and duration | M | 4 | Temporary states (sick, pregnant, grieving, drunk) | P1 |
+| 8.2 | Implement condition effects (need modifiers, capability modifiers, schedule overrides, behavior modifiers) | M | 4 | Conditions affect all simulation layers | P1 |
+| 8.3 | Implement contagion system for spreading conditions | M | 4 | Diseases spread through proximity and substrate | P2 |
+
+**Phase 8 Total:** 12 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we hardcode illness? NO - generic conditions enable emergent drama
+- **Amplify Learning:** What will we learn? How conditions cascade through simulation
+- **Deliver Fast:** Can we implement incrementally? YES - simple illness first, then contagion
+- **Build Quality In:** Risk of runaway epidemics? YES - need disease progression model
+
+### Phase 8 Recommendations
+1. **Start with non-contagious conditions** - pregnancy, grief, intoxication
+2. **Add simple illness** - validate need modifiers and schedule overrides work
+3. **Test contagion carefully** - ensure diseases don't wipe out entire village
+
+---
+
+## Phase 9: Tools & Capabilities
+
+**Phase Goal:** Implement capability types and tool operator modes
+**Phase Size:** 12 points
+**Dependencies:** Phase 1 complete
+**Risk:** Medium (capability composition, tool quality effects)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|-----------|----------|----------|
+| 9.1 | Implement capability types (Movement, Communication, Transformation, Force, Storage) | M | 4 | Abstract capabilities that entities and tools provide | P1 |
+| 9.2 | Implement tool operator modes (Inhabit, Wield, Station, Wear) | M | 4 | Different ways entities use tools (house, hammer, oven, clothes) | P1 |
+| 9.3 | Implement tools as institutional resources | S | 2 | Bakery owns ovens, blacksmith owns anvils | P2 |
+| 9.4 | Implement tool quality affecting output | S | 2 | Better tools produce better results | P2 |
+
+**Phase 9 Total:** 12 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we skip tools? NO - needed for production workflows
+- **Amplify Learning:** What will we learn? How capabilities compose
+- **Deliver Fast:** Can we implement incrementally? YES - simple wielded tools first
+- **Build Quality In:** Risk of capability conflicts? YES - need clear capability stacking rules
+
+### Phase 9 Recommendations
+1. **Start with wielded tools** - hammer, axe, hoe
+2. **Add stations** - ovens, forges, looms require entity presence
+3. **Test quality modifiers** - verify master blacksmith + good anvil = quality sword
+
+---
+
+## Phase 10: Production Workflows
+
+**Phase Goal:** Implement batch entity system for multi-step processes
+**Phase Size:** 14 points
+**Dependencies:** Phase 9 complete
+**Risk:** High (workflow state persistence, interruption handling)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|-----------|----------|----------|
+| 10.1 | Implement batch entity system for multi-step processes | L | 8 | Bread batch progresses: mix → knead → proof → bake → cool | P1 |
+| 10.2 | Implement recipe and phase system | M | 4 | Recipes define inputs, phases, outputs, time/skill requirements | P1 |
+| 10.3 | Implement interruptible work with state persistence | S | 2 | Baker can pause kneading to serve customer, resume later | P2 |
+
+**Phase 10 Total:** 14 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we use instant crafting? NO - multi-step processes enable depth
+- **Amplify Learning:** What will we learn? How to persist complex workflow state
+- **Deliver Fast:** Can we implement incrementally? YES - simple 1-phase recipes first
+- **Build Quality In:** Risk of lost state on interruption? YES - need save/restore
+
+### Phase 10 Recommendations
+1. **Start with simple recipes** - bread (mix → bake), no interruption
+2. **Add multi-phase** - validate state transitions work
+3. **Test interruption** - ensure baker can resume kneading after break
+
+---
+
+## Phase 11: Performance Architecture
+
+**Phase Goal:** Implement simulation tier system and discrete event simulation
+**Phase Size:** 15 points
+**Dependencies:** Phase 4 complete
+**Risk:** Critical (performance determines feasibility of 500 entities)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|-----------|----------|----------|
+| 11.1 | Implement simulation tier system (Attached, Nearby, Background, Wildlife) | L | 8 | Different update strategies for different proximity levels | P0 |
+| 11.2 | Implement tier promotion/demotion based on player proximity | M | 4 | Seamless transitions as players move through world | P0 |
+| 11.3 | Implement discrete event simulation and wake queue | S | 3 | Background entities only update when something changes | P1 |
+
+**Phase 11 Total:** 15 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we update all entities every frame? NO - exceeds iPad budget
+- **Amplify Learning:** What will we learn? How to scale simulation efficiently
+- **Deliver Fast:** Can we implement incrementally? YES - simple tier system first
+- **Build Quality In:** Risk of visible tier transitions? YES - need smooth fade
+
+### Phase 11 Recommendations
+1. **Profile early and often** - this determines if 500 entities is feasible
+2. **Start with 2 tiers** - nearby (full), background (interpolated)
+3. **Add wake queue** - discrete events are key to efficient background simulation
+4. **Test on iPad** - don't optimize for desktop and hope it works
+
+---
+
+## Phase 12: Networking & Sync
+
+**Phase Goal:** Implement CRDT-based synchronization and entity ownership
+**Phase Size:** 16 points
+**Dependencies:** All previous phases complete
+**Risk:** Critical (desyncs break multiplayer experience)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|-----------|----------|----------|
+| 12.1 | Implement CRDT-based state synchronization for slow-changing layers | L | 8 | Life arcs, schedules, relationships use CRDTs for eventual consistency | P0 |
+| 12.2 | Implement behavior tree outcome synchronization | M | 4 | Fast-changing behavior trees sync outcomes, not state | P0 |
+| 12.3 | Implement entity ownership model | S | 2 | Entities owned by nearest player, ownership transfers on proximity | P0 |
+| 12.4 | Implement spatial ownership for substrate | S | 2 | World divided into chunks, each owned by one peer | P1 |
+
+**Phase 12 Total:** 16 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we use lockstep sync? NO - too slow for 500 entities
+- **Amplify Learning:** What will we learn? How to sync without desyncs
+- **Deliver Fast:** Can we implement incrementally? YES - basic ownership first
+- **Build Quality In:** Risk of desyncs? YES - extensive multiplayer testing required
+
+### Phase 12 Recommendations
+1. **Start with entity ownership** - simplest sync model
+2. **Add CRDT sync for schedules** - slow-changing state is easier
+3. **Test with 2 peers first** - validate before scaling to 4
+4. **Add reconciliation** - handle ownership conflicts gracefully
+5. **Stress test with 500 entities** - verify no desyncs in 1-hour sessions
+
+---
+
+## Overall WSJF Analysis
+
+Using Cost of Delay / Duration to prioritize phases:
+
+| Phase | Size | Business Value | Time Criticality | Risk Reduction | CoD | WSJF |
+|-------|------|----------------|------------------|----------------|-----|------|
+| Phase 1 | 16 | 8/10 | 9/10 | 9/10 | 26 | 1.63 |
+| Phase 2 | 12 | 7/10 | 8/10 | 8/10 | 23 | 1.92 |
+| Phase 3 | 14 | 8/10 | 8/10 | 7/10 | 23 | 1.64 |
+| Phase 4 | 16 | 9/10 | 9/10 | 9/10 | 27 | 1.69 |
+| Phase 11 | 15 | 10/10 | 10/10 | 10/10 | 30 | 2.00 |
+| Phase 12 | 16 | 10/10 | 10/10 | 10/10 | 30 | 1.88 |
+
+**Recommendation:** Implement in order 1 → 2 → 3 → 4 → 11 (critical path for viability), then parallelize remaining layers.
+
+## Success Metrics
+
+### Performance Targets
+- 500 entities simulated simultaneously
+- 60 FPS on iPad Pro with full simulation load
+- Background entities consume <5% of frame budget
+- Tier transitions are visually seamless
+
+### Multiplayer Targets
+- 2-4 peers supported
+- <100ms sync latency for state propagation
+- Zero desyncs in 1-hour multiplayer sessions
+- Ownership transitions are seamless
+
+### Emergence Targets
+- Entities follow believable daily routines without hand-authoring
+- Relationships form and evolve through interaction
+- Institutions coordinate workers without explicit communication
+- Unexpected behaviors emerge from generic rule interactions
+
+## Implementation Notes
+
+1. **Incremental Development**: Each phase should be fully validated before moving to the next
+2. **Performance First**: Profile on iPad hardware from Phase 1 onwards
+3. **Test Emergent Behavior**: Create test scenarios that exercise rule interactions
+4. **Document Invariants**: Clear rules for how systems interact prevents bugs
+5. **Content Light**: Focus on generic rules that recombine, not hand-authored content
+6. **Multiplayer Always**: Test sync from Phase 1, don't bolt on later
+
+## Total Breakdown
+
+- **Total Points:** 171
+- **Total Phases:** 12
+- **Critical Path:** Phases 1, 2, 3, 4, 11, 12 (89 points)
+- **Parallel Work:** Phases 5-10 can be done alongside or after critical path (82 points)
--- a/docs/bevy-rendering-vendoring-breakdown.md
+++ b/docs/bevy-rendering-vendoring-breakdown.md
@@ -0,0 +1,268 @@
+# Bevy Rendering Vendoring - Task Breakdown
+
+**Epic:** Vendor Bevy Renderer and Eliminate Window Component Duplication (#2)
+**Overall Size:** XXL (~77 points across 5 phases)
+**Priority:** P2 (Medium - architectural improvement)
+
+This document breaks down the 5 phases into specific, sized tasks for prioritization and scheduling.
+
+---
+
+## Phase 1: Vendoring
+
+**Phase Goal:** Bring Bevy's rendering stack into Marathon codebase
+**Phase Size:** 21 points
+**Dependencies:** None (can start immediately)
+**Risk:** Medium (large code drop, potential API mismatches)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|--------|-----------|----------|
+| 1.1 | Vendor `bevy_render` core into `crates/libmarathon/src/render/` | L | 8 | ~15K LOC, complex module structure, need to preserve API surface | P2 |
+| 1.2 | Vendor `bevy_pbr` materials and lighting | M | 4 | Smaller than render core, well-isolated system | P2 |
+| 1.3 | Vendor `bevy_core_pipeline` | S | 2 | Thin abstraction layer over render core | P2 |
+| 1.4 | Vendor wgpu integration helpers | S | 2 | Limited surface area, mostly type wrappers | P2 |
+| 1.5 | Update `Cargo.toml` and remove Bevy renderer dependencies | XS | 1 | Straightforward dependency changes | P2 |
+| 1.6 | Verify existing rendering still works (smoke test) | M | 4 | Need to test all platforms, lighting, PBR materials | P1 |
+
+**Phase 1 Total:** 21 points
+
+### Lean Analysis
+- **Eliminate Waste:** Is vendoring 15K+ LOC necessary? YES - window state duplication is causing bugs
+- **Amplify Learning:** What will we learn? How deeply Bevy's renderer couples to Window components
+- **Deliver Fast:** Can we vendor incrementally? YES - by module (render, then pbr, then pipeline)
+- **Build Quality In:** Risk of introducing regressions? YES - comprehensive smoke testing critical
+
+### Phase 1 Recommendations
+1. **Do 1.1-1.5 as a batch** - vendoring is all-or-nothing, partial state is worse
+2. **Do 1.6 immediately after** - verify nothing broke before proceeding
+3. **Consider:** Create a feature flag `vendored-renderer` to toggle between vendored/upstream during transition
+
+---
+
+## Phase 2: Renderer Refactoring
+
+**Phase Goal:** Make renderer work with winit handles directly
+**Phase Size:** 24 points
+**Dependencies:** Phase 1 complete
+**Risk:** High (core renderer architecture changes)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|--------|-----------|----------|
+| 2.1 | Design `WindowInfo` abstraction for render queries | S | 2 | Clear API, minimal state | P1 |
+| 2.2 | Modify renderer initialization to accept winit handles | M | 4 | Need to trace through render graph setup | P1 |
+| 2.3 | Update `RawHandleWrapper` to provide window info | S | 2 | Add methods for size, scale_factor queries | P2 |
+| 2.4 | Refactor camera viewport calculations | M | 4 | Cameras need aspect ratio, DPI - multiple call sites | P1 |
+| 2.5 | Audit and update all window queries in render systems | L | 8 | Many systems query window, need comprehensive search | P1 |
+| 2.6 | Verify PBR materials work with new architecture | M | 4 | Test metallic/roughness, normal maps, AO | P1 |
+
+**Phase 2 Total:** 24 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we avoid refactoring everything? NO - window queries are scattered
+- **Amplify Learning:** Should we prototype WindowInfo first? YES - design task 2.1 is critical
+- **Decide Late:** Can we defer PBR verification? NO - it's core to the aesthetic
+- **Optimize Whole:** Does this improve both desktop and iOS? YES - fixes DPI bugs on both
+
+### Critical Path
+```
+2.1 (WindowInfo design)
+  ↓
+2.2 (renderer init) → 2.3 (RawHandleWrapper)
+  ↓
+2.4 (cameras) + 2.5 (audit systems)
+  ↓
+2.6 (PBR verification)
+```
+
+### Phase 2 Recommendations
+1. **Start with 2.1** - get design right before touching renderer
+2. **Parallelize 2.4 and 2.5** - different areas of codebase
+3. **Consider:** Keep old Window component code paths behind feature flag during transition
+
+---
+
+## Phase 3: Executor Cleanup
+
+**Phase Goal:** Remove duplicate Bevy Window components
+**Phase Size:** 10 points
+**Dependencies:** Phase 2 complete (renderer no longer needs Window components)
+**Risk:** Low (pure deletion once renderer is independent)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|--------|-----------|----------|
+| 3.1 | Remove `bevy::window::Window` creation from iOS executor | S | 2 | Delete code, verify iOS still builds | P1 |
+| 3.2 | Remove `bevy::window::Window` creation from desktop executor | S | 2 | Delete code, verify desktop still works | P1 |
+| 3.3 | Migrate window config to winit `WindowAttributes` | M | 4 | Some logic may have lived in Bevy window creation | P2 |
+| 3.4 | Remove `WindowMode` enum usage | XS | 1 | Straightforward deletion | P2 |
+| 3.5 | Clean up unused imports and dead code | XS | 1 | Cargo clippy + manual review | P3 |
+
+**Phase 3 Total:** 10 points
+
+### Lean Analysis
+- **Eliminate Waste:** This entire phase IS waste elimination - removing duplicate state
+- **Deliver Fast:** Can we do this immediately after Phase 2? YES - it's pure cleanup
+- **Build Quality In:** Risk of breaking something? LOW if Phase 2 is solid
+
+### Phase 3 Recommendations
+1. **Do 3.1 and 3.2 together** - both platforms should behave identically
+2. **Do 3.5 last** - easy win after harder work
+3. **Fast phase:** Mostly verification
+
+---
+
+## Phase 4: egui Integration
+
+**Phase Goal:** Ensure debug UI works with winit-only window state
+**Phase Size:** 7 points
+**Dependencies:** Phase 3 complete
+**Risk:** Low (egui is already vendored and working)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|--------|-----------|----------|
+| 4.1 | Update debug UI to query scale factor from winit | S | 2 | Replace any Bevy window queries | P1 |
+| 4.2 | Verify custom input system still works | S | 2 | Input already uses custom event buffer | P1 |
+| 4.3 | Test DPI scaling on HiDPI displays | S | 2 | Manual testing on Retina macOS + iPad | P1 |
+| 4.4 | Update debug UI documentation | XS | 1 | Reflect new architecture | P3 |
+
+**Phase 4 Total:** 7 points
+
+### Lean Analysis
+- **Amplify Learning:** Will this reveal DPI bugs? YES - explicit test for it
+- **Build Quality In:** Test HiDPI early? YES - that's what this phase is
+
+### Phase 4 Recommendations
+1. **Do 4.1-4.3 as quick verification** - egui should "just work" since we already vendored it
+2. **This is a checkpoint** - if 4.3 reveals DPI issues, they're from Phase 2/3
+
+---
+
+## Phase 5: Testing & Documentation
+
+**Phase Goal:** Comprehensive verification and knowledge capture
+**Phase Size:** 15 points
+**Dependencies:** Phases 1-4 complete
+**Risk:** Low (pure verification)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|--------|-----------|----------|
+| 5.1 | PBR materials test with low-poly assets | M | 4 | Create test scene, verify metallic/roughness | P1 |
+| 5.2 | Lighting system verification | M | 4 | Point, directional, spot lights + shadows | P1 |
+| 5.3 | Cross-platform testing battery | S | 2 | macOS desktop, macOS Retina, iOS device, iPad simulator | P1 |
+| 5.4 | Update architecture docs (RFC or new doc) | S | 2 | Explain window ownership, renderer changes | P2 |
+| 5.5 | Remove obsolete TODOs and comments | XS | 1 | Code archaeology, cleanup | P3 |
+| 5.6 | Create before/after architecture diagrams | S | 2 | Visual explanation for future contributors | P3 |
+
+**Phase 5 Total:** 15 points
+
+### Lean Analysis
+- **Amplify Learning:** Testing amplifies confidence, docs amplify knowledge transfer
+- **Build Quality In:** When should we test? CONTINUOUSLY, but this is final verification
+- **Eliminate Waste:** Are diagrams worth 2 points? YES if they prevent future confusion
+
+### Phase 5 Recommendations
+1. **Do 5.1-5.3 first** - verification before celebration
+2. **Do 5.4 immediately** - knowledge is fresh
+3. **Do 5.5-5.6 when inspired** - nice-to-haves, P3 priority
+
+---
+
+## Overall Scheduling Recommendations
+
+### Critical Path (Sequential)
+```
+Phase 1 → Phase 2 → Phase 3 → Phase 4 → Phase 5
+Total: ~77 points
+```
+
+### Parallel Opportunities
+- **During Phase 1:** Write Phase 2 design docs (2.1)
+- **During Phase 2:** Plan Phase 3 deletions
+- **During Phase 5:** Parallelize testing (5.1, 5.2, 5.3) if multiple devices available
+
+### Risk Mitigation Strategy
+1. **Phase 1.6 is a GO/NO-GO gate** - if smoke tests fail, stop and debug
+2. **Phase 2.1 design review** - get feedback on WindowInfo before implementing
+3. **Feature flags** - keep ability to toggle between old/new during Phases 1-3
+4. **Incremental commits** - don't batch entire phase into one PR
+
+---
+
+## WSJF Prioritization (Within P2 Tier)
+
+Scoring against other P2 work (hypothetical):
+
+| Item | Player Value | Time Criticality | Risk Reduction | CoD | Size | WSJF |
+|------|--------------|------------------|----------------|-----|------|------|
+| **Bevy Vendor Epic** | 4 | 2 | 8 | 14 | 77 | **0.18** |
+| Phase 1 alone | 3 | 2 | 9 | 14 | 21 | **0.67** |
+| Phase 2 alone | 6 | 3 | 9 | 18 | 24 | **0.75** |
+
+**Interpretation:**
+- **Low player value initially** - this is technical debt, not features
+- **High risk reduction** - fixes DPI bugs, enables future renderer work
+- **Do Phase 1 + 2 together** - they're meaningless separately
+- **Compare to:** Agent simulation (epic #5) likely has WSJF > 1.0, do that first if capacity allows
+
+---
+
+## Sequencing with Other Work
+
+### Good to do BEFORE this epic:
+- ✅ iOS deployment scripts (done)
+- ✅ Basic ECS setup (done)
+- Any small P1 bugs or quick wins
+
+### Good to do AFTER this epic:
+- Advanced rendering features (bloom, post-processing)
+- Agent simulation rendering (needs clean renderer)
+- Spatial audio visualization (uses renderer)
+
+### Can do IN PARALLEL:
+- Networking improvements (different subsystem)
+- Content creation (doesn't depend on window architecture)
+- Game design prototyping
+
+---
+
+## Decision Points
+
+### Before Starting Phase 1:
+- [ ] Do we have bandwidth for ~2 months of rendering work?
+- [ ] Are there higher-priority bugs blocking players/demos?
+- [ ] Have we validated PBR aesthetic matches Aspen vision?
+
+### Before Starting Phase 2:
+- [ ] Did Phase 1.6 smoke tests pass?
+- [ ] Do we understand all Window component usage?
+- [ ] Is WindowInfo design reviewed and approved?
+
+### Before Starting Phase 3:
+- [ ] Does renderer work 100% without Window components?
+- [ ] Have we tested on both iOS and desktop?
+
+### Before Closing Epic:
+- [ ] All platforms tested with HiDPI/Retina displays?
+- [ ] PBR materials look correct with low-poly assets?
+- [ ] Architecture docs updated?
+- [ ] Can we confidently say "winit is single source of truth"?
+
+---
+
+## Summary
+
+**Total Effort:** ~77 points (XXL epic)
+**Confidence:** Medium (vendoring is well-understood, refactoring has unknowns)
+**Recommendation:** Defer until higher-value work (agent simulation, core gameplay) is stable
+**When to do it:** When DPI bugs become P1, or when we need renderer extensibility
+
+This is important technical debt payoff but not immediately urgent. The current duplicate window state works, just inelegantly.
--- a/docs/spatial-audio-vendoring-breakdown.md
+++ b/docs/spatial-audio-vendoring-breakdown.md
@@ -0,0 +1,419 @@
+# Spatial Audio System - Task Breakdown
+
+**Epic:** Spatial Audio System (#4)
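+**Overall Size:** XXL+ (91+ points across 8 phases; note that the per-phase totals below sum to 135 points, so treat 91 as a lower bound until the estimates are reconciled)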
+**Priority:** P1 (High - core immersion feature)
+
+This document breaks down the 8 phases into specific, sized tasks for prioritization and scheduling.
+
+**Note:** We are re-implementing bevy_seedling and bevy_steam_audio, not forking them. We depend on the underlying libraries (Firewheel and Steam Audio) as external crates, but write our own integration code that follows Marathon's patterns and doesn't lag behind Bevy version updates.
+
+---
+
+## Phase 1: Implement Firewheel Integration
+
+**Phase Goal:** Re-implement bevy_seedling's Firewheel integration for Marathon
+**Phase Size:** 14 points
+**Dependencies:** None (can start immediately)
+**Risk:** Medium (lock-free audio graph integration is complex)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|--------|-----------|----------|
+| 1.1 | Add Firewheel dependency and create audio module structure | S | 2 | Add crate dependency, set up module hierarchy | P1 |
+| 1.2 | Implement audio graph initialization and lifecycle | M | 4 | Create graph, manage real-time thread, handle shutdown | P1 |
+| 1.3 | Create sample playback nodes and basic routing | M | 4 | Sampler nodes, gain nodes, basic graph connections | P1 |
+| 1.4 | Implement cpal audio output integration | S | 2 | Connect Firewheel graph to system audio output | P1 |
+| 1.5 | Verify basic playback works (smoke test) | S | 2 | Test on macOS and iOS, verify no glitches | P1 |
+
+**Phase 1 Total:** 14 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we use bevy_seedling directly? NO - lags Bevy updates, doesn't match Marathon patterns
+- **Amplify Learning:** What will we learn? How to integrate lock-free audio graphs with ECS
+- **Deliver Fast:** Can we implement incrementally? YES - basic playback first, then add features
+- **Build Quality In:** Risk of audio glitches? YES - comprehensive playback testing critical
+
+### Phase 1 Recommendations
+1. **Do 1.1-1.4 sequentially** - each builds on previous
+2. **Do 1.5 thoroughly** - verify no dropouts, glitches, or latency issues
+3. **Reference bevy_seedling** - use it as reference implementation, but write our own code
+
+---
+
+## Phase 2: Implement Steam Audio Integration
+
+**Phase Goal:** Re-implement bevy_steam_audio's Steam Audio integration for Marathon
+**Phase Size:** 18 points
+**Dependencies:** Phase 1 complete
+**Risk:** High (C++ bindings, HRTF complexity)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|--------|-----------|----------|
+| 2.1 | Add steam-audio dependency (audionimbus bindings) | S | 2 | Add crate dependency, verify C++ library linking | P1 |
+| 2.2 | Create Firewheel processor node for Steam Audio | L | 8 | Bridge between Firewheel and Steam Audio APIs, handle FFI safely | P1 |
+| 2.3 | Implement HRTF initialization with default dataset | M | 4 | Load MIT KEMAR HRTF, verify initialization | P1 |
+| 2.4 | Implement distance attenuation and air absorption | S | 2 | Basic spatial processing before HRTF | P1 |
+| 2.5 | Test binaural output with positioned source | S | 2 | Create test scene, verify left/right panning and elevation | P1 |
+
+**Phase 2 Total:** 18 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we use simpler panning? NO - HRTF is core to immersion
+- **Amplify Learning:** Should we prototype Steam Audio separately? YES - task 2.5 is critical learning
+- **Decide Late:** Can we defer HRTF? NO - it's foundational to spatial audio
+- **Optimize Whole:** Does this improve both iOS and macOS? YES - cross-platform from start
+
+### Critical Path
+```
+2.1 (add dependency)
+  ↓
+2.2 (Firewheel processor node) → 2.3 (HRTF init)
+  ↓
+2.4 (distance/air absorption)
+  ↓
+2.5 (binaural test)
+```
+
+### Phase 2 Recommendations
+1. **Start with 2.1-2.3 sequentially** - Steam Audio setup is delicate
+2. **Test heavily at 2.5** - spatial accuracy is mission-critical
+3. **Reference bevy_steam_audio** - use as reference for Steam Audio API usage
+
+---
+
+## Phase 3: Bevy Integration
+
+**Phase Goal:** Connect ECS components to audio graph
+**Phase Size:** 20 points
+**Dependencies:** Phase 2 complete
+**Risk:** Medium (lock-free sync between game thread and audio thread)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|--------|-----------|----------|
+| 3.1 | Create `AudioSource` and `AudioListener` components | S | 2 | Define component API, derive traits | P1 |
+| 3.2 | Implement position sync system (Transform → atomics) | L | 8 | Core sync logic, must be lock-free and glitch-free | P1 |
+| 3.3 | Implement component lifecycle (Added/Removed) | M | 4 | Handle entity spawn/despawn, cleanup nodes | P1 |
+| 3.4 | Create audio asset loading system | M | 4 | Decode audio files, integrate with Bevy assets | P1 |
+| 3.5 | Test with moving sources and listener | S | 2 | Verify Doppler-free position updates | P1 |
+
+**Phase 3 Total:** 20 points
+
+### Lean Analysis
+- **Eliminate Waste:** This IS the integration work - no waste
+- **Amplify Learning:** Will this reveal audio thread issues? YES - explicit test for it (3.5)
+- **Build Quality In:** Test concurrency early? YES - that's the whole phase
+- **Deliver Fast:** Can we ship without asset loading (3.4)? NO - need real audio files
+
+### Phase 3 Recommendations
+1. **Do 3.1 first** - API design gates everything else
+2. **3.2 is the critical path** - most complex, needs careful review
+3. **Do 3.5 extensively** - test on real hardware, listen for glitches
+
+---
+
+## Phase 4: Bus Mixer
+
+**Phase Goal:** Implement categorical bus-based mixing
+**Phase Size:** 15 points
+**Dependencies:** Phase 3 complete
+**Risk:** Low (straightforward audio routing)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|--------|-----------|----------|
+| 4.1 | Create `MixerState` resource with bus hierarchy | S | 2 | Define SFX/Ambient/Music/UI/Voice buses | P1 |
+| 4.2 | Implement bus Firewheel nodes (gain, EQ, sends) | L | 8 | Multiple node types, routing complexity | P1 |
+| 4.3 | Connect all sources to appropriate buses | S | 2 | Route AudioSource components by bus type | P2 |
+| 4.4 | Add master bus with limiting | S | 2 | Prevent clipping, add safety limiter | P1 |
+| 4.5 | Test bus gain changes propagate correctly | XS | 1 | Verify mixer controls work | P2 |
+
+**Phase 4 Total:** 15 points
+
+### Lean Analysis
+- **Eliminate Waste:** Do we need 5 buses initially? YES - categorical thinking is core
+- **Amplify Learning:** Can we defer EQ? NO - it's essential for professional mixing
+- **Build Quality In:** Is limiting necessary? YES - prevents painful clipping accidents
+- **Optimize Whole:** Does bus structure match sound design needs? YES - aligns with RFC requirements
+
+### Phase 4 Recommendations
+1. **4.1 and 4.2 together** - design and implementation are coupled
+2. **4.4 is critical** - limiter saves ears during development
+3. **Fast phase:** Mostly plumbing once Firewheel is solid
+
+---
+
+## Phase 5: Prioritization and Culling
+
+**Phase Goal:** Handle 200+ sources by prioritizing top 64
+**Phase Size:** 16 points
+**Dependencies:** Phase 4 complete
+**Risk:** Medium (performance-critical code path)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|--------|-----------|----------|
+| 5.1 | Implement priority scoring system | M | 4 | Distance, amplitude, bus type, recency factors | P1 |
+| 5.2 | Add distance and amplitude culling | S | 2 | Early exit for inaudible sources | P1 |
+| 5.3 | Enforce voice limit (64 simultaneous) | S | 2 | Sort by priority, take top N | P1 |
+| 5.4 | Optimize with spatial hashing | M | 4 | Fast neighbor queries for dense scenes | P2 |
+| 5.5 | Test with 200+ sources in dense scene | M | 4 | Create test scene, verify <1ms culling time | P1 |
+
+**Phase 5 Total:** 16 points
+
+### Lean Analysis
+- **Eliminate Waste:** Can we skip prioritization initially? NO - 200 sources will be muddy
+- **Amplify Learning:** What's the real voice limit? Test at 5.5 to find out
+- **Decide Late:** Can we defer spatial hashing (5.4)? YES if linear search is fast enough
+- **Optimize Whole:** Does this work for both desktop and iOS? YES - same culling logic
+
+### Phase 5 Recommendations
+1. **Do 5.1-5.3 first** - core prioritization logic
+2. **5.4 is optional optimization** - measure first, optimize if needed
+3. **5.5 is GO/NO-GO gate** - if performance fails, revisit 5.4
+
+---
+
+## Phase 6: Debug Visualization
+
+**Phase Goal:** Visual debugging of spatial audio sources
+**Phase Size:** 16 points
+**Dependencies:** Phase 5 complete (need full system working)
+**Risk:** Low (tooling, not core functionality)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|--------|-----------|----------|
+| 6.1 | Implement gizmo rendering for active sources | M | 4 | Sphere gizmos with falloff ranges | P1 |
+| 6.2 | Add color-coding by bus type | S | 2 | Visual differentiation of audio categories | P2 |
+| 6.3 | Implement amplitude animation (brightness pulse) | S | 2 | Visual feedback for sound intensity | P2 |
+| 6.4 | Add selection raycasting and inspector panel | M | 4 | Click source → show details in egui | P1 |
+| 6.5 | Add occlusion ray visualization | S | 2 | Green = clear, red = occluded | P2 |
+| 6.6 | Test on complex scene with 50+ sources | S | 2 | Verify visualization remains readable | P2 |
+
+**Phase 6 Total:** 16 points
+
+### Lean Analysis
+- **Eliminate Waste:** Is visualization necessary? YES - critical for debugging spatial audio
+- **Amplify Learning:** Will this reveal mix problems? YES - that's the purpose
+- **Build Quality In:** Should this be P1? YES for 6.1 and 6.4, others are polish
+- **Deliver Fast:** Can we ship minimal version? YES - 6.1 and 6.4 are essential, others are nice-to-have
+
+### Phase 6 Recommendations
+1. **Do 6.1 and 6.4 first** - core debug functionality
+2. **6.2, 6.3, 6.5 are polish** - do when inspired
+3. **This is a checkpoint** - use visualization to verify Phases 1-5 work correctly
+
+---
+
+## Phase 7: Mixer Panel
+
+**Phase Goal:** Professional mixing console in egui
+**Phase Size:** 22 points
+**Dependencies:** Phase 4 complete (needs mixer state)
+**Risk:** Low (UI work)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|--------|-----------|----------|
+| 7.1 | Implement egui mixer panel with channel strips | L | 8 | Layout 5 bus channels + master, faders, meters | P1 |
+| 7.2 | Add EQ controls (3-band, collapsible) | M | 4 | Low shelf, mid bell, high shelf UI | P2 |
+| 7.3 | Add solo/mute buttons | S | 2 | Isolation for debugging | P1 |
+| 7.4 | Implement metering (peak/RMS from audio thread) | M | 4 | Lock-free meter reads, visual bars | P1 |
+| 7.5 | Add LUFS integrated loudness meter | S | 2 | Master bus loudness monitoring | P3 |
+| 7.6 | Implement preset save/load (JSON) | S | 2 | Serialize mixer state, version control | P2 |
+
+**Phase 7 Total:** 22 points
+
+### Lean Analysis
+- **Eliminate Waste:** Do we need LUFS (7.5)? NO - defer to P3
+- **Amplify Learning:** Will this improve mix quality? YES - professional tools = professional results
+- **Build Quality In:** Is metering (7.4) essential? YES - you can't mix what you can't measure
+- **Deliver Fast:** What's minimum viable mixer? 7.1, 7.3, 7.4
+
+### Phase 7 Recommendations
+1. **Do 7.1 first** - foundation for all other tasks
+2. **7.3 and 7.4 immediately** - essential for mixing
+3. **7.2 and 7.6 are P2** - important but not blocking
+4. **7.5 is P3** - nice-to-have professional feature
+
+---
+
+## Phase 8: Soundscape Zones
+
+**Phase Goal:** Layered ambient audio zones
+**Phase Size:** 14 points
+**Dependencies:** Phase 3 complete (needs component system)
+**Risk:** Medium (complex activation logic)
+
+### Tasks
+
+| # | Task | Size | Points | Rationale | Priority |
+|---|------|------|--------|-----------|----------|
+| 8.1 | Implement `SoundscapeZone` component | S | 2 | Define zone shapes, layers, fade distance | P1 |
+| 8.2 | Add zone activation system (listener position) | M | 4 | Track listener, activate/deactivate zones | P1 |
+| 8.3 | Implement crossfading between overlapping zones | M | 4 | Smooth transitions, prevent popping | P1 |
+| 8.4 | Add randomized layer playback | S | 2 | Occasional sounds (birds, creaks) with random timing | P2 |
+| 8.5 | Test with 10+ overlapping zones | S | 2 | Verify performance, smooth crossfades | P1 |
+
+**Phase 8 Total:** 14 points
+
+### Lean Analysis
+- **Eliminate Waste:** Are zones necessary? YES - essential for hyper-dense soundscapes
+- **Amplify Learning:** Will this reveal performance issues? YES - test at 8.5
+- **Build Quality In:** Is crossfading (8.3) critical? YES - popping is unacceptable
+- **Optimize Whole:** Does this work with prioritization (Phase 5)? YES - zones create sources that get prioritized
+
+### Phase 8 Recommendations
+1. **Do 8.1-8.3 sequentially** - core zone system
+2. **8.4 is nice-to-have** - adds realism but not essential
+3. **8.5 is verification** - test in realistic scenario
+
+---
+
+## Overall Scheduling Recommendations
+
+### Critical Path (Sequential)
+```
+Phase 1 → Phase 2 → Phase 3 → Phase 4 → Phase 5
+                        ↓
+                  Phase 6 (can parallelize with Phase 7)
+                        ↓
+                  Phase 7 → Phase 8
+Total: ~91 points
+```
+
+### Parallel Opportunities
+- **During Phase 1:** Design Phase 3 component API
+- **During Phase 5:** Build Phase 6 visualization (uses same source data)
+- **After Phase 4:** Phases 6, 7, 8 can partially overlap (different subsystems)
+
+### Risk Mitigation Strategy
+1. **Phase 1.5 is a GO/NO-GO gate** - if basic playback fails, stop and debug
+2. **Phase 2.5 spatial accuracy test** - verify HRTF works before proceeding
+3. **Phase 3.5 concurrency test** - ensure lock-free sync works flawlessly
+4. **Phase 5.5 performance test** - verify 200+ sources cull to 64 in <1ms
+5. **Incremental commits** - don't batch entire phase into one PR
+
+---
+
+## WSJF Prioritization (P1 Tier)
+
+Scoring against other P1 work:
+
+| Item | Player Value | Time Criticality | Risk Reduction | CoD | Size | WSJF |
+|------|--------------|------------------|----------------|-----|------|------|
+| **Spatial Audio Epic** | 9 | 6 | 7 | 22 | 91 | **0.24** |
+| Phase 1 alone | 3 | 4 | 8 | 15 | 14 | **1.07** |
+| Phase 1-3 together | 8 | 5 | 8 | 21 | 52 | **0.40** |
+| Phases 1-5 (core) | 9 | 6 | 8 | 23 | 83 | **0.28** |
+
+**Interpretation:**
+- **High player value** - spatial audio is core immersion feature
+- **High time criticality** - needed for demos and content creation
+- **High risk reduction** - re-implementing the integrations in-house eliminates dependency lag
+- **Phases 1-3 are foundation** - nothing works without them
+- **Phases 6-8 are tooling** - can defer slightly if needed
+
+---
+
+## Sequencing with Other Work
+
+### Good to do BEFORE this epic:
+- ✅ iOS deployment scripts (done)
+- ✅ Basic ECS setup (done)
+- Bevy rendering vendoring (provides debugging context)
+
+### Good to do AFTER this epic:
+- Agent ambient sounds (depends on spatial audio)
+- Environmental soundscapes (depends on zones)
+- Dialogue system (depends on Voice bus)
+- Music system (depends on mixer)
+
+### Can do IN PARALLEL:
+- Content creation (3D assets, animations)
+- Networking improvements (different subsystem)
+- Game design prototyping (can use placeholder audio)
+
+---
+
+## Decision Points
+
+### Before Starting Phase 1:
+- [ ] Do we have bandwidth for ~3 months of audio work?
+- [ ] Are there higher-priority P1 bugs blocking demos?
+- [ ] Have we validated spatial audio is essential to Aspen vision?
+
+### Before Starting Phase 2:
+- [ ] Did Phase 1.5 smoke tests pass on both iOS and macOS?
+- [ ] Do we understand Firewheel's lock-free guarantees?
+- [ ] Is Steam Audio C++ library compatible with iOS?
+
+### Before Starting Phase 3:
+- [ ] Does binaural output sound correct? (Phase 2.5)
+- [ ] Have we tested with headphones/earbuds on device?
+- [ ] Do we understand the game thread → audio thread sync pattern?
+
+### Before Starting Phase 4:
+- [ ] Are moving sources glitch-free? (Phase 3.5)
+- [ ] Have we tested with 10+ simultaneous sources?
+
+### Before Starting Phase 5:
+- [ ] Does bus routing work correctly? (Phase 4.5)
+- [ ] Have we identified the voice limit threshold?
+
+### Before Starting Phases 6-8 (Tooling):
+- [ ] Is core spatial audio (Phases 1-5) solid?
+- [ ] Have we tested on both iOS and macOS extensively?
+- [ ] Can we demo spatial audio to stakeholders?
+
+### Before Closing Epic:
+- [ ] All platforms tested with 64+ voices?
+- [ ] Spatial accuracy test passed (blindfolded pointing <15° error)?
+- [ ] Mix quality validated by professional sound engineer?
+- [ ] Performance test passed (<2ms audio thread on M1 iPad)?
+- [ ] Documentation updated (API docs, audio design guidelines)?
+
+---
+
+## Minimum Viable Implementation
+
+If we need to ship faster, the **minimum viable spatial audio** is:
+
+**Phases 1-5 only** (83 points)
+- Firewheel and Steam Audio integration
+- ECS integration
+- Bus mixer
+- Prioritization
+
+This provides:
+- 3D positioned audio with HRTF
+- Bus-based mixing
+- Voice limiting for performance
+
+**Deferred to later:**
+- Debug visualization (Phase 6)
+- Mixer panel UI (Phase 7)
+- Soundscape zones (Phase 8)
+
+**Trade-off:** Harder to debug and mix without tooling, but core spatial audio works.
+
+---
+
+## Summary
+
+**Total Effort:** ~91 points (XXL epic)
+**Confidence:** Medium (audio is complex, owning the integration code reduces some risk)
+**Recommendation:** High priority - spatial audio is core to Aspen's immersion
+**When to do it:** After basic rendering is stable, before content creation ramps up
+
+This is essential for Aspen's "sense of place" design pillar. Unlike the Bevy renderer epic (P2 technical debt), spatial audio is P1 player-facing immersion.