fix research agent hang: per-agent timeout + startup cleanup

Research agents now have a 2-minute timeout via tokio::time::timeout, so a hung Mistral API call can no longer block Sol's entire sync loop. Timed-out agents return partial results instead of hanging forever. On startup, Sol detects research sessions left with status='running' by previous crashes and marks them as failed. Adds 6 new tests covering the full research session lifecycle: create, append findings, complete, fail, hung cleanup, and partial findings survival.
2026-03-23 09:03:03 +00:00
parent 447bead0b7
commit 567d4c1171
3 changed files with 145 additions and 14 deletions
--- a/src/main.rs
+++ b/src/main.rs
@@ -276,6 +276,16 @@ async fn main() -> anyhow::Result<()> {
        }
    }

+    // Clean up hung research sessions from previous runs
+    let hung_sessions = store.load_running_research_sessions();
+    if !hung_sessions.is_empty() {
+        info!(count = hung_sessions.len(), "Found hung research sessions — marking as failed");
+        for (session_id, _room_id, query, _findings) in &hung_sessions {
+            warn!(session_id = session_id.as_str(), query = query.as_str(), "Cleaning up hung research session");
+            store.fail_research_session(session_id);
+        }
+    }
+
    // Backfill reactions from Matrix room timelines
    info!("Backfilling reactions from room timelines...");
    if let Err(e) = backfill_reactions(&matrix_client, &state.indexer).await {