fix research agent hang: per-agent timeout + startup cleanup

research agents now have a 2-minute timeout via tokio::time::timeout.
a hung Mistral API call can no longer block Sol's entire sync loop.
timed-out agents return partial results instead of hanging forever.

on startup, Sol detects research sessions left with status='running' by
previous crashes and marks them as failed. adds 6 new tests covering the
full research session lifecycle: create, append findings, complete,
fail, hung cleanup, and partial findings survival.
This commit is contained in:
2026-03-23 09:03:03 +00:00
parent 447bead0b7
commit 567d4c1171
3 changed files with 145 additions and 14 deletions

View File

@@ -276,6 +276,16 @@ async fn main() -> anyhow::Result<()> {
}
}
// Clean up hung research sessions from previous runs
let hung_sessions = store.load_running_research_sessions();
if !hung_sessions.is_empty() {
info!(count = hung_sessions.len(), "Found hung research sessions — marking as failed");
for (session_id, _room_id, query, _findings) in &hung_sessions {
warn!(session_id = session_id.as_str(), query = query.as_str(), "Cleaning up hung research session");
store.fail_research_session(session_id);
}
}
// Backfill reactions from Matrix room timelines
info!("Backfilling reactions from room timelines...");
if let Err(e) = backfill_reactions(&matrix_client, &state.indexer).await {