fix research agent hang: per-agent timeout + startup cleanup
Research agents now have a 2-minute timeout via tokio::time::timeout, so a hung Mistral API call can no longer block Sol's entire sync loop. Timed-out agents return partial results instead of hanging forever. On startup, Sol detects research sessions left with status='running' by previous crashes and marks them as failed. Adds 6 new tests covering the full research session lifecycle: create, append findings, complete, fail, hung cleanup, and partial findings survival.
This commit is contained in:
10
src/main.rs
10
src/main.rs
@@ -276,6 +276,16 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up hung research sessions from previous runs
|
||||
let hung_sessions = store.load_running_research_sessions();
|
||||
if !hung_sessions.is_empty() {
|
||||
info!(count = hung_sessions.len(), "Found hung research sessions — marking as failed");
|
||||
for (session_id, _room_id, query, _findings) in &hung_sessions {
|
||||
warn!(session_id = session_id.as_str(), query = query.as_str(), "Cleaning up hung research session");
|
||||
store.fail_research_session(session_id);
|
||||
}
|
||||
}
|
||||
|
||||
// Backfill reactions from Matrix room timelines
|
||||
info!("Backfilling reactions from room timelines...");
|
||||
if let Err(e) = backfill_reactions(&matrix_client, &state.indexer).await {
|
||||
|
||||
Reference in New Issue
Block a user