diff --git a/wfe-kubernetes/src/logs.rs b/wfe-kubernetes/src/logs.rs index e4a6642..768be5c 100644 --- a/wfe-kubernetes/src/logs.rs +++ b/wfe-kubernetes/src/logs.rs @@ -83,17 +83,11 @@ pub async fn wait_for_pod_running( } } } - if let Some(conditions) = &status.conditions { - for cond in conditions { - if cond.type_ == "PodScheduled" && cond.status == "False" { - if let Some(ref msg) = cond.message { - return Err(WfeError::StepExecution(format!( - "pod '{pod_name}' scheduling failed: {msg}" - ))); - } - } - } - } + // Note: we intentionally do NOT treat PodScheduled=False + // as a fatal error here. Transient scheduling failures + // (e.g. "unbound PersistentVolumeClaims") resolve once + // the storage provisioner finishes. Let the timeout + // handle genuinely stuck pods instead of failing early. } } Err(kube::Error::Api(err)) if err.code == 404 => {} diff --git a/wfe-kubernetes/src/pvc.rs b/wfe-kubernetes/src/pvc.rs index f8f2ca7..c2472c3 100644 --- a/wfe-kubernetes/src/pvc.rs +++ b/wfe-kubernetes/src/pvc.rs @@ -76,13 +76,35 @@ pub async fn ensure_shared_volume_pvc( }; match api.create(&PostParams::default(), &pvc).await { - Ok(_) => Ok(()), + Ok(_) => {} // Another step created it between our get and create — also fine. - Err(kube::Error::Api(err)) if err.code == 409 => Ok(()), - Err(e) => Err(WfeError::StepExecution(format!( - "failed to create shared-volume PVC '{name}' in '{namespace}': {e}" - ))), + Err(kube::Error::Api(err)) if err.code == 409 => {} + Err(e) => { + return Err(WfeError::StepExecution(format!( + "failed to create shared-volume PVC '{name}' in '{namespace}': {e}" + ))); + } } + + // Wait for the PVC to be bound before returning. Storage provisioners + // (e.g. Longhorn) need a few seconds to create and attach the volume. + // If we return immediately the Job's pod is created while the PVC is + // still Pending, and the scheduler rejects it with "unbound immediate + // PersistentVolumeClaims". + for _ in 0..60 { + if let Ok(pvc) = api.get(name).await { + if let Some(status) = &pvc.status { + if status.phase.as_deref() == Some("Bound") { + return Ok(()); + } + } + } + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + } + + Err(WfeError::StepExecution(format!( + "shared-volume PVC '{name}' in '{namespace}' was not bound within 120s" + ))) } #[cfg(test)]