fix(wfe-kubernetes): wait for PVC binding before creating Job

The storage provisioner (Longhorn) needs a few seconds to create and
attach the volume. Previously the Job was created immediately after
the PVC, and the scheduler rejected the pod with 'unbound immediate
PersistentVolumeClaims'. Now ensure_shared_volume_pvc polls every 2s
until the PVC reaches Bound status (up to 60 attempts, roughly 120s)
and returns a StepExecution error if the PVC is still unbound after that.

Also removes the early exit on PodScheduled=False in wait_for_pod_running,
since transient scheduling failures (like unbound PVCs) resolve on
their own and shouldn't be treated as fatal. Genuinely stuck pods are
still caught by the wait timeout instead of failing early.
This commit is contained in:
2026-04-09 19:48:13 +01:00
parent 6a4aada4bf
commit 5de18f4532
2 changed files with 32 additions and 16 deletions

View File

@@ -83,17 +83,11 @@ pub async fn wait_for_pod_running(
}
}
}
if let Some(conditions) = &status.conditions {
for cond in conditions {
if cond.type_ == "PodScheduled" && cond.status == "False" {
if let Some(ref msg) = cond.message {
return Err(WfeError::StepExecution(format!(
"pod '{pod_name}' scheduling failed: {msg}"
)));
}
}
}
}
// Note: we intentionally do NOT treat PodScheduled=False
// as a fatal error here. Transient scheduling failures
// (e.g. "unbound PersistentVolumeClaims") resolve once
// the storage provisioner finishes. Let the timeout
// handle genuinely stuck pods instead of failing early.
}
}
Err(kube::Error::Api(err)) if err.code == 404 => {}

View File

@@ -76,13 +76,35 @@ pub async fn ensure_shared_volume_pvc(
};
match api.create(&PostParams::default(), &pvc).await {
Ok(_) => Ok(()),
Ok(_) => {}
// Another step created it between our get and create — also fine.
Err(kube::Error::Api(err)) if err.code == 409 => Ok(()),
Err(e) => Err(WfeError::StepExecution(format!(
"failed to create shared-volume PVC '{name}' in '{namespace}': {e}"
))),
Err(kube::Error::Api(err)) if err.code == 409 => {}
Err(e) => {
return Err(WfeError::StepExecution(format!(
"failed to create shared-volume PVC '{name}' in '{namespace}': {e}"
)));
}
}
// Wait for the PVC to be bound before returning. Storage provisioners
// (e.g. Longhorn) need a few seconds to create and attach the volume.
// If we return immediately the Job's pod is created while the PVC is
// still Pending, and the scheduler rejects it with "unbound immediate
// PersistentVolumeClaims".
for _ in 0..60 {
if let Ok(pvc) = api.get(name).await {
if let Some(status) = &pvc.status {
if status.phase.as_deref() == Some("Bound") {
return Ok(());
}
}
}
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
}
Err(WfeError::StepExecution(format!(
"shared-volume PVC '{name}' in '{namespace}' was not bound within 120s"
)))
}
#[cfg(test)]