fix(wfe-kubernetes): wait for PVC binding before creating Job
The storage provisioner (Longhorn) needs a few seconds to create and attach the volume. Previously the Job was created immediately after the PVC, so the scheduler rejected the pod with 'unbound immediate PersistentVolumeClaims'. Now `ensure_shared_volume_pvc` polls every 2s until the PVC reaches the Bound phase (up to 120s) before returning, and returns an error if the PVC never binds. This commit also removes the early exit on PodScheduled=False in `wait_for_pod_running`, since transient scheduling failures (such as unbound PVCs) resolve on their own and should not be treated as fatal; genuinely stuck pods are left to the existing timeout.
This commit is contained in:
@@ -83,17 +83,11 @@ pub async fn wait_for_pod_running(
|
||||
}
|
||||
}
|
||||
}
|
||||
if let Some(conditions) = &status.conditions {
|
||||
for cond in conditions {
|
||||
if cond.type_ == "PodScheduled" && cond.status == "False" {
|
||||
if let Some(ref msg) = cond.message {
|
||||
return Err(WfeError::StepExecution(format!(
|
||||
"pod '{pod_name}' scheduling failed: {msg}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Note: we intentionally do NOT treat PodScheduled=False
|
||||
// as a fatal error here. Transient scheduling failures
|
||||
// (e.g. "unbound PersistentVolumeClaims") resolve once
|
||||
// the storage provisioner finishes. Let the timeout
|
||||
// handle genuinely stuck pods instead of failing early.
|
||||
}
|
||||
}
|
||||
Err(kube::Error::Api(err)) if err.code == 404 => {}
|
||||
|
||||
@@ -76,13 +76,35 @@ pub async fn ensure_shared_volume_pvc(
|
||||
};
|
||||
|
||||
match api.create(&PostParams::default(), &pvc).await {
|
||||
Ok(_) => Ok(()),
|
||||
Ok(_) => {}
|
||||
// Another step created it between our get and create — also fine.
|
||||
Err(kube::Error::Api(err)) if err.code == 409 => Ok(()),
|
||||
Err(e) => Err(WfeError::StepExecution(format!(
|
||||
"failed to create shared-volume PVC '{name}' in '{namespace}': {e}"
|
||||
))),
|
||||
Err(kube::Error::Api(err)) if err.code == 409 => {}
|
||||
Err(e) => {
|
||||
return Err(WfeError::StepExecution(format!(
|
||||
"failed to create shared-volume PVC '{name}' in '{namespace}': {e}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for the PVC to be bound before returning. Storage provisioners
|
||||
// (e.g. Longhorn) need a few seconds to create and attach the volume.
|
||||
// If we return immediately the Job's pod is created while the PVC is
|
||||
// still Pending, and the scheduler rejects it with "unbound immediate
|
||||
// PersistentVolumeClaims".
|
||||
for _ in 0..60 {
|
||||
if let Ok(pvc) = api.get(name).await {
|
||||
if let Some(status) = &pvc.status {
|
||||
if status.phase.as_deref() == Some("Bound") {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
|
||||
}
|
||||
|
||||
Err(WfeError::StepExecution(format!(
|
||||
"shared-volume PVC '{name}' in '{namespace}' was not bound within 120s"
|
||||
)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
Reference in New Issue
Block a user