fix(wfe-kubernetes): wait for PVC binding before creating Job
The storage provisioner (Longhorn) needs a few seconds to create and attach the volume. Previously, the Job was created immediately after the PVC, and the scheduler rejected the pod with 'unbound immediate PersistentVolumeClaims'. Now `ensure_shared_volume_pvc` polls every 2s until the PVC reaches Bound status (up to 120s) before returning, and returns an error if the PVC is still not bound after that. This commit also removes the early exit on PodScheduled=False in `wait_for_pod_running`, since transient scheduling failures (like unbound PVCs) resolve on their own and shouldn't be treated as fatal; genuinely stuck pods are left to the wait timeout.
This commit is contained in:
@@ -83,17 +83,11 @@ pub async fn wait_for_pod_running(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if let Some(conditions) = &status.conditions {
|
// Note: we intentionally do NOT treat PodScheduled=False
|
||||||
for cond in conditions {
|
// as a fatal error here. Transient scheduling failures
|
||||||
if cond.type_ == "PodScheduled" && cond.status == "False" {
|
// (e.g. "unbound PersistentVolumeClaims") resolve once
|
||||||
if let Some(ref msg) = cond.message {
|
// the storage provisioner finishes. Let the timeout
|
||||||
return Err(WfeError::StepExecution(format!(
|
// handle genuinely stuck pods instead of failing early.
|
||||||
"pod '{pod_name}' scheduling failed: {msg}"
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(kube::Error::Api(err)) if err.code == 404 => {}
|
Err(kube::Error::Api(err)) if err.code == 404 => {}
|
||||||
|
|||||||
@@ -76,13 +76,35 @@ pub async fn ensure_shared_volume_pvc(
|
|||||||
};
|
};
|
||||||
|
|
||||||
match api.create(&PostParams::default(), &pvc).await {
|
match api.create(&PostParams::default(), &pvc).await {
|
||||||
Ok(_) => Ok(()),
|
Ok(_) => {}
|
||||||
// Another step created it between our get and create — also fine.
|
// Another step created it between our get and create — also fine.
|
||||||
Err(kube::Error::Api(err)) if err.code == 409 => Ok(()),
|
Err(kube::Error::Api(err)) if err.code == 409 => {}
|
||||||
Err(e) => Err(WfeError::StepExecution(format!(
|
Err(e) => {
|
||||||
"failed to create shared-volume PVC '{name}' in '{namespace}': {e}"
|
return Err(WfeError::StepExecution(format!(
|
||||||
))),
|
"failed to create shared-volume PVC '{name}' in '{namespace}': {e}"
|
||||||
|
)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Wait for the PVC to be bound before returning. Storage provisioners
|
||||||
|
// (e.g. Longhorn) need a few seconds to create and attach the volume.
|
||||||
|
// If we return immediately the Job's pod is created while the PVC is
|
||||||
|
// still Pending, and the scheduler rejects it with "unbound immediate
|
||||||
|
// PersistentVolumeClaims".
|
||||||
|
for _ in 0..60 {
|
||||||
|
if let Ok(pvc) = api.get(name).await {
|
||||||
|
if let Some(status) = &pvc.status {
|
||||||
|
if status.phase.as_deref() == Some("Bound") {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(WfeError::StepExecution(format!(
|
||||||
|
"shared-volume PVC '{name}' in '{namespace}' was not bound within 120s"
|
||||||
|
)))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|||||||
Reference in New Issue
Block a user