5 Commits

Author SHA1 Message Date
5de18f4532 fix(wfe-kubernetes): wait for PVC binding before creating Job
The storage provisioner (Longhorn) needs a few seconds to create and
attach the volume. Previously the Job was created immediately after
the PVC, and the scheduler rejected the pod with 'unbound immediate
PersistentVolumeClaims'. Now ensure_shared_volume_pvc polls every 2s until
the PVC reaches Bound status (up to 120s) before returning.

Also removes the early-exit on PodScheduled=False in wait_for_pod_running
since transient scheduling failures (like unbound PVCs) resolve on
their own and shouldn't be treated as fatal.
2026-04-09 19:48:13 +01:00
6a4aada4bf fix(workflows.yaml): set shared volume to 15Gi 2026-04-09 19:40:38 +01:00
b51d34093f fix(wfe-kubernetes): aggregate sub-workflow logs under root workflow ID
wfectl logs <ci-name> returned nothing because each sub-workflow's
step logged under its own UUID. The LogStore is keyed by workflow_id,
so querying the parent ci ID found zero entries. Now the step passes
root_workflow_id (the top-level ancestor, falling back to the workflow's
own ID for root workflows) to stream_logs, so all sub-workflow output
aggregates under the ci run the user actually started.
2026-04-09 18:38:41 +01:00
275664256d fix(workflows.yaml): pull_policy Always for wfe-ci:latest 2026-04-09 18:32:55 +01:00
322b9ec2c8 fix(ci): add rustfmt + clippy to wfe-ci image 2026-04-09 17:26:09 +01:00
5 changed files with 47 additions and 20 deletions

View File

@@ -43,8 +43,8 @@ ARG TEA_VERSION=0.11.0
RUN curl -fsSL "https://gitea.com/gitea/tea/releases/download/v${TEA_VERSION}/tea-${TEA_VERSION}-linux-amd64" \ RUN curl -fsSL "https://gitea.com/gitea/tea/releases/download/v${TEA_VERSION}/tea-${TEA_VERSION}-linux-amd64" \
-o /usr/local/bin/tea && chmod +x /usr/local/bin/tea -o /usr/local/bin/tea && chmod +x /usr/local/bin/tea
# llvm tools (needed by cargo-llvm-cov) # Rust components for CI lint + coverage
RUN rustup component add llvm-tools-preview RUN rustup component add llvm-tools-preview rustfmt clippy
# Sccache wrapper config — expects SCCACHE_S3_ENDPOINT, SCCACHE_BUCKET, etc. via env. # Sccache wrapper config — expects SCCACHE_S3_ENDPOINT, SCCACHE_BUCKET, etc. via env.
ENV RUSTC_WRAPPER=/usr/local/cargo/bin/sccache \ ENV RUSTC_WRAPPER=/usr/local/cargo/bin/sccache \

View File

@@ -83,17 +83,11 @@ pub async fn wait_for_pod_running(
} }
} }
} }
if let Some(conditions) = &status.conditions { // Note: we intentionally do NOT treat PodScheduled=False
for cond in conditions { // as a fatal error here. Transient scheduling failures
if cond.type_ == "PodScheduled" && cond.status == "False" { // (e.g. "unbound PersistentVolumeClaims") resolve once
if let Some(ref msg) = cond.message { // the storage provisioner finishes. Let the timeout
return Err(WfeError::StepExecution(format!( // handle genuinely stuck pods instead of failing early.
"pod '{pod_name}' scheduling failed: {msg}"
)));
}
}
}
}
} }
} }
Err(kube::Error::Api(err)) if err.code == 404 => {} Err(kube::Error::Api(err)) if err.code == 404 => {}

View File

@@ -76,13 +76,35 @@ pub async fn ensure_shared_volume_pvc(
}; };
match api.create(&PostParams::default(), &pvc).await { match api.create(&PostParams::default(), &pvc).await {
Ok(_) => Ok(()), Ok(_) => {}
// Another step created it between our get and create — also fine. // Another step created it between our get and create — also fine.
Err(kube::Error::Api(err)) if err.code == 409 => Ok(()), Err(kube::Error::Api(err)) if err.code == 409 => {}
Err(e) => Err(WfeError::StepExecution(format!( Err(e) => {
"failed to create shared-volume PVC '{name}' in '{namespace}': {e}" return Err(WfeError::StepExecution(format!(
))), "failed to create shared-volume PVC '{name}' in '{namespace}': {e}"
)));
}
} }
// Wait for the PVC to be bound before returning. Storage provisioners
// (e.g. Longhorn) need a few seconds to create and attach the volume.
// If we return immediately the Job's pod is created while the PVC is
// still Pending, and the scheduler rejects it with "unbound immediate
// PersistentVolumeClaims".
for _ in 0..60 {
if let Ok(pvc) = api.get(name).await {
if let Some(status) = &pvc.status {
if status.phase.as_deref() == Some("Bound") {
return Ok(());
}
}
}
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
}
Err(WfeError::StepExecution(format!(
"shared-volume PVC '{name}' in '{namespace}' was not bound within 120s"
)))
} }
#[cfg(test)] #[cfg(test)]

View File

@@ -232,13 +232,22 @@ impl KubernetesStep {
wait_for_pod_running(client, namespace, &pod_name).await?; wait_for_pod_running(client, namespace, &pod_name).await?;
// 8. Stream logs + capture stdout. // 8. Stream logs + capture stdout.
// Store logs under the root workflow ID so `wfectl logs <ci-name>`
// aggregates output from all sub-workflows in the tree. Without
// this, each sub-workflow's logs are siloed under its own UUID
// and the user sees nothing when querying the parent.
let log_workflow_id = context
.workflow
.root_workflow_id
.as_deref()
.unwrap_or(workflow_id);
let stdout = stream_logs( let stdout = stream_logs(
client, client,
namespace, namespace,
&pod_name, &pod_name,
step_name, step_name,
definition_id, definition_id,
workflow_id, log_workflow_id,
context.step.id, context.step.id,
context.log_sink, context.log_sink,
) )

View File

@@ -60,6 +60,7 @@ _templates:
# arrays, and other bashisms the default `/bin/sh` (dash) doesn't support. # arrays, and other bashisms the default `/bin/sh` (dash) doesn't support.
ci_config: &ci_config ci_config: &ci_config
image: src.sunbeam.pt/studio/wfe-ci:latest image: src.sunbeam.pt/studio/wfe-ci:latest
pull_policy: Always
shell: /bin/bash shell: /bin/bash
memory: 4Gi memory: 4Gi
cpu: "2" cpu: "2"
@@ -69,6 +70,7 @@ _templates:
# Default config for long-running CI steps (8Gi memory, 60min timeout). # Default config for long-running CI steps (8Gi memory, 60min timeout).
ci_long_config: &ci_long_config ci_long_config: &ci_long_config
image: src.sunbeam.pt/studio/wfe-ci:latest image: src.sunbeam.pt/studio/wfe-ci:latest
pull_policy: Always
shell: /bin/bash shell: /bin/bash
memory: 8Gi memory: 8Gi
cpu: "4" cpu: "4"
@@ -735,7 +737,7 @@ workflows:
# to fit a full `target/` build + sccache copy with headroom. # to fit a full `target/` build + sccache copy with headroom.
shared_volume: shared_volume:
mount_path: /workspace mount_path: /workspace
size: 30Gi size: 15Gi
inputs: inputs:
repo_url: string repo_url: string
commit_sha: string commit_sha: string