5 Commits

Author SHA1 Message Date
5de18f4532 fix(wfe-kubernetes): wait for PVC binding before creating Job
The storage provisioner (Longhorn) needs a few seconds to create and
attach the volume. Previously the Job was created immediately after
the PVC, and the scheduler rejected the pod with 'unbound immediate
PersistentVolumeClaims'. Now ensure_shared_volume_pvc polls every 2s until
the PVC reaches Bound status (up to 120s) before returning.

Also removes the early-exit on PodScheduled=False in wait_for_pod_running
since transient scheduling failures (like unbound PVCs) resolve on
their own and shouldn't be treated as fatal.
2026-04-09 19:48:13 +01:00
6a4aada4bf fix(workflows.yaml): set shared volume to 15Gi 2026-04-09 19:40:38 +01:00
b51d34093f fix(wfe-kubernetes): aggregate sub-workflow logs under root workflow ID
wfectl logs <ci-name> returned nothing because each sub-workflow's
step logged under its own UUID. The LogStore is keyed by workflow_id,
so querying the parent ci ID found zero entries. Now the step passes
root_workflow_id (the top-level ancestor, falling back to the workflow's
own ID for root workflows) to stream_logs, so all sub-workflow output
aggregates under the ci run the user actually started.
2026-04-09 18:38:41 +01:00
275664256d fix(workflows.yaml): pull_policy Always for wfe-ci:latest 2026-04-09 18:32:55 +01:00
322b9ec2c8 fix(ci): add rustfmt + clippy to wfe-ci image 2026-04-09 17:26:09 +01:00
5 changed files with 47 additions and 20 deletions

View File

@@ -43,8 +43,8 @@ ARG TEA_VERSION=0.11.0
RUN curl -fsSL "https://gitea.com/gitea/tea/releases/download/v${TEA_VERSION}/tea-${TEA_VERSION}-linux-amd64" \ RUN curl -fsSL "https://gitea.com/gitea/tea/releases/download/v${TEA_VERSION}/tea-${TEA_VERSION}-linux-amd64" \
-o /usr/local/bin/tea && chmod +x /usr/local/bin/tea -o /usr/local/bin/tea && chmod +x /usr/local/bin/tea
# llvm tools (needed by cargo-llvm-cov) # Rust components for CI lint + coverage
RUN rustup component add llvm-tools-preview RUN rustup component add llvm-tools-preview rustfmt clippy
# Sccache wrapper config — expects SCCACHE_S3_ENDPOINT, SCCACHE_BUCKET, etc. via env. # Sccache wrapper config — expects SCCACHE_S3_ENDPOINT, SCCACHE_BUCKET, etc. via env.
ENV RUSTC_WRAPPER=/usr/local/cargo/bin/sccache \ ENV RUSTC_WRAPPER=/usr/local/cargo/bin/sccache \

View File

@@ -83,17 +83,11 @@ pub async fn wait_for_pod_running(
} }
} }
} }
if let Some(conditions) = &status.conditions { // Note: we intentionally do NOT treat PodScheduled=False
for cond in conditions { // as a fatal error here. Transient scheduling failures
if cond.type_ == "PodScheduled" && cond.status == "False" { // (e.g. "unbound PersistentVolumeClaims") resolve once
if let Some(ref msg) = cond.message { // the storage provisioner finishes. Let the timeout
return Err(WfeError::StepExecution(format!( // handle genuinely stuck pods instead of failing early.
"pod '{pod_name}' scheduling failed: {msg}"
)));
}
}
}
}
} }
} }
Err(kube::Error::Api(err)) if err.code == 404 => {} Err(kube::Error::Api(err)) if err.code == 404 => {}

View File

@@ -76,13 +76,35 @@ pub async fn ensure_shared_volume_pvc(
}; };
match api.create(&PostParams::default(), &pvc).await { match api.create(&PostParams::default(), &pvc).await {
Ok(_) => Ok(()), Ok(_) => {}
// Another step created it between our get and create — also fine. // Another step created it between our get and create — also fine.
Err(kube::Error::Api(err)) if err.code == 409 => Ok(()), Err(kube::Error::Api(err)) if err.code == 409 => {}
Err(e) => Err(WfeError::StepExecution(format!( Err(e) => {
"failed to create shared-volume PVC '{name}' in '{namespace}': {e}" return Err(WfeError::StepExecution(format!(
))), "failed to create shared-volume PVC '{name}' in '{namespace}': {e}"
)));
}
} }
// Wait for the PVC to be bound before returning. Storage provisioners
// (e.g. Longhorn) need a few seconds to create and attach the volume.
// If we return immediately the Job's pod is created while the PVC is
// still Pending, and the scheduler rejects it with "unbound immediate
// PersistentVolumeClaims".
for _ in 0..60 {
if let Ok(pvc) = api.get(name).await {
if let Some(status) = &pvc.status {
if status.phase.as_deref() == Some("Bound") {
return Ok(());
}
}
}
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
}
Err(WfeError::StepExecution(format!(
"shared-volume PVC '{name}' in '{namespace}' was not bound within 120s"
)))
} }
#[cfg(test)] #[cfg(test)]

View File

@@ -232,13 +232,22 @@ impl KubernetesStep {
wait_for_pod_running(client, namespace, &pod_name).await?; wait_for_pod_running(client, namespace, &pod_name).await?;
// 8. Stream logs + capture stdout. // 8. Stream logs + capture stdout.
// Store logs under the root workflow ID so `wfectl logs <ci-name>`
// aggregates output from all sub-workflows in the tree. Without
// this, each sub-workflow's logs are siloed under its own UUID
// and the user sees nothing when querying the parent.
let log_workflow_id = context
.workflow
.root_workflow_id
.as_deref()
.unwrap_or(workflow_id);
let stdout = stream_logs( let stdout = stream_logs(
client, client,
namespace, namespace,
&pod_name, &pod_name,
step_name, step_name,
definition_id, definition_id,
workflow_id, log_workflow_id,
context.step.id, context.step.id,
context.log_sink, context.log_sink,
) )

View File

@@ -60,6 +60,7 @@ _templates:
# arrays, and other bashisms the default `/bin/sh` (dash) doesn't support. # arrays, and other bashisms the default `/bin/sh` (dash) doesn't support.
ci_config: &ci_config ci_config: &ci_config
image: src.sunbeam.pt/studio/wfe-ci:latest image: src.sunbeam.pt/studio/wfe-ci:latest
pull_policy: Always
shell: /bin/bash shell: /bin/bash
memory: 4Gi memory: 4Gi
cpu: "2" cpu: "2"
@@ -69,6 +70,7 @@ _templates:
# Default config for long-running CI steps (8Gi memory, 60min timeout). # Default config for long-running CI steps (8Gi memory, 60min timeout).
ci_long_config: &ci_long_config ci_long_config: &ci_long_config
image: src.sunbeam.pt/studio/wfe-ci:latest image: src.sunbeam.pt/studio/wfe-ci:latest
pull_policy: Always
shell: /bin/bash shell: /bin/bash
memory: 8Gi memory: 8Gi
cpu: "4" cpu: "4"
@@ -735,7 +737,7 @@ workflows:
# to fit a full `target/` build + sccache copy with headroom. # to fit a full `target/` build + sccache copy with headroom.
shared_volume: shared_volume:
mount_path: /workspace mount_path: /workspace
size: 30Gi size: 15Gi
inputs: inputs:
repo_url: string repo_url: string
commit_sha: string commit_sha: string