feat: add OpenTelemetry tracing support behind otel feature flag

- Add tracing::instrument spans to executor (workflow.execute) and host (workflow.start, event.publish, event.process)
- Add otel feature flag to wfe-core and wfe crates
- Add wfe/src/otel.rs helper for OTLP exporter initialization
- Dependencies: tracing-opentelemetry, opentelemetry, opentelemetry_sdk, opentelemetry-otlp (all optional behind otel feature)
- Step execution stays at info level, executor internals at debug
2026-03-25 20:41:34 +00:00
parent bd51517e9f
commit c8582eb514
5 changed files with 73 additions and 1 deletions
--- a/wfe-core/Cargo.toml
+++ b/wfe-core/Cargo.toml
@@ -8,8 +8,11 @@ description = "Core traits, models, builder, and executor for the WFE workflow e
 [features]
 default = []
 test-support = []
+otel = ["tracing-opentelemetry", "opentelemetry"]

 [dependencies]
+tracing-opentelemetry = { workspace = true, optional = true }
+opentelemetry = { workspace = true, optional = true }
 tokio = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
--- a/wfe-core/src/executor/workflow_executor.rs
+++ b/wfe-core/src/executor/workflow_executor.rs
@@ -59,6 +59,15 @@ impl WorkflowExecutor {
    /// 6. Check for completion
    /// 7. Persist
    /// 8. Release lock
+    #[tracing::instrument(
+        name = "workflow.execute",
+        skip(self, definition, step_registry),
+        fields(
+            workflow.id = %workflow_id,
+            workflow.definition_id,
+            workflow.status,
+        )
+    )]
    pub async fn execute(
        &self,
        workflow_id: &str,
@@ -96,6 +105,8 @@ impl WorkflowExecutor {
            .get_workflow_instance(workflow_id)
            .await?;

+        tracing::Span::current().record("workflow.definition_id", workflow.workflow_definition_id.as_str());
+
        if workflow.status != WorkflowStatus::Runnable {
            debug!(workflow_id, status = ?workflow.status, "Workflow not runnable, skipping");
            return Ok(());
@@ -170,6 +181,15 @@ impl WorkflowExecutor {
            // Now we can mutate again since context is dropped.
            match step_result {
                Ok(result) => {
+                    let step_status = if result.sleep_for.is_some() {
+                        "sleeping"
+                    } else if result.event_name.is_some() {
+                        "waiting_for_event"
+                    } else {
+                        "completed"
+                    };
+                    tracing::Span::current().record("step.status", step_status);
+
                    info!(
                        workflow_id,
                        step_id,
@@ -202,6 +222,7 @@ impl WorkflowExecutor {
                Err(e) => {
                    // f. Handle error.
                    let error_msg = e.to_string();
+                    tracing::Span::current().record("step.status", "failed");
                    warn!(workflow_id, step_id, error = %error_msg, "Step execution failed");

                    let pointer_id = workflow.execution_pointers[idx].id.clone();
@@ -253,6 +274,8 @@ impl WorkflowExecutor {
            workflow.complete_time = Some(Utc::now());
        }

+        tracing::Span::current().record("workflow.status", tracing::field::debug(&workflow.status));
+
        // Determine next_execution.
        let has_active = workflow.execution_pointers.iter().any(|p| p.active);
        if has_active {