🔊(summary) increase transcription Celery task logging verbosity

Add detailed logging for owner ID, recording metadata, and processing context in transcription tasks to improve debugging capabilities. It was especially important to get the created document id, so when having trouble with the docs API, I could share with them the newly created documents being impacted.
2025-10-23 05:59:59 +02:00
parent 6cd54f7e1e
commit 990507e3c7
1 changed files with 28 additions and 12 deletions
--- a/src/summary/summary/core/celery_worker.py
+++ b/src/summary/summary/core/celery_worker.py
@@ -233,11 +233,16 @@ def process_audio_transcribe_summarize_v2(
    3. Sends the results via webhook

    """
-    logger.info("Notification received")
-    logger.debug("filename: %s", filename)
+    logger.info(
+        "Notification received | Owner: %s | Room: %s",
+        owner_id,
+        room,
+    )

    task_id = self.request.id

+    logger.info("Download recording | Filename: %s", filename)
+
    minio_client = Minio(
        settings.aws_s3_endpoint_url,
        access_key=settings.aws_s3_access_key_id,
@@ -278,7 +283,9 @@ def process_audio_transcribe_summarize_v2(
    )

    try:
-        logger.info("Querying transcription …")
+        logger.info(
+            "Querying transcription for %s seconds of audio …", audio_file.info.length
+        )
        transcription_start_time = time.time()
        with open(temp_file_path, "rb") as audio_file:
            transcription = whisperx_client.audio.transcriptions.create(
@@ -286,15 +293,13 @@ def process_audio_transcribe_summarize_v2(
                file=audio_file,
                language=settings.whisperx_default_language,
            )
+
+            transcription_time = round(time.time() - transcription_start_time, 2)
            metadata_manager.track(
                task_id,
-                {
-                    "transcription_time": round(
-                        time.time() - transcription_start_time, 2
-                    )
-                },
+                {"transcription_time": transcription_time},
            )
-            logger.info("Transcription received.")
+            logger.info("Transcription received in %s seconds.", transcription_time)
            logger.debug("Transcription: \n %s", transcription)
    finally:
        if os.path.exists(temp_file_path):
@@ -329,8 +334,19 @@ def process_audio_transcribe_summarize_v2(

    response = post_with_retries(settings.webhook_url, data)

-    logger.info("Webhook submitted successfully. Status: %s", response.status_code)
-    logger.debug("Response body: %s", response.text)
+    try:
+        response_data = response.json()
+        document_id = response_data.get("id", "N/A")
+    except (json.JSONDecodeError, AttributeError):
+        document_id = "Unable to parse response"
+        response_data = response.text
+
+    logger.info(
+        "Webhook success | Document %s submitted (HTTP %s)",
+        document_id,
+        response.status_code,
+    )
+    logger.debug("Full response: %s", response_data)

    metadata_manager.capture(task_id, settings.posthog_event_success)

@@ -344,7 +360,7 @@ def process_audio_transcribe_summarize_v2(
            queue=settings.summarize_queue,
        )
    else:
-        logger.info("Summary generation not enabled for this user.")
+        logger.info("Summary generation not enabled for this user. Skipping.")


@signals.task_prerun.connect(sender=process_audio_transcribe_summarize_v2)