From 990507e3c7f06bb8b0aebde3c68f16a2ba072af9 Mon Sep 17 00:00:00 2001
From: lebaudantoine <lebaud.antoine131@gmail.com>
Date: Thu, 23 Oct 2025 05:59:59 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=94=8A(summary)=20increase=20transcriptio?=
 =?UTF-8?q?n=20Celery=20task=20logging=20verbosity?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add detailed logging for owner ID, recording metadata, and
processing context in transcription tasks to improve debugging
capabilities.

Logging the ID of the created document was especially important:
when there is trouble with the docs API, I can share the impacted,
newly created documents with its maintainers.
---
 src/summary/summary/core/celery_worker.py | 40 ++++++++++++++++-------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/src/summary/summary/core/celery_worker.py b/src/summary/summary/core/celery_worker.py
index baced13e..f936a44f 100644
--- a/src/summary/summary/core/celery_worker.py
+++ b/src/summary/summary/core/celery_worker.py
@@ -233,11 +233,16 @@ def process_audio_transcribe_summarize_v2(
     3. Sends the results via webhook
 
     """
-    logger.info("Notification received")
-    logger.debug("filename: %s", filename)
+    logger.info(
+        "Notification received | Owner: %s | Room: %s",
+        owner_id,
+        room,
+    )
 
     task_id = self.request.id
 
+    logger.info("Download recording | Filename: %s", filename)
+
     minio_client = Minio(
         settings.aws_s3_endpoint_url,
         access_key=settings.aws_s3_access_key_id,
@@ -278,7 +283,9 @@ def process_audio_transcribe_summarize_v2(
     )
 
     try:
-        logger.info("Querying transcription …")
+        logger.info(
+            "Querying transcription for %s seconds of audio …", audio_file.info.length
+        )
         transcription_start_time = time.time()
         with open(temp_file_path, "rb") as audio_file:
             transcription = whisperx_client.audio.transcriptions.create(
@@ -286,15 +293,13 @@ def process_audio_transcribe_summarize_v2(
                 file=audio_file,
                 language=settings.whisperx_default_language,
             )
+
+            transcription_time = round(time.time() - transcription_start_time, 2)
             metadata_manager.track(
                 task_id,
-                {
-                    "transcription_time": round(
-                        time.time() - transcription_start_time, 2
-                    )
-                },
+                {"transcription_time": transcription_time},
             )
-            logger.info("Transcription received.")
+            logger.info("Transcription received in %s seconds.", transcription_time)
             logger.debug("Transcription: \n %s", transcription)
     finally:
         if os.path.exists(temp_file_path):
@@ -329,8 +334,19 @@ def process_audio_transcribe_summarize_v2(
 
     response = post_with_retries(settings.webhook_url, data)
 
-    logger.info("Webhook submitted successfully. Status: %s", response.status_code)
-    logger.debug("Response body: %s", response.text)
+    try:
+        response_data = response.json()
+        document_id = response_data.get("id", "N/A")
+    except (json.JSONDecodeError, AttributeError):
+        document_id = "Unable to parse response"
+        response_data = response.text
+
+    logger.info(
+        "Webhook success | Document %s submitted (HTTP %s)",
+        document_id,
+        response.status_code,
+    )
+    logger.debug("Full response: %s", response_data)
 
     metadata_manager.capture(task_id, settings.posthog_event_success)
 
@@ -344,7 +360,7 @@ def process_audio_transcribe_summarize_v2(
             queue=settings.summarize_queue,
         )
     else:
-        logger.info("Summary generation not enabled for this user.")
+        logger.info("Summary generation not enabled for this user. Skipping.")
 
 
 @signals.task_prerun.connect(sender=process_audio_transcribe_summarize_v2)