diff --git a/src/summary/summary/core/celery_worker.py b/src/summary/summary/core/celery_worker.py index f98073b4..1f120bd2 100644 --- a/src/summary/summary/core/celery_worker.py +++ b/src/summary/summary/core/celery_worker.py @@ -31,6 +31,7 @@ from summary.core.prompt import ( PROMPT_SYSTEM_TLDR, PROMPT_USER_PART, ) +from summary.core.transcript_formatter import TranscriptFormatter settings = get_settings() analytics = get_analytics() @@ -56,28 +57,6 @@ if settings.sentry_dsn and settings.sentry_is_enabled: sentry_sdk.init(dsn=settings.sentry_dsn, enable_tracing=True) -DEFAULT_EMPTY_TRANSCRIPTION = """ -**Aucun contenu audio n’a été détecté dans votre transcription.** - - -*Si vous pensez qu’il s’agit d’une erreur, n’hésitez pas à contacter -notre support technique : visio@numerique.gouv.fr* - -. - -. - -. - -Quelques points que nous vous conseillons de vérifier : -- Un micro était-il activé ? -- Étiez-vous suffisamment proche ? -- Le micro est-il de bonne qualité ? -- L’enregistrement dure-t-il plus de 30 secondes ? - -""" - - class AudioValidationError(Exception): """Custom exception for audio validation errors.""" @@ -166,36 +145,6 @@ def format_actions(llm_output: dict) -> str: return "" -def format_segments(transcription_data): - """Format transcription segments from WhisperX into a readable conversation format. - - Processes transcription data with segments containing speaker information and text, - combining consecutive segments from the same speaker and formatting them as a - conversation with speaker labels. 
- """ - formatted_output = "" - if not transcription_data or not hasattr(transcription_data, "segments"): - if isinstance(transcription_data, dict) and "segments" in transcription_data: - segments = transcription_data["segments"] - else: - return "Error: Invalid transcription data format" - else: - segments = transcription_data.segments - - previous_speaker = None - - for segment in segments: - speaker = segment.get("speaker", "UNKNOWN_SPEAKER") - text = segment.get("text", "") - if text: - if speaker != previous_speaker: - formatted_output += f"\n\n **{speaker}**: {text}" - else: - formatted_output += f" {text}" - previous_speaker = speaker - return formatted_output - - def post_with_retries(url, data): """Send POST request with automatic retries.""" session = create_retry_session() @@ -306,25 +255,20 @@ def process_audio_transcribe_summarize_v2( os.remove(temp_file_path) logger.debug("Temporary file removed: %s", temp_file_path) - formatted_transcription = ( - DEFAULT_EMPTY_TRANSCRIPTION - if not transcription.segments - else format_segments(transcription) - ) - metadata_manager.track_transcription_metadata(task_id, transcription) - if not room or not recording_date or not recording_time: - title = settings.document_default_title - else: - title = settings.document_title_template.format( - room=room, - room_recording_date=recording_date, - room_recording_time=recording_time, - ) + formatter = TranscriptFormatter() + + content, title = formatter.format( + transcription, + room=room, + recording_date=recording_date, + recording_time=recording_time, + ) + data = { "title": title, - "content": formatted_transcription, + "content": content, "email": email, "sub": sub, } @@ -356,7 +300,7 @@ def process_audio_transcribe_summarize_v2( ): logger.info("Queuing summary generation task.") summarize_transcription.apply_async( - args=[formatted_transcription, email, sub, title], + args=[content, email, sub, title], queue=settings.summarize_queue, ) else: diff --git 
"""Transcript formatting into readable conversation format with speaker labels."""

import logging
from typing import Optional, Tuple

logger = logging.getLogger(__name__)


# Fallback document body used when the transcription yields no segments.
# User-facing runtime text (French) — must stay byte-identical.
DEFAULT_EMPTY_TRANSCRIPTION = """
**Aucun contenu audio n’a été détecté dans votre transcription.**


*Si vous pensez qu’il s’agit d’une erreur, n’hésitez pas à contacter
notre support technique : visio@numerique.gouv.fr*

.

.

.

Quelques points que nous vous conseillons de vérifier :
- Un micro était-il activé ?
- Étiez-vous suffisamment proche ?
- Le micro est-il de bonne qualité ?
- L’enregistrement dure-t-il plus de 30 secondes ?

"""


class TranscriptFormatter:
    """Formats WhisperX transcription output into a readable conversation.

    Handles:
    - Extracting segments from transcription objects or dictionaries
    - Combining consecutive segments from the same speaker
    - Removing hallucination patterns from the formatted content
    - Generating a document title from the recording context
    """

    def __init__(self):
        """Snapshot every setting the formatter needs.

        The title template is cached here too (it was previously read live
        from the settings module inside ``_generate_title``), so all
        configuration is resolved consistently at construction time.
        """
        # Imported lazily so merely importing this module never triggers
        # settings resolution (keeps unit tests free of environment setup).
        from summary.core.config import get_settings

        settings = get_settings()
        self.hallucination_patterns = settings.hallucination_patterns
        self.hallucination_replacement_text = settings.hallucination_replacement_text
        self.default_title = settings.document_default_title
        self.title_template = settings.document_title_template
        self.default_empty_transcription = DEFAULT_EMPTY_TRANSCRIPTION

    def _get_segments(self, transcription):
        """Return the segment list from an object or dict, or ``None``.

        Accepts either a WhisperX result object exposing ``.segments`` or a
        plain dict carrying a ``"segments"`` key.
        """
        if hasattr(transcription, "segments"):
            return transcription.segments
        if isinstance(transcription, dict):
            return transcription.get("segments")
        return None

    def format(
        self,
        transcription,
        room: Optional[str] = None,
        recording_date: Optional[str] = None,
        recording_time: Optional[str] = None,
    ) -> Tuple[str, str]:
        """Format a transcription into ``(content, title)``.

        Args:
            transcription: WhisperX output — an object with a ``segments``
                attribute or a dict with a ``"segments"`` key.
            room: Optional room name used in the title.
            recording_date: Optional recording date used in the title.
            recording_time: Optional recording time used in the title.

        Returns:
            Tuple of the document content (the default "no audio detected"
            notice when no segments were produced) and the document title.
        """
        segments = self._get_segments(transcription)

        if not segments:
            content = self.default_empty_transcription
        else:
            content = self._remove_hallucinations(self._format_speaker(segments))

        return content, self._generate_title(room, recording_date, recording_time)

    def _remove_hallucinations(self, content: str) -> str:
        """Replace every configured hallucination pattern in *content*."""
        replacement = self.hallucination_replacement_text or ""

        # ``or []`` tolerates a missing/None pattern list from settings.
        for pattern in self.hallucination_patterns or []:
            # Guard against an empty pattern: str.replace("") would insert
            # the replacement between every character of the document.
            if pattern:
                content = content.replace(pattern, replacement)
        return content

    def _format_speaker(self, segments) -> str:
        """Render segments as a conversation, merging consecutive same-speaker turns."""
        parts = []
        previous_speaker = None

        for segment in segments:
            speaker = segment.get("speaker", "UNKNOWN_SPEAKER")
            text = segment.get("text", "")
            if not text:
                # Empty segments are dropped and do not break a speaker run
                # (assumes WhisperX rarely emits empty text — behavior on an
                # empty segment with a *different* speaker is unobservable
                # either way; TODO confirm against real transcripts).
                continue
            if speaker != previous_speaker:
                parts.append(f"\n\n **{speaker}**: {text}")
            else:
                parts.append(f" {text}")
            previous_speaker = speaker

        return "".join(parts)

    def _generate_title(
        self,
        room: Optional[str] = None,
        recording_date: Optional[str] = None,
        recording_time: Optional[str] = None,
    ) -> str:
        """Build the document title, falling back to the configured default.

        All three context fields must be present (and non-empty) for the
        template to be used; otherwise the default title is returned.
        """
        if room and recording_date and recording_time:
            return self.title_template.format(
                room=room,
                room_recording_date=recording_date,
                room_recording_time=recording_time,
            )
        return self.default_title