diff --git a/CHANGELOG.md b/CHANGELOG.md index ef302839..f14b8291 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to ### Added - 👷(docker) add arm64 platform support for image builds +- ✨(summary) add localization support for transcription context text ### Changed diff --git a/gitlint/gitlint_emoji.py b/gitlint/gitlint_emoji.py index 59c86eaf..fe3243da 100644 --- a/gitlint/gitlint_emoji.py +++ b/gitlint/gitlint_emoji.py @@ -2,6 +2,7 @@ Gitlint extra rule to validate that the message title is of the form "() " """ + from __future__ import unicode_literals import re diff --git a/src/backend/core/recording/event/notification.py b/src/backend/core/recording/event/notification.py index 761a7d79..916c51d0 100644 --- a/src/backend/core/recording/event/notification.py +++ b/src/backend/core/recording/event/notification.py @@ -167,6 +167,7 @@ class NotificationService: owner_access.user.timezone ).strftime("%H:%M"), "download_link": f"{get_recording_download_base_url()}/{recording.id}", + "context_language": owner_access.user.language, } headers = { diff --git a/src/summary/summary/api/route/tasks.py b/src/summary/summary/api/route/tasks.py index 9d6f73d5..7117f6db 100644 --- a/src/summary/summary/api/route/tasks.py +++ b/src/summary/summary/api/route/tasks.py @@ -15,8 +15,8 @@ from summary.core.config import get_settings settings = get_settings() -class TaskCreation(BaseModel): - """Task data.""" +class TranscribeSummarizeTaskCreation(BaseModel): + """Transcription and summarization parameters.""" owner_id: str filename: str @@ -28,6 +28,7 @@ class TaskCreation(BaseModel): recording_time: Optional[str] language: Optional[str] download_link: Optional[str] + context_language: Optional[str] = None @field_validator("language") @classmethod @@ -45,8 +46,8 @@ router = APIRouter(prefix="/tasks") @router.post("/") -async def create_task(request: TaskCreation): - """Create a task.""" +async def create_transcribe_summarize_task(request: 
TranscribeSummarizeTaskCreation): + """Create a transcription and summarization task.""" task = process_audio_transcribe_summarize_v2.apply_async( args=[ request.owner_id, @@ -59,6 +60,7 @@ async def create_task(request: TaskCreation): request.recording_time, request.language, request.download_link, + request.context_language, ], queue=settings.transcribe_queue, ) diff --git a/src/summary/summary/core/celery_worker.py b/src/summary/summary/core/celery_worker.py index 91da516c..0f1131a9 100644 --- a/src/summary/summary/core/celery_worker.py +++ b/src/summary/summary/core/celery_worker.py @@ -18,6 +18,7 @@ from summary.core.analytics import MetadataManager, get_analytics from summary.core.config import get_settings from summary.core.file_service import FileService, FileServiceException from summary.core.llm_service import LLMException, LLMObservability, LLMService +from summary.core.locales import get_locale from summary.core.prompt import ( FORMAT_NEXT_STEPS, FORMAT_PLAN, @@ -121,6 +122,7 @@ def process_audio_transcribe_summarize_v2( recording_time: Optional[str], language: Optional[str], download_link: Optional[str], + context_language: Optional[str] = None, ): """Process an audio file by transcribing it and generating a summary. @@ -129,6 +131,19 @@ def process_audio_transcribe_summarize_v2( 2. Transcribes the audio using WhisperX model 3. Sends the results via webhook + Args: + self: Celery task instance (passed in with bind=True). + owner_id: Unique identifier of the recording owner. + filename: Name of the audio file in MinIO storage. + email: Email address of the recording owner. + sub: OIDC subject identifier of the recording owner. + received_at: Unix timestamp when the recording was received. + room: Name of the room where the recording took place. + recording_date: Date of the recording (localized display string). + recording_time: Time of the recording (localized display string). + language: ISO 639-1 language code for transcription. 
+ download_link: URL to download the original recording. + context_language: ISO 639-1 language code of the meeting summary context text. """ logger.info( "Notification received | Owner: %s | Room: %s", @@ -145,6 +160,7 @@ def process_audio_transcribe_summarize_v2( max_retries=settings.whisperx_max_retries, ) + # Transcription try: with ( file_service.prepare_audio_file(filename) as (audio_file, metadata), @@ -183,7 +199,10 @@ def process_audio_transcribe_summarize_v2( metadata_manager.track_transcription_metadata(task_id, transcription) - formatter = TranscriptFormatter() + # For locale of context, use in decreasing priority context_language, + # language (of meeting), default context language + locale = get_locale(context_language, language) + formatter = TranscriptFormatter(locale) content, title = formatter.format( transcription, @@ -221,6 +240,7 @@ def process_audio_transcribe_summarize_v2( metadata_manager.capture(task_id, settings.posthog_event_success) + # LLM Summarization if ( analytics.is_feature_enabled("summary-enabled", distinct_id=owner_id) and settings.is_summary_enabled @@ -336,9 +356,7 @@ def summarize_transcription( summary = tldr + "\n\n" + cleaned_summary + "\n\n" + next_steps data = { - "title": settings.summary_title_template.format( - title=title, - ), + "title": settings.summary_title_template.format(title=title), "content": summary, "email": email, "sub": sub, diff --git a/src/summary/summary/core/config.py b/src/summary/summary/core/config.py index e826d90d..1af91ec5 100644 --- a/src/summary/summary/core/config.py +++ b/src/summary/summary/core/config.py @@ -1,7 +1,7 @@ """Application configuration and settings.""" from functools import lru_cache -from typing import Annotated, List, Optional, Set +from typing import Annotated, List, Literal, Optional, Set from fastapi import Depends from pydantic import SecretStr @@ -51,7 +51,6 @@ class Settings(BaseSettings): # Transcription processing hallucination_patterns: List[str] = ["Vap'n'Roll 
Thierry"] - hallucination_replacement_text: str = "[Texte impossible à transcrire]" # Webhook-related settings webhook_max_retries: int = 2 @@ -60,11 +59,10 @@ class Settings(BaseSettings): webhook_api_token: SecretStr webhook_url: str + # Locale + default_context_language: Literal["de", "en", "fr", "nl"] = "fr" + # Output related settings - document_default_title: Optional[str] = "Transcription" - document_title_template: Optional[str] = ( - 'Réunion "{room}" du {room_recording_date} à {room_recording_time}' - ) summary_title_template: Optional[str] = "Résumé de {title}" # Summary related settings diff --git a/src/summary/summary/core/locales/__init__.py b/src/summary/summary/core/locales/__init__.py new file mode 100644 index 00000000..89dfea46 --- /dev/null +++ b/src/summary/summary/core/locales/__init__.py @@ -0,0 +1,30 @@ +"""Locale support for the summary service.""" + +from typing import Optional + +from summary.core.config import get_settings +from summary.core.locales import de, en, fr, nl +from summary.core.locales.strings import LocaleStrings + +_LOCALES = {"fr": fr, "en": en, "de": de, "nl": nl} + + +def get_locale(*languages: Optional[str]) -> LocaleStrings: + """Return locale strings for the first matching language candidate. + + Accept language codes in decreasing priority order and return the + locale for the first one that matches a known locale. + Fall back to the configured default_context_language. + """ + for lang in languages: + if not lang: + continue + if lang in _LOCALES: + return _LOCALES[lang].STRINGS + + # Provide fallback for longer formats of ISO 639-1 (e.g. 
"en-au" -> "en") + base_lang = lang.split("-")[0] + if base_lang in _LOCALES: + return _LOCALES[base_lang].STRINGS + + return _LOCALES[get_settings().default_context_language].STRINGS diff --git a/src/summary/summary/core/locales/de.py b/src/summary/summary/core/locales/de.py new file mode 100644 index 00000000..ae792b6a --- /dev/null +++ b/src/summary/summary/core/locales/de.py @@ -0,0 +1,34 @@ +"""German locale strings.""" + +from summary.core.locales.strings import LocaleStrings + +STRINGS = LocaleStrings( + empty_transcription=""" +**In Ihrer Transkription wurde kein Audioinhalt erkannt.** + +*Wenn Sie glauben, dass es sich um einen Fehler handelt, zögern Sie nicht, +unseren technischen Support zu kontaktieren: visio@numerique.gouv.fr* + +. + +. + +. + +Einige Punkte, die wir Ihnen empfehlen zu überprüfen: +- War ein Mikrofon aktiviert? +- Waren Sie nah genug am Mikrofon? +- Ist das Mikrofon von guter Qualität? +- Dauert die Aufnahme länger als 30 Sekunden? + +""", + download_header_template=( + "\n*Laden Sie Ihre Aufnahme herunter, " + "indem Sie [diesem Link folgen]({download_link})*\n" + ), + hallucination_replacement_text="[Text konnte nicht transkribiert werden]", + document_default_title="Transkription", + document_title_template=( + 'Besprechung "{room}" am {room_recording_date} um {room_recording_time}' + ), +) diff --git a/src/summary/summary/core/locales/en.py b/src/summary/summary/core/locales/en.py new file mode 100644 index 00000000..6534e924 --- /dev/null +++ b/src/summary/summary/core/locales/en.py @@ -0,0 +1,33 @@ +"""English locale strings.""" + +from summary.core.locales.strings import LocaleStrings + +STRINGS = LocaleStrings( + empty_transcription=""" +**No audio content was detected in your transcription.** + +*If you believe this is an error, please do not hesitate to contact +our technical support: visio@numerique.gouv.fr* + +. + +. + +. + +A few things we recommend you check: +- Was a microphone enabled? 
+- Were you close enough to the microphone? +- Is the microphone of good quality? +- Is the recording longer than 30 seconds? + +""", + download_header_template=( + "\n*Download your recording by [following this link]({download_link})*\n" + ), + hallucination_replacement_text="[Unable to transcribe text]", + document_default_title="Transcription", + document_title_template=( + 'Meeting "{room}" on {room_recording_date} at {room_recording_time}' + ), +) diff --git a/src/summary/summary/core/locales/fr.py b/src/summary/summary/core/locales/fr.py new file mode 100644 index 00000000..48a1f00e --- /dev/null +++ b/src/summary/summary/core/locales/fr.py @@ -0,0 +1,33 @@ +"""French locale strings (default).""" + +from summary.core.locales.strings import LocaleStrings + +STRINGS = LocaleStrings( + empty_transcription=""" +**Aucun contenu audio n'a été détecté dans votre transcription.** + +*Si vous pensez qu'il s'agit d'une erreur, n'hésitez pas à contacter +notre support technique : visio@numerique.gouv.fr* + +. + +. + +. + +Quelques points que nous vous conseillons de vérifier : +- Un micro était-il activé ? +- Étiez-vous suffisamment proche ? +- Le micro est-il de bonne qualité ? +- L'enregistrement dure-t-il plus de 30 secondes ? 
+ +""", + download_header_template=( + "\n*Télécharger votre enregistrement en [suivant ce lien]({download_link})*\n" + ), + hallucination_replacement_text="[Texte impossible à transcrire]", + document_default_title="Transcription", + document_title_template=( + 'Réunion "{room}" du {room_recording_date} à {room_recording_time}' + ), +) diff --git a/src/summary/summary/core/locales/nl.py b/src/summary/summary/core/locales/nl.py new file mode 100644 index 00000000..0cb00213 --- /dev/null +++ b/src/summary/summary/core/locales/nl.py @@ -0,0 +1,33 @@ +"""Dutch locale strings.""" + +from summary.core.locales.strings import LocaleStrings + +STRINGS = LocaleStrings( + empty_transcription=""" +**Er is geen audio-inhoud gedetecteerd in uw transcriptie.** + +*Als u denkt dat dit een fout is, aarzel dan niet om contact op te nemen +met onze technische ondersteuning: visio@numerique.gouv.fr* + +. + +. + +. + +Een paar punten die wij u aanraden te controleren: +- Was er een microfoon ingeschakeld? +- Was u dicht genoeg bij de microfoon? +- Is de microfoon van goede kwaliteit? +- Duurt de opname langer dan 30 seconden? 
+ +""", + download_header_template=( + "\n*Download uw opname door [deze link te volgen]({download_link})*\n" + ), + hallucination_replacement_text="[Tekst kon niet worden getranscribeerd]", + document_default_title="Transcriptie", + document_title_template=( + 'Vergadering "{room}" op {room_recording_date} om {room_recording_time}' + ), +) diff --git a/src/summary/summary/core/locales/strings.py b/src/summary/summary/core/locales/strings.py new file mode 100644 index 00000000..c182cfdf --- /dev/null +++ b/src/summary/summary/core/locales/strings.py @@ -0,0 +1,15 @@ +"""Locale types for the summary service.""" + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class LocaleStrings: + """All translatable output strings for the summary pipeline.""" + + # transcript_formatter.py + empty_transcription: str + download_header_template: str + hallucination_replacement_text: str + document_default_title: str + document_title_template: str diff --git a/src/summary/summary/core/transcript_formatter.py b/src/summary/summary/core/transcript_formatter.py index 6c62acf0..611806d2 100644 --- a/src/summary/summary/core/transcript_formatter.py +++ b/src/summary/summary/core/transcript_formatter.py @@ -4,34 +4,13 @@ import logging from typing import Optional, Tuple from summary.core.config import get_settings +from summary.core.locales import LocaleStrings settings = get_settings() logger = logging.getLogger(__name__) -DEFAULT_EMPTY_TRANSCRIPTION = """ -**Aucun contenu audio n’a été détecté dans votre transcription.** - - -*Si vous pensez qu’il s’agit d’une erreur, n’hésitez pas à contacter -notre support technique : visio@numerique.gouv.fr* - -. - -. - -. - -Quelques points que nous vous conseillons de vérifier : -- Un micro était-il activé ? -- Étiez-vous suffisamment proche ? -- Le micro est-il de bonne qualité ? -- L’enregistrement dure-t-il plus de 30 secondes ? 
- -""" - - class TranscriptFormatter: """Formats WhisperX transcription output into readable conversation format. @@ -42,12 +21,10 @@ class TranscriptFormatter: - Generating descriptive titles from context """ - def __init__(self): - """Initialize formatter with settings.""" + def __init__(self, locale: LocaleStrings): + """Initialize formatter with settings and locale.""" self.hallucination_patterns = settings.hallucination_patterns - self.hallucination_replacement_text = settings.hallucination_replacement_text - self.default_title = settings.document_default_title - self.default_empty_transcription = DEFAULT_EMPTY_TRANSCRIPTION + self._locale = locale def _get_segments(self, transcription): """Extract segments from transcription object or dictionary.""" @@ -71,7 +48,7 @@ class TranscriptFormatter: segments = self._get_segments(transcription) if not segments: - content = self.default_empty_transcription + content = self._locale.empty_transcription else: content = self._format_speaker(segments) content = self._remove_hallucinations(content) @@ -83,7 +60,7 @@ class TranscriptFormatter: def _remove_hallucinations(self, content: str) -> str: """Remove hallucination patterns from content.""" - replacement = self.hallucination_replacement_text or "" + replacement = self._locale.hallucination_replacement_text or "" for pattern in self.hallucination_patterns: content = content.replace(pattern, replacement) @@ -111,9 +88,8 @@ class TranscriptFormatter: if not download_link: return content - header = ( - f"\n*Télécharger votre enregistrement " - f"en [suivant ce lien]({download_link})*\n" + header = self._locale.download_header_template.format( + download_link=download_link ) content = header + content @@ -127,9 +103,9 @@ class TranscriptFormatter: ) -> str: """Generate title from context or return default.""" if not room or not recording_date or not recording_time: - return self.default_title + return self._locale.document_default_title - return 
settings.document_title_template.format( + return self._locale.document_title_template.format( room=room, room_recording_date=recording_date, room_recording_time=recording_time,