(backend) Import of documents

We can now import documents in formats .docx and .md.
To do so we added a new container "docspec", which
uses the docspec service to convert
these formats to Blocknote format.

More here: #1567 #1569.
This commit is contained in:
Stephan Meijer
2025-11-15 16:29:43 +01:00
committed by Anthony LC
parent 61dbda0bf6
commit b547657efd
12 changed files with 305 additions and 109 deletions

View File

@@ -11,6 +11,7 @@ from django.utils.functional import lazy
from django.utils.text import slugify
from django.utils.translation import gettext_lazy as _
from core.services import mime_types
import magic
from rest_framework import serializers
@@ -18,7 +19,7 @@ from core import choices, enums, models, utils, validators
from core.services.ai_services import AI_ACTIONS
from core.services.converter_services import (
ConversionError,
YdocConverter,
Converter,
)
@@ -164,6 +165,7 @@ class DocumentSerializer(ListDocumentSerializer):
content = serializers.CharField(required=False)
websocket = serializers.BooleanField(required=False, write_only=True)
file = serializers.FileField(required=False, write_only=True, allow_null=True)
class Meta:
model = models.Document
@@ -180,6 +182,7 @@ class DocumentSerializer(ListDocumentSerializer):
"deleted_at",
"depth",
"excerpt",
"file",
"is_favorite",
"link_role",
"link_reach",
@@ -437,7 +440,11 @@ class ServerCreateDocumentSerializer(serializers.Serializer):
language = user.language or language
try:
document_content = YdocConverter().convert(validated_data["content"])
document_content = Converter().convert(
validated_data["content"],
mime_types.MARKDOWN,
mime_types.YJS
)
except ConversionError as err:
raise serializers.ValidationError(
{"content": ["Could not convert content"]}

View File

@@ -46,14 +46,12 @@ from core.api.filters import remove_accents
from core.services.ai_services import AIService
from core.services.collaboration_services import CollaborationService
from core.services.converter_services import (
ConversionError,
ServiceUnavailableError as YProviderServiceUnavailableError,
)
from core.services.converter_services import (
ValidationError as YProviderValidationError,
Converter,
)
from core.services.converter_services import (
YdocConverter,
)
from core.services import mime_types
from core.services.search_indexers import (
get_document_indexer,
get_visited_document_ids_of,
@@ -527,6 +525,28 @@ class DocumentViewSet(
"IN SHARE ROW EXCLUSIVE MODE;"
)
# Remove file from validated_data as it's not a model field
# Process it if present
uploaded_file = serializer.validated_data.pop("file", None)
# If a file is uploaded, convert it to Yjs format and set as content
if uploaded_file:
try:
file_content = uploaded_file.read()
converter = Converter()
converted_content = converter.convert(
file_content,
content_type=uploaded_file.content_type,
accept=mime_types.YJS
)
serializer.validated_data["content"] = converted_content
serializer.validated_data["title"] = uploaded_file.name
except ConversionError as err:
raise drf.exceptions.ValidationError(
{"file": ["Could not convert file content"]}
) from err
obj = models.Document.add_root(
creator=self.request.user,
**serializer.validated_data,
@@ -1881,14 +1901,14 @@ class DocumentViewSet(
if base64_content is not None:
# Convert using the y-provider service
try:
yprovider = YdocConverter()
yprovider = Converter()
result = yprovider.convert(
base64.b64decode(base64_content),
"application/vnd.yjs.doc",
mime_types.YJS,
{
"markdown": "text/markdown",
"html": "text/html",
"json": "application/json",
"markdown": mime_types.MARKDOWN,
"html": mime_types.HTML,
"json": mime_types.JSON,
}[content_format],
)
content = result

View File

@@ -5,7 +5,9 @@ from base64 import b64encode
from django.conf import settings
import requests
import typing
from core.services import mime_types
class ConversionError(Exception):
"""Base exception for conversion-related errors."""
@@ -19,8 +21,65 @@ class ServiceUnavailableError(ConversionError):
"""Raised when the conversion service is unavailable."""
class ConverterProtocol(typing.Protocol):
def convert(self, text, content_type, accept): ...
class Converter:
docspec: ConverterProtocol
ydoc: ConverterProtocol
def __init__(self):
self.docspec = DocSpecConverter()
self.ydoc = YdocConverter()
def convert(self, input, content_type, accept):
"""Convert input into other formats using external microservices."""
if content_type == mime_types.DOCX and accept == mime_types.YJS:
return self.convert(
self.docspec.convert(input, mime_types.DOCX, mime_types.BLOCKNOTE),
mime_types.BLOCKNOTE,
mime_types.YJS
)
return self.ydoc.convert(input, content_type, accept)
class DocSpecConverter:
"""Service class for DocSpec conversion-related operations."""
def _request(self, url, data, content_type):
"""Make a request to the DocSpec API."""
response = requests.post(
url,
headers={"Accept": mime_types.BLOCKNOTE},
files={"file": ("document.docx", data, content_type)},
timeout=settings.CONVERSION_API_TIMEOUT,
verify=settings.CONVERSION_API_SECURE,
)
response.raise_for_status()
return response
def convert(self, data, content_type, accept):
"""Convert a Document to BlockNote."""
if not data:
raise ValidationError("Input data cannot be empty")
if content_type != mime_types.DOCX or accept != mime_types.BLOCKNOTE:
raise ValidationError(f"Conversion from {content_type} to {accept} is not supported.")
try:
return self._request(settings.DOCSPEC_API_URL, data, content_type).content
except requests.RequestException as err:
raise ServiceUnavailableError(
"Failed to connect to DocSpec conversion service",
) from err
class YdocConverter:
"""Service class for conversion-related operations."""
"""Service class for YDoc conversion-related operations."""
@property
def auth_header(self):
@@ -45,7 +104,7 @@ class YdocConverter:
return response
def convert(
self, text, content_type="text/markdown", accept="application/vnd.yjs.doc"
self, text, content_type=mime_types.MARKDOWN, accept=mime_types.YJS
):
"""Convert a Markdown text into our internal format using an external microservice."""
@@ -59,14 +118,14 @@ class YdocConverter:
content_type,
accept,
)
if accept == "application/vnd.yjs.doc":
if accept == mime_types.YJS:
return b64encode(response.content).decode("utf-8")
if accept in {"text/markdown", "text/html"}:
if accept in {mime_types.MARKDOWN, "text/html"}:
return response.text
if accept == "application/json":
if accept == mime_types.JSON:
return response.json()
raise ValidationError("Unsupported format")
except requests.RequestException as err:
raise ServiceUnavailableError(
"Failed to connect to conversion service",
f"Failed to connect to YDoc conversion service {content_type}, {accept}",
) from err

View File

@@ -0,0 +1,6 @@
BLOCKNOTE = "application/vnd.blocknote+json"
YJS = "application/vnd.yjs.doc"
MARKDOWN = "text/markdown"
JSON = "application/json"
DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
HTML = "text/html"

View File

@@ -709,6 +709,12 @@ class Base(Configuration):
environ_prefix=None,
)
# DocSpec API microservice
DOCSPEC_API_URL = values.Value(
environ_name="DOCSPEC_API_URL",
environ_prefix=None
)
# Conversion endpoint
CONVERSION_API_ENDPOINT = values.Value(
default="convert",