(backend) Import of documents

We can now import documents in the .docx and .md formats.
To do so, we added a new container, "docspec", which
uses the docspec service to convert
these formats to the BlockNote format.

More details here: #1567, #1569.
This commit is contained in:
Stephan Meijer
2025-11-15 16:29:43 +01:00
committed by Anthony LC
parent 61dbda0bf6
commit b547657efd
12 changed files with 305 additions and 109 deletions

View File

@@ -11,6 +11,7 @@ from django.utils.functional import lazy
from django.utils.text import slugify
from django.utils.translation import gettext_lazy as _
from core.services import mime_types
import magic
from rest_framework import serializers
@@ -18,7 +19,7 @@ from core import choices, enums, models, utils, validators
from core.services.ai_services import AI_ACTIONS
from core.services.converter_services import (
ConversionError,
YdocConverter,
Converter,
)
@@ -164,6 +165,7 @@ class DocumentSerializer(ListDocumentSerializer):
content = serializers.CharField(required=False)
websocket = serializers.BooleanField(required=False, write_only=True)
file = serializers.FileField(required=False, write_only=True, allow_null=True)
class Meta:
model = models.Document
@@ -180,6 +182,7 @@ class DocumentSerializer(ListDocumentSerializer):
"deleted_at",
"depth",
"excerpt",
"file",
"is_favorite",
"link_role",
"link_reach",
@@ -437,7 +440,11 @@ class ServerCreateDocumentSerializer(serializers.Serializer):
language = user.language or language
try:
document_content = YdocConverter().convert(validated_data["content"])
document_content = Converter().convert(
validated_data["content"],
mime_types.MARKDOWN,
mime_types.YJS
)
except ConversionError as err:
raise serializers.ValidationError(
{"content": ["Could not convert content"]}

View File

@@ -46,14 +46,12 @@ from core.api.filters import remove_accents
from core.services.ai_services import AIService
from core.services.collaboration_services import CollaborationService
from core.services.converter_services import (
ConversionError,
ServiceUnavailableError as YProviderServiceUnavailableError,
)
from core.services.converter_services import (
ValidationError as YProviderValidationError,
Converter,
)
from core.services.converter_services import (
YdocConverter,
)
from core.services import mime_types
from core.services.search_indexers import (
get_document_indexer,
get_visited_document_ids_of,
@@ -527,6 +525,28 @@ class DocumentViewSet(
"IN SHARE ROW EXCLUSIVE MODE;"
)
# Remove file from validated_data as it's not a model field
# Process it if present
uploaded_file = serializer.validated_data.pop("file", None)
# If a file is uploaded, convert it to Yjs format and set as content
if uploaded_file:
try:
file_content = uploaded_file.read()
converter = Converter()
converted_content = converter.convert(
file_content,
content_type=uploaded_file.content_type,
accept=mime_types.YJS
)
serializer.validated_data["content"] = converted_content
serializer.validated_data["title"] = uploaded_file.name
except ConversionError as err:
raise drf.exceptions.ValidationError(
{"file": ["Could not convert file content"]}
) from err
obj = models.Document.add_root(
creator=self.request.user,
**serializer.validated_data,
@@ -1881,14 +1901,14 @@ class DocumentViewSet(
if base64_content is not None:
# Convert using the y-provider service
try:
yprovider = YdocConverter()
yprovider = Converter()
result = yprovider.convert(
base64.b64decode(base64_content),
"application/vnd.yjs.doc",
mime_types.YJS,
{
"markdown": "text/markdown",
"html": "text/html",
"json": "application/json",
"markdown": mime_types.MARKDOWN,
"html": mime_types.HTML,
"json": mime_types.JSON,
}[content_format],
)
content = result