(backend) Import of documents

We can now import documents in the .docx and .md formats.
To do so, we added a new "docspec" container, which
runs the DocSpec service used to convert
.docx files to the BlockNote format.

More here: #1567 #1569.
This commit is contained in:
Stephan Meijer
2025-11-15 16:29:43 +01:00
committed by Anthony LC
parent 61dbda0bf6
commit b547657efd
12 changed files with 305 additions and 109 deletions

View File

@@ -213,6 +213,7 @@ logs: ## display app-dev logs (follow mode)
.PHONY: logs
run-backend: ## Start only the backend application and all needed services
@$(COMPOSE) up --force-recreate -d docspec
@$(COMPOSE) up --force-recreate -d celery-dev
@$(COMPOSE) up --force-recreate -d y-provider-development
@$(COMPOSE) up --force-recreate -d nginx

View File

@@ -231,6 +231,11 @@ services:
condition: service_healthy
restart: true
docspec:
image: ghcr.io/docspecio/api:2.0.0
ports:
- "4000:4000"
networks:
lasuite:
name: lasuite-network

View File

@@ -58,6 +58,7 @@ These are the environment variables you can set for the `impress-backend` contai
| DJANGO_EMAIL_USE_TLS | Use tls for email host connection | false |
| DJANGO_SECRET_KEY | Secret key | |
| DJANGO_SERVER_TO_SERVER_API_TOKENS | | [] |
| DOCSPEC_API_URL | URL to endpoint of DocSpec conversion API | |
| DOCUMENT_IMAGE_MAX_SIZE | Maximum size of document in bytes | 10485760 |
| FRONTEND_CSS_URL | To add an external CSS file to the app | |
| FRONTEND_JS_URL | To add an external JS file to the app | |

View File

@@ -76,6 +76,8 @@ DJANGO_SERVER_TO_SERVER_API_TOKENS=server-api-token
Y_PROVIDER_API_BASE_URL=http://y-provider-development:4444/api/
Y_PROVIDER_API_KEY=yprovider-api-key
DOCSPEC_API_URL=http://docspec:4000/conversion
# Theme customization
THEME_CUSTOMIZATION_CACHE_TIMEOUT=15

View File

@@ -11,6 +11,7 @@ from django.utils.functional import lazy
from django.utils.text import slugify
from django.utils.translation import gettext_lazy as _
from core.services import mime_types
import magic
from rest_framework import serializers
@@ -18,7 +19,7 @@ from core import choices, enums, models, utils, validators
from core.services.ai_services import AI_ACTIONS
from core.services.converter_services import (
ConversionError,
YdocConverter,
Converter,
)
@@ -164,6 +165,7 @@ class DocumentSerializer(ListDocumentSerializer):
content = serializers.CharField(required=False)
websocket = serializers.BooleanField(required=False, write_only=True)
file = serializers.FileField(required=False, write_only=True, allow_null=True)
class Meta:
model = models.Document
@@ -180,6 +182,7 @@ class DocumentSerializer(ListDocumentSerializer):
"deleted_at",
"depth",
"excerpt",
"file",
"is_favorite",
"link_role",
"link_reach",
@@ -437,7 +440,11 @@ class ServerCreateDocumentSerializer(serializers.Serializer):
language = user.language or language
try:
document_content = YdocConverter().convert(validated_data["content"])
document_content = Converter().convert(
validated_data["content"],
mime_types.MARKDOWN,
mime_types.YJS
)
except ConversionError as err:
raise serializers.ValidationError(
{"content": ["Could not convert content"]}

View File

@@ -46,14 +46,12 @@ from core.api.filters import remove_accents
from core.services.ai_services import AIService
from core.services.collaboration_services import CollaborationService
from core.services.converter_services import (
ConversionError,
ServiceUnavailableError as YProviderServiceUnavailableError,
)
from core.services.converter_services import (
ValidationError as YProviderValidationError,
Converter,
)
from core.services.converter_services import (
YdocConverter,
)
from core.services import mime_types
from core.services.search_indexers import (
get_document_indexer,
get_visited_document_ids_of,
@@ -527,6 +525,28 @@ class DocumentViewSet(
"IN SHARE ROW EXCLUSIVE MODE;"
)
# Remove file from validated_data as it's not a model field
# Process it if present
uploaded_file = serializer.validated_data.pop("file", None)
# If a file is uploaded, convert it to Yjs format and set as content
if uploaded_file:
try:
file_content = uploaded_file.read()
converter = Converter()
converted_content = converter.convert(
file_content,
content_type=uploaded_file.content_type,
accept=mime_types.YJS
)
serializer.validated_data["content"] = converted_content
serializer.validated_data["title"] = uploaded_file.name
except ConversionError as err:
raise drf.exceptions.ValidationError(
{"file": ["Could not convert file content"]}
) from err
obj = models.Document.add_root(
creator=self.request.user,
**serializer.validated_data,
@@ -1881,14 +1901,14 @@ class DocumentViewSet(
if base64_content is not None:
# Convert using the y-provider service
try:
yprovider = YdocConverter()
yprovider = Converter()
result = yprovider.convert(
base64.b64decode(base64_content),
"application/vnd.yjs.doc",
mime_types.YJS,
{
"markdown": "text/markdown",
"html": "text/html",
"json": "application/json",
"markdown": mime_types.MARKDOWN,
"html": mime_types.HTML,
"json": mime_types.JSON,
}[content_format],
)
content = result

View File

@@ -5,7 +5,9 @@ from base64 import b64encode
from django.conf import settings
import requests
import typing
from core.services import mime_types
class ConversionError(Exception):
"""Base exception for conversion-related errors."""
@@ -19,8 +21,65 @@ class ServiceUnavailableError(ConversionError):
"""Raised when the conversion service is unavailable."""
class ConverterProtocol(typing.Protocol):
    """Structural interface implemented by the conversion backends.

    Any object exposing a ``convert(text, content_type, accept)`` method
    satisfies this protocol.
    """

    def convert(self, text, content_type, accept):
        """Convert ``text`` from the ``content_type`` format into ``accept``."""
class Converter:
    """Entry point that routes a conversion to the DocSpec or YDoc backend."""

    docspec: ConverterProtocol
    ydoc: ConverterProtocol

    def __init__(self):
        """Create one converter instance per backend service."""
        self.docspec = DocSpecConverter()
        self.ydoc = YdocConverter()

    def convert(self, input, content_type, accept):
        """Convert ``input`` from ``content_type`` to ``accept``.

        DOCX to Yjs is done in two hops: DocSpec first produces BlockNote
        content, which is then fed back through this method to reach Yjs.
        Every other pair of formats is delegated directly to the YDoc
        converter.

        NOTE: ``input`` shadows the builtin of the same name; the parameter
        name is kept as-is so existing callers keep working.
        """
        needs_docspec = content_type == mime_types.DOCX and accept == mime_types.YJS
        if not needs_docspec:
            return self.ydoc.convert(input, content_type, accept)

        # First hop: DOCX -> BlockNote through the DocSpec service.
        blocknote_content = self.docspec.convert(
            input, mime_types.DOCX, mime_types.BLOCKNOTE
        )
        # Second hop: BlockNote -> Yjs, which takes the delegation path above.
        return self.convert(blocknote_content, mime_types.BLOCKNOTE, mime_types.YJS)
class DocSpecConverter:
    """Client for the DocSpec service, used to turn DOCX files into BlockNote."""

    def _request(self, url, data, content_type):
        """POST ``data`` to ``url`` as a multipart file upload.

        The response is returned as-is; ``raise_for_status`` raises a
        ``requests`` exception when the service answers with an error status.
        """
        upload = {"file": ("document.docx", data, content_type)}
        response = requests.post(
            url,
            headers={"Accept": mime_types.BLOCKNOTE},
            files=upload,
            timeout=settings.CONVERSION_API_TIMEOUT,
            verify=settings.CONVERSION_API_SECURE,
        )
        response.raise_for_status()
        return response

    def convert(self, data, content_type, accept):
        """Convert a DOCX document to BlockNote and return the raw response body.

        Raises:
            ValidationError: if ``data`` is empty, or if the requested
                conversion is anything other than DOCX to BlockNote.
            ServiceUnavailableError: if the request to the DocSpec service
                fails with any ``requests.RequestException``.
        """
        if not data:
            raise ValidationError("Input data cannot be empty")

        is_supported = (
            content_type == mime_types.DOCX and accept == mime_types.BLOCKNOTE
        )
        if not is_supported:
            raise ValidationError(
                f"Conversion from {content_type} to {accept} is not supported."
            )

        try:
            response = self._request(settings.DOCSPEC_API_URL, data, content_type)
            return response.content
        except requests.RequestException as err:
            raise ServiceUnavailableError(
                "Failed to connect to DocSpec conversion service",
            ) from err
class YdocConverter:
"""Service class for conversion-related operations."""
"""Service class for YDoc conversion-related operations."""
@property
def auth_header(self):
@@ -45,7 +104,7 @@ class YdocConverter:
return response
def convert(
self, text, content_type="text/markdown", accept="application/vnd.yjs.doc"
self, text, content_type=mime_types.MARKDOWN, accept=mime_types.YJS
):
"""Convert a Markdown text into our internal format using an external microservice."""
@@ -59,14 +118,14 @@ class YdocConverter:
content_type,
accept,
)
if accept == "application/vnd.yjs.doc":
if accept == mime_types.YJS:
return b64encode(response.content).decode("utf-8")
if accept in {"text/markdown", "text/html"}:
if accept in {mime_types.MARKDOWN, "text/html"}:
return response.text
if accept == "application/json":
if accept == mime_types.JSON:
return response.json()
raise ValidationError("Unsupported format")
except requests.RequestException as err:
raise ServiceUnavailableError(
"Failed to connect to conversion service",
f"Failed to connect to YDoc conversion service {content_type}, {accept}",
) from err

View File

@@ -0,0 +1,6 @@
"""MIME type constants shared by the conversion services."""

# Editor / collaboration formats.
YJS = "application/vnd.yjs.doc"
BLOCKNOTE = "application/vnd.blocknote+json"

# Text formats.
MARKDOWN = "text/markdown"
HTML = "text/html"
JSON = "application/json"

# Office formats.
DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

View File

@@ -709,6 +709,12 @@ class Base(Configuration):
environ_prefix=None,
)
# DocSpec API microservice
DOCSPEC_API_URL = values.Value(
environ_name="DOCSPEC_API_URL",
environ_prefix=None
)
# Conversion endpoint
CONVERSION_API_ENDPOINT = values.Value(
default="convert",

View File

@@ -69,7 +69,7 @@ describe('Server Tests', () => {
const response = await request(app)
.post('/api/convert')
.set('origin', origin)
.set('authorization', 'wrong-api-key')
.set('authorization', `Bearer wrong-api-key`)
.set('content-type', 'application/json');
expect(response.status).toBe(401);
@@ -99,7 +99,7 @@ describe('Server Tests', () => {
const response = await request(app)
.post('/api/convert')
.set('origin', origin)
.set('authorization', apiKey)
.set('authorization', `Bearer ${apiKey}`)
.set('content-type', 'application/json');
expect(response.status).toBe(400);
@@ -114,7 +114,7 @@ describe('Server Tests', () => {
const response = await request(app)
.post('/api/convert')
.set('origin', origin)
.set('authorization', apiKey)
.set('authorization', `Bearer ${apiKey}`)
.set('content-type', 'application/json')
.send('');
@@ -129,9 +129,10 @@ describe('Server Tests', () => {
const response = await request(app)
.post('/api/convert')
.set('origin', origin)
.set('authorization', apiKey)
.set('authorization', `Bearer ${apiKey}`)
.set('content-type', 'image/png')
.send('randomdata');
expect(response.status).toBe(415);
expect(response.body).toStrictEqual({ error: 'Unsupported Content-Type' });
});
@@ -141,38 +142,73 @@ describe('Server Tests', () => {
const response = await request(app)
.post('/api/convert')
.set('origin', origin)
.set('authorization', apiKey)
.set('authorization', `Bearer ${apiKey}`)
.set('content-type', 'text/markdown')
.set('accept', 'image/png')
.send('# Header');
expect(response.status).toBe(406);
expect(response.body).toStrictEqual({ error: 'Unsupported format' });
});
test.each([[apiKey], [`Bearer ${apiKey}`]])(
'POST /api/convert with correct content with Authorization: %s',
async (authHeader) => {
test('POST /api/convert BlockNote to Markdown', async () => {
const app = initApp();
const response = await request(app)
.post('/api/convert')
.set('Origin', origin)
.set('Authorization', authHeader)
.set('content-type', 'text/markdown')
.set('accept', 'application/vnd.yjs.doc')
.send(expectedMarkdown);
.set('origin', origin)
.set('authorization', `Bearer ${apiKey}`)
.set('content-type', 'application/vnd.blocknote+json')
.set('accept', 'text/markdown')
.send(expectedBlocks);
expect(response.status).toBe(200);
expect(response.body).toBeInstanceOf(Buffer);
const editor = ServerBlockNoteEditor.create();
const doc = new Y.Doc();
Y.applyUpdate(doc, response.body);
const blocks = editor.yDocToBlocks(doc, 'document-store');
expect(blocks).toStrictEqual(expectedBlocks);
},
expect(response.header['content-type']).toBe(
'text/markdown; charset=utf-8',
);
expect(typeof response.text).toBe('string');
expect(response.text.trim()).toBe(expectedMarkdown);
});
test('POST /api/convert BlockNote to Yjs', async () => {
const app = initApp();
const editor = ServerBlockNoteEditor.create();
const blocks = await editor.tryParseMarkdownToBlocks(expectedMarkdown);
const response = await request(app)
.post('/api/convert')
.set('origin', origin)
.set('authorization', `Bearer ${apiKey}`)
.set('content-type', 'application/vnd.blocknote+json')
.set('accept', 'application/vnd.yjs.doc')
.send(blocks)
.responseType('blob');
expect(response.status).toBe(200);
expect(response.header['content-type']).toBe('application/vnd.yjs.doc');
// Decode the Yjs response and verify it contains the correct blocks
const responseBuffer = Buffer.from(response.body as Buffer);
const ydoc = new Y.Doc();
Y.applyUpdate(ydoc, responseBuffer);
const decodedBlocks = editor.yDocToBlocks(ydoc, 'document-store');
expect(decodedBlocks).toStrictEqual(expectedBlocks);
});
test('POST /api/convert BlockNote to HTML', async () => {
const app = initApp();
const response = await request(app)
.post('/api/convert')
.set('origin', origin)
.set('authorization', `Bearer ${apiKey}`)
.set('content-type', 'application/vnd.blocknote+json')
.set('accept', 'text/html')
.send(expectedBlocks);
expect(response.status).toBe(200);
expect(response.header['content-type']).toBe('text/html; charset=utf-8');
expect(typeof response.text).toBe('string');
expect(response.text).toBe(expectedHTML);
});
test('POST /api/convert Yjs to HTML', async () => {
const app = initApp();
@@ -183,10 +219,11 @@ describe('Server Tests', () => {
const response = await request(app)
.post('/api/convert')
.set('origin', origin)
.set('authorization', apiKey)
.set('authorization', `Bearer ${apiKey}`)
.set('content-type', 'application/vnd.yjs.doc')
.set('accept', 'text/html')
.send(Buffer.from(yjsUpdate));
expect(response.status).toBe(200);
expect(response.header['content-type']).toBe('text/html; charset=utf-8');
expect(typeof response.text).toBe('string');
@@ -202,10 +239,11 @@ describe('Server Tests', () => {
const response = await request(app)
.post('/api/convert')
.set('origin', origin)
.set('authorization', apiKey)
.set('authorization', `Bearer ${apiKey}`)
.set('content-type', 'application/vnd.yjs.doc')
.set('accept', 'text/markdown')
.send(Buffer.from(yjsUpdate));
expect(response.status).toBe(200);
expect(response.header['content-type']).toBe(
'text/markdown; charset=utf-8',
@@ -223,15 +261,16 @@ describe('Server Tests', () => {
const response = await request(app)
.post('/api/convert')
.set('origin', origin)
.set('authorization', apiKey)
.set('authorization', `Bearer ${apiKey}`)
.set('content-type', 'application/vnd.yjs.doc')
.set('accept', 'application/json')
.send(Buffer.from(yjsUpdate));
expect(response.status).toBe(200);
expect(response.header['content-type']).toBe(
'application/json; charset=utf-8',
);
expect(Array.isArray(response.body)).toBe(true);
expect(response.body).toBeInstanceOf(Array);
expect(response.body).toStrictEqual(expectedBlocks);
});
@@ -240,15 +279,16 @@ describe('Server Tests', () => {
const response = await request(app)
.post('/api/convert')
.set('origin', origin)
.set('authorization', apiKey)
.set('authorization', `Bearer ${apiKey}`)
.set('content-type', 'text/markdown')
.set('accept', 'application/json')
.send(expectedMarkdown);
expect(response.status).toBe(200);
expect(response.header['content-type']).toBe(
'application/json; charset=utf-8',
);
expect(Array.isArray(response.body)).toBe(true);
expect(response.body).toBeInstanceOf(Array);
expect(response.body).toStrictEqual(expectedBlocks);
});
@@ -257,11 +297,12 @@ describe('Server Tests', () => {
const response = await request(app)
.post('/api/convert')
.set('origin', origin)
.set('authorization', apiKey)
.set('authorization', `Bearer ${apiKey}`)
.set('content-type', 'application/vnd.yjs.doc')
.set('accept', 'application/json')
.send(Buffer.from('notvalidyjs'));
expect(response.status).toBe(400);
expect(response.body).toStrictEqual({ error: 'Invalid Yjs content' });
expect(response.body).toStrictEqual({ error: 'Invalid content' });
});
});

View File

@@ -14,27 +14,115 @@ interface ErrorResponse {
error: string;
}
/** Every body shape the conversion endpoint may send: a writer's output or an error payload. */
type ConversionResponseBody = Uint8Array | string | object | ErrorResponse;

/** Parses a raw request body into blocks for the content types it declares. */
interface InputReader {
  /** Content types this reader accepts (compared against the normalized request header). */
  supportedContentTypes: string[];
  /** Turn the raw request body into blocks. */
  read(data: Buffer): Promise<PartialBlock[]>;
}

/** Serializes blocks into a response body for the content types it declares. */
interface OutputWriter {
  /** Content types this writer can produce (compared against the normalized Accept header). */
  supportedContentTypes: string[];
  /** Turn blocks into the response body. */
  write(blocks: PartialBlock[]): Promise<ConversionResponseBody>;
}
// Module-level server-side BlockNote editor, created with the default block,
// inline-content and style schemas; the readers and writers in this file use it.
const editor = ServerBlockNoteEditor.create<
  DefaultBlockSchema,
  DefaultInlineContentSchema,
  DefaultStyleSchema
>();
// Media types referenced by the readers, the writers and the handler defaults.
const ContentTypes = {
  XMarkdown: 'text/x-markdown',
  Markdown: 'text/markdown',
  YJS: 'application/vnd.yjs.doc',
  // Read as Markdown by the Markdown reader (backward compatibility).
  FormUrlEncoded: 'application/x-www-form-urlencoded',
  // Handled by the same reader and writer as YJS.
  OctetStream: 'application/octet-stream',
  HTML: 'text/html',
  BlockNote: 'application/vnd.blocknote+json',
  JSON: 'application/json',
} as const;
// Build a Yjs document from blocks, using the same 'document-store' identifier
// that the Yjs reader passes to `yDocToBlocks` when decoding.
const createYDocument = (blocks: PartialBlock[]) =>
  editor.blocksToYDoc(blocks, 'document-store');
// Input side of the conversion: each entry parses the request body of the
// content types it lists into blocks.
const readers: InputReader[] = [
  {
    // application/x-www-form-urlencoded is interpreted as Markdown for backward compatibility
    supportedContentTypes: [
      ContentTypes.Markdown,
      ContentTypes.XMarkdown,
      ContentTypes.FormUrlEncoded,
    ],
    read: (data) => editor.tryParseMarkdownToBlocks(data.toString()),
  },
  {
    supportedContentTypes: [ContentTypes.YJS, ContentTypes.OctetStream],
    read: async (data) => {
      const ydoc = new Y.Doc();
      Y.applyUpdate(ydoc, data);
      return editor.yDocToBlocks(ydoc, 'document-store') as PartialBlock[];
    },
  },
  {
    supportedContentTypes: [ContentTypes.BlockNote],
    read: async (data) => {
      // The body is untrusted: valid JSON is not necessarily a block list.
      // Reject objects, strings, numbers and null here so they fail as a read
      // error instead of being handed to a writer as if they were blocks.
      const parsed: unknown = JSON.parse(data.toString());
      if (!Array.isArray(parsed)) {
        throw new TypeError('BlockNote content must be a JSON array of blocks');
      }
      return parsed as PartialBlock[];
    },
  },
];
// Output side of the conversion: each entry serializes blocks for the content
// types it lists.
const writers: OutputWriter[] = [
  {
    // BlockNote / JSON output is the block list itself.
    supportedContentTypes: [ContentTypes.BlockNote, ContentTypes.JSON],
    write: async (blocks) => {
      return blocks;
    },
  },
  {
    // Yjs output: encode the whole document state as a single update.
    supportedContentTypes: [ContentTypes.YJS, ContentTypes.OctetStream],
    write: async (blocks) => {
      const ydoc = createYDocument(blocks);
      return Y.encodeStateAsUpdate(ydoc);
    },
  },
  {
    supportedContentTypes: [ContentTypes.Markdown, ContentTypes.XMarkdown],
    write: (blocks) => {
      return editor.blocksToMarkdownLossy(blocks);
    },
  },
  {
    supportedContentTypes: [ContentTypes.HTML],
    write: (blocks) => {
      return editor.blocksToHTMLLossy(blocks);
    },
  },
];
// Reduce a Content-Type / Accept header value to its bare media type: drop any
// parameters after ';', trim surrounding whitespace and lowercase the result.
// Media types are case-insensitive, and the supported types are all declared
// in lowercase, so "Text/Markdown ; charset=utf-8" must match "text/markdown".
const normalizeContentType = (value: string) =>
  value.split(';')[0].trim().toLowerCase();
export const convertHandler = async (
req: Request<object, Uint8Array | ErrorResponse, Buffer, object>,
res: Response<Uint8Array | string | object | ErrorResponse>,
res: Response<ConversionResponseBody>,
) => {
if (!req.body || req.body.length === 0) {
res.status(400).json({ error: 'Invalid request: missing content' });
return;
}
const contentType = (req.header('content-type') || 'text/markdown').split(
';',
)[0];
const accept = (req.header('accept') || 'application/vnd.yjs.doc').split(
';',
)[0];
const contentType = normalizeContentType(
req.header('content-type') || ContentTypes.Markdown,
);
const reader = readers.find((reader) =>
reader.supportedContentTypes.includes(contentType),
);
if (!reader) {
res.status(415).json({ error: 'Unsupported Content-Type' });
return;
}
const accept = normalizeContentType(req.header('accept') || ContentTypes.YJS);
const writer = writers.find((writer) =>
writer.supportedContentTypes.includes(accept),
);
if (!writer) {
res.status(406).json({ error: 'Unsupported format' });
return;
}
let blocks:
| PartialBlock<
@@ -44,63 +132,23 @@ export const convertHandler = async (
>[]
| null = null;
try {
// First, convert from the input format to blocks
// application/x-www-form-urlencoded is interpreted as Markdown for backward compatibility
if (
contentType === 'text/markdown' ||
contentType === 'application/x-www-form-urlencoded'
) {
blocks = await editor.tryParseMarkdownToBlocks(req.body.toString());
} else if (
contentType === 'application/vnd.yjs.doc' ||
contentType === 'application/octet-stream'
) {
try {
const ydoc = new Y.Doc();
Y.applyUpdate(ydoc, req.body);
blocks = editor.yDocToBlocks(ydoc, 'document-store') as PartialBlock[];
blocks = await reader.read(req.body);
} catch (e) {
logger('Invalid Yjs content:', e);
res.status(400).json({ error: 'Invalid Yjs content' });
return;
}
} else {
res.status(415).json({ error: 'Unsupported Content-Type' });
logger('Invalid content:', e);
res.status(400).json({ error: 'Invalid content' });
return;
}
if (!blocks || blocks.length === 0) {
res.status(500).json({ error: 'No valid blocks were generated' });
return;
}
// Then, convert from blocks to the output format
if (accept === 'application/json') {
res.status(200).json(blocks);
} else {
const yDocument = editor.blocksToYDoc(blocks, 'document-store');
if (
accept === 'application/vnd.yjs.doc' ||
accept === 'application/octet-stream'
) {
res
.status(200)
.setHeader('content-type', 'application/octet-stream')
.send(Y.encodeStateAsUpdate(yDocument));
} else if (accept === 'text/markdown') {
res
.status(200)
.setHeader('content-type', 'text/markdown')
.send(await editor.blocksToMarkdownLossy(blocks));
} else if (accept === 'text/html') {
res
.status(200)
.setHeader('content-type', 'text/html')
.send(await editor.blocksToHTMLLossy(blocks));
} else {
res.status(406).json({ error: 'Unsupported format' });
}
}
.setHeader('content-type', accept)
.send(await writer.write(blocks));
} catch (e) {
logger('conversion failed:', e);
res.status(500).json({ error: 'An error occurred' });