✨(backend) add util to extract text from Ydoc content
Document content is stored in the Ydoc format. We need a utility to extract it as XML or plain text.
This commit is contained in:
committed by
Manuel Raynaud
parent
747ca70186
commit
710bbf512c
@@ -132,6 +132,7 @@ and this project adheres to
|
||||
|
||||
## Added
|
||||
|
||||
- ⚗️(backend) add util to extract text from base64 yjs document
|
||||
- ✨(backend) add soft delete and restore API endpoints to documents #516
|
||||
- ✨(backend) allow organizing documents in a tree structure #516
|
||||
- ✨(backend) add "excerpt" field to document list serializer #516
|
||||
|
||||
@@ -15,6 +15,13 @@ FROM base AS back-builder
|
||||
|
||||
WORKDIR /builder
|
||||
|
||||
# Install Rust and Cargo using Alpine's package manager
|
||||
RUN apk add --no-cache \
|
||||
build-base \
|
||||
libffi-dev \
|
||||
rust \
|
||||
cargo
|
||||
|
||||
# Copy required python dependencies
|
||||
COPY ./src/backend /builder
|
||||
|
||||
|
||||
@@ -13,6 +13,22 @@ from core import models
|
||||
|
||||
fake = Faker()
|
||||
|
||||
YDOC_HELLO_WORLD_BASE64 = (
|
||||
"AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
|
||||
"aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
|
||||
"ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
|
||||
"1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
|
||||
"YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
|
||||
"AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
|
||||
"BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
|
||||
"bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
|
||||
"ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
|
||||
"ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
|
||||
"bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
|
||||
"9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
|
||||
"dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
|
||||
)
|
||||
|
||||
|
||||
class UserFactory(factory.django.DjangoModelFactory):
|
||||
"""A factory to random users for testing purposes."""
|
||||
@@ -75,7 +91,7 @@ class DocumentFactory(factory.django.DjangoModelFactory):
|
||||
|
||||
title = factory.Sequence(lambda n: f"document{n}")
|
||||
excerpt = factory.Sequence(lambda n: f"excerpt{n}")
|
||||
content = factory.Sequence(lambda n: f"content{n}")
|
||||
content = YDOC_HELLO_WORLD_BASE64
|
||||
creator = factory.SubFactory(UserFactory)
|
||||
deleted_at = None
|
||||
link_reach = factory.fuzzy.FuzzyChoice(
|
||||
|
||||
37
src/backend/core/tests/test_utils.py
Normal file
37
src/backend/core/tests/test_utils.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""Test utils base64_yjs_to_text and base64_yjs_to_xml."""
|
||||
|
||||
from core import utils
|
||||
|
||||
# This base64 string is an example of what is saved in the database.
|
||||
# This base64 is generated from the blocknote editor, it contains
|
||||
# the text \n# *Hello* \n- w**or**ld
|
||||
TEST_BASE64_STRING = (
|
||||
"AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
|
||||
"aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
|
||||
"ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
|
||||
"1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
|
||||
"YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
|
||||
"AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
|
||||
"BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
|
||||
"bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
|
||||
"ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
|
||||
"ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
|
||||
"bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
|
||||
"9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
|
||||
"dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
|
||||
)
|
||||
|
||||
|
||||
def test_utils_base64_yjs_to_text():
    """Check the plain text extracted from the sample saved yjs document."""
    extracted = utils.base64_yjs_to_text(TEST_BASE64_STRING)

    assert extracted == "Hello world"
|
||||
|
||||
|
||||
def test_utils_base64_yjs_to_xml():
    """Check the XML extracted from the sample saved yjs document."""
    xml = utils.base64_yjs_to_xml(TEST_BASE64_STRING)

    # The heading is accepted with its two attributes in either order.
    heading_variants = (
        '<heading "level"="1" "textAlignment"="left">Hello</heading>',
        '<heading "textAlignment"="left" "level"="1">Hello</heading>',
    )
    assert any(variant in xml for variant in heading_variants)
    assert '<bulletListItem "textAlignment"="left">world</bulletListItem>' in xml
|
||||
29
src/backend/core/tests/test_utils_base64_yjs_to_text.py
Normal file
29
src/backend/core/tests/test_utils_base64_yjs_to_text.py
Normal file
@@ -0,0 +1,29 @@
|
||||
"""Test util base64_yjs_to_text."""
|
||||
|
||||
from core.utils import base64_yjs_to_text
|
||||
|
||||
|
||||
def test_base64_yjs_to_text():
    """
    Check that base64_yjs_to_text returns the plain text of a saved document.

    The base64 payload below is an example of what is saved in the database.
    It was generated from the blocknote editor and holds a level-1 heading
    with an italic "Hello" followed by a bullet item "world" whose middle
    letters "or" are bold.
    """
    saved_content = (
        "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
        "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
        "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
        "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
        "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
        "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
        "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
        "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
        "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
        "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
        "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
        "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
        "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
    )

    extracted = base64_yjs_to_text(saved_content)

    assert extracted == "Hello world"
|
||||
25
src/backend/core/utils.py
Normal file
25
src/backend/core/utils.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""Utils for the core app."""
|
||||
|
||||
import base64
|
||||
|
||||
import y_py as Y
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def base64_yjs_to_xml(base64_string):
    """Return the XML string held in a base64-encoded yjs document.

    The input is base64-decoded, applied as an update to a fresh YDoc, and
    the doc's "document-store" XML element is returned in its string form.
    """
    # Decode first, then hand the update over as a mutable byte buffer.
    update = bytearray(base64.b64decode(base64_string))

    ydoc = Y.YDoc()  # pylint: disable=E1101
    Y.apply_update(ydoc, update)  # pylint: disable=E1101

    document_store = ydoc.get_xml_element("document-store")
    return str(document_store)
|
||||
|
||||
|
||||
def base64_yjs_to_text(base64_string):
    """Return the plain text held in a base64-encoded yjs document.

    The document is first converted to XML with ``base64_yjs_to_xml``; the
    markup is then parsed with BeautifulSoup's "html.parser" and its text
    nodes are joined with single spaces, with surrounding whitespace removed.
    """
    xml = base64_yjs_to_xml(base64_string)
    joined_text = BeautifulSoup(xml, "html.parser").get_text(separator=" ")
    return joined_text.strip()
|
||||
@@ -25,6 +25,7 @@ license = { file = "LICENSE" }
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"beautifulsoup4==4.12.3",
|
||||
"boto3==1.37.18",
|
||||
"Brotli==1.1.0",
|
||||
"celery[redis]==5.4.0",
|
||||
@@ -47,6 +48,7 @@ dependencies = [
|
||||
"gunicorn==23.0.0",
|
||||
"jsonschema==4.23.0",
|
||||
"markdown==3.7",
|
||||
"mozilla-django-oidc==4.0.1",
|
||||
"nested-multipart-parser==1.5.0",
|
||||
"openai==1.68.2",
|
||||
"psycopg[binary]==3.2.6",
|
||||
@@ -55,8 +57,13 @@ dependencies = [
|
||||
"requests==2.32.3",
|
||||
"sentry-sdk==2.24.0",
|
||||
"url-normalize==1.4.3",
|
||||
"whitenoise==6.9.0",
"y-py==0.6.2",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
|
||||
Reference in New Issue
Block a user