✨(backend) add util to extract text from Ydoc content
Document content is stored in the Ydoc format. We need a util to extract it as XML/text.
This commit is contained in:
committed by
Manuel Raynaud
parent
747ca70186
commit
710bbf512c
@@ -132,6 +132,7 @@ and this project adheres to
|
|||||||
|
|
||||||
## Added
|
## Added
|
||||||
|
|
||||||
|
- ⚗️(backend) add util to extract text from base64 yjs document
|
||||||
- ✨(backend) add soft delete and restore API endpoints to documents #516
|
- ✨(backend) add soft delete and restore API endpoints to documents #516
|
||||||
- ✨(backend) allow organizing documents in a tree structure #516
|
- ✨(backend) allow organizing documents in a tree structure #516
|
||||||
- ✨(backend) add "excerpt" field to document list serializer #516
|
- ✨(backend) add "excerpt" field to document list serializer #516
|
||||||
|
|||||||
@@ -15,6 +15,13 @@ FROM base AS back-builder
|
|||||||
|
|
||||||
WORKDIR /builder
|
WORKDIR /builder
|
||||||
|
|
||||||
|
# Install Rust and Cargo using Alpine's package manager
|
||||||
|
RUN apk add --no-cache \
|
||||||
|
build-base \
|
||||||
|
libffi-dev \
|
||||||
|
rust \
|
||||||
|
cargo
|
||||||
|
|
||||||
# Copy required python dependencies
|
# Copy required python dependencies
|
||||||
COPY ./src/backend /builder
|
COPY ./src/backend /builder
|
||||||
|
|
||||||
|
|||||||
@@ -13,6 +13,22 @@ from core import models
|
|||||||
|
|
||||||
fake = Faker()
|
fake = Faker()
|
||||||
|
|
||||||
|
YDOC_HELLO_WORLD_BASE64 = (
|
||||||
|
"AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
|
||||||
|
"aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
|
||||||
|
"ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
|
||||||
|
"1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
|
||||||
|
"YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
|
||||||
|
"AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
|
||||||
|
"BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
|
||||||
|
"bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
|
||||||
|
"ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
|
||||||
|
"ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
|
||||||
|
"bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
|
||||||
|
"9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
|
||||||
|
"dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class UserFactory(factory.django.DjangoModelFactory):
|
class UserFactory(factory.django.DjangoModelFactory):
|
||||||
"""A factory to random users for testing purposes."""
|
"""A factory to random users for testing purposes."""
|
||||||
@@ -75,7 +91,7 @@ class DocumentFactory(factory.django.DjangoModelFactory):
|
|||||||
|
|
||||||
title = factory.Sequence(lambda n: f"document{n}")
|
title = factory.Sequence(lambda n: f"document{n}")
|
||||||
excerpt = factory.Sequence(lambda n: f"excerpt{n}")
|
excerpt = factory.Sequence(lambda n: f"excerpt{n}")
|
||||||
content = factory.Sequence(lambda n: f"content{n}")
|
content = YDOC_HELLO_WORLD_BASE64
|
||||||
creator = factory.SubFactory(UserFactory)
|
creator = factory.SubFactory(UserFactory)
|
||||||
deleted_at = None
|
deleted_at = None
|
||||||
link_reach = factory.fuzzy.FuzzyChoice(
|
link_reach = factory.fuzzy.FuzzyChoice(
|
||||||
|
|||||||
37
src/backend/core/tests/test_utils.py
Normal file
37
src/backend/core/tests/test_utils.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
"""Test utils base64_yjs_to_text and base64_yjs_to_xml."""
|
||||||
|
|
||||||
|
from core import utils
|
||||||
|
|
||||||
|
# This base64 string is an example of what is saved in the database.
|
||||||
|
# This base64 is generated from the blocknote editor, it contains
|
||||||
|
# the text \n# *Hello* \n- w**or**ld
|
||||||
|
TEST_BASE64_STRING = (
|
||||||
|
"AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
|
||||||
|
"aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
|
||||||
|
"ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
|
||||||
|
"1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
|
||||||
|
"YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
|
||||||
|
"AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
|
||||||
|
"BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
|
||||||
|
"bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
|
||||||
|
"ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
|
||||||
|
"ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
|
||||||
|
"bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
|
||||||
|
"9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
|
||||||
|
"dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_utils_base64_yjs_to_text():
    """The plain text extracted from the sample yjs document is "Hello world"."""
    extracted = utils.base64_yjs_to_text(TEST_BASE64_STRING)

    assert extracted == "Hello world"
|
||||||
|
|
||||||
|
|
||||||
|
def test_utils_base64_yjs_to_xml():
    """The xml extracted from the sample yjs document contains the expected blocks."""
    xml = utils.base64_yjs_to_xml(TEST_BASE64_STRING)

    # Either ordering of the two heading attributes is accepted.
    heading_variants = (
        '<heading "level"="1" "textAlignment"="left">Hello</heading>',
        '<heading "textAlignment"="left" "level"="1">Hello</heading>',
    )
    assert any(variant in xml for variant in heading_variants)

    bullet_item = '<bulletListItem "textAlignment"="left">world</bulletListItem>'
    assert bullet_item in xml
|
||||||
29
src/backend/core/tests/test_utils_base64_yjs_to_text.py
Normal file
29
src/backend/core/tests/test_utils_base64_yjs_to_text.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
"""Test util base64_yjs_to_text."""
|
||||||
|
|
||||||
|
from core.utils import base64_yjs_to_text
|
||||||
|
|
||||||
|
|
||||||
|
def test_base64_yjs_to_text():
    """
    Check text extraction from a saved yjs document.

    The base64 payload below is an example of what is saved in the database.
    It was generated from the blocknote editor and holds a heading with an
    italic "Hello" followed by a bullet item "world" whose "or" is bold.
    """
    encoded_document = (
        "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
        "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
        "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
        "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
        "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
        "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
        "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
        "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
        "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
        "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
        "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
        "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
        "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
    )

    extracted = base64_yjs_to_text(encoded_document)

    assert extracted == "Hello world"
|
||||||
25
src/backend/core/utils.py
Normal file
25
src/backend/core/utils.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
"""Utils for the core app."""
|
||||||
|
|
||||||
|
import base64
|
||||||
|
|
||||||
|
import y_py as Y
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
def base64_yjs_to_xml(base64_string):
    """
    Return the xml of a base64-encoded yjs document as a string.

    The input is base64-decoded, applied as an update to a fresh YDoc, and
    the "document-store" xml element of that doc is returned in string form.
    """
    # The decoded payload is handed to y_py as a mutable byte buffer.
    update = bytearray(base64.b64decode(base64_string))

    ydoc = Y.YDoc()  # pylint: disable=E1101
    Y.apply_update(ydoc, update)  # pylint: disable=E1101

    document_store = ydoc.get_xml_element("document-store")
    return str(document_store)
|
||||||
|
|
||||||
|
|
||||||
|
def base64_yjs_to_text(base64_string):
    """
    Return the plain text of a base64-encoded yjs document.

    The xml returned by base64_yjs_to_xml is parsed with BeautifulSoup's
    "html.parser"; its text nodes are joined with a single space and the
    result is stripped of leading and trailing whitespace.
    """
    markup = base64_yjs_to_xml(base64_string)
    text = BeautifulSoup(markup, "html.parser").get_text(separator=" ")
    return text.strip()
|
||||||
@@ -25,6 +25,7 @@ license = { file = "LICENSE" }
|
|||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.12"
|
requires-python = ">=3.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"beautifulsoup4==4.12.3",
|
||||||
"boto3==1.37.18",
|
"boto3==1.37.18",
|
||||||
"Brotli==1.1.0",
|
"Brotli==1.1.0",
|
||||||
"celery[redis]==5.4.0",
|
"celery[redis]==5.4.0",
|
||||||
@@ -47,6 +48,7 @@ dependencies = [
|
|||||||
"gunicorn==23.0.0",
|
"gunicorn==23.0.0",
|
||||||
"jsonschema==4.23.0",
|
"jsonschema==4.23.0",
|
||||||
"markdown==3.7",
|
"markdown==3.7",
|
||||||
|
"mozilla-django-oidc==4.0.1",
|
||||||
"nested-multipart-parser==1.5.0",
|
"nested-multipart-parser==1.5.0",
|
||||||
"openai==1.68.2",
|
"openai==1.68.2",
|
||||||
"psycopg[binary]==3.2.6",
|
"psycopg[binary]==3.2.6",
|
||||||
@@ -55,8 +57,13 @@ dependencies = [
|
|||||||
"requests==2.32.3",
|
"requests==2.32.3",
|
||||||
"sentry-sdk==2.24.0",
|
"sentry-sdk==2.24.0",
|
||||||
"url-normalize==1.4.3",
|
"url-normalize==1.4.3",
|
||||||
"whitenoise==6.9.0",
|
"whitenoise==6.9.0",
|
||||||
"mozilla-django-oidc==4.0.1",
|
"mozilla-django-oidc==4.0.1",
|
||||||
|
"y-py==0.6.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
|
|||||||
Reference in New Issue
Block a user