diff --git a/CHANGELOG.md b/CHANGELOG.md
index f2e66522..d4f738fd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -132,6 +132,7 @@ and this project adheres to
## Added
+- ⚗️(backend) add util to extract text from base64 yjs document
- ✨(backend) add soft delete and restore API endpoints to documents #516
- ✨(backend) allow organizing documents in a tree structure #516
- ✨(backend) add "excerpt" field to document list serializer #516
diff --git a/Dockerfile b/Dockerfile
index 60f464ee..23f26b70 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,6 +15,13 @@ FROM base AS back-builder
WORKDIR /builder
+# Install the build toolchain (build-base, libffi headers, Rust and Cargo) with apk
+RUN apk add --no-cache \
+ build-base \
+ libffi-dev \
+ rust \
+ cargo
+
# Copy required python dependencies
COPY ./src/backend /builder
diff --git a/src/backend/core/factories.py b/src/backend/core/factories.py
index 3f2c085f..d0a641d8 100644
--- a/src/backend/core/factories.py
+++ b/src/backend/core/factories.py
@@ -13,6 +13,22 @@ from core import models
fake = Faker()
+YDOC_HELLO_WORLD_BASE64 = (  # base64 yjs document, default DocumentFactory content
+ "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
+ "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
+ "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
+ "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
+ "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
+ "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
+ "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
+ "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
+ "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
+ "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
+ "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
+ "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
+ "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
+)
+
class UserFactory(factory.django.DjangoModelFactory):
"""A factory to random users for testing purposes."""
@@ -75,7 +91,7 @@ class DocumentFactory(factory.django.DjangoModelFactory):
title = factory.Sequence(lambda n: f"document{n}")
excerpt = factory.Sequence(lambda n: f"excerpt{n}")
- content = factory.Sequence(lambda n: f"content{n}")
+ content = YDOC_HELLO_WORLD_BASE64
creator = factory.SubFactory(UserFactory)
deleted_at = None
link_reach = factory.fuzzy.FuzzyChoice(
diff --git a/src/backend/core/tests/test_utils.py b/src/backend/core/tests/test_utils.py
new file mode 100644
index 00000000..4fa33e1e
--- /dev/null
+++ b/src/backend/core/tests/test_utils.py
@@ -0,0 +1,37 @@
+"""Test utils base64_yjs_to_text and base64_yjs_to_xml."""
+
+from core import utils
+
+# This base64 string is an example of what is saved in the database.
+# It was generated from the BlockNote editor and contains a heading
+# "*Hello*" followed by a bullet list item "w**or**ld".
+TEST_BASE64_STRING = (
+ "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
+ "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
+ "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
+ "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
+ "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
+ "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
+ "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
+ "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
+ "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
+ "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
+ "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
+ "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
+ "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
+)
+
+
+def test_utils_base64_yjs_to_text():
+ """Test extracting the plain text of a saved yjs document."""
+ assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello world"
+
+
+def test_utils_base64_yjs_to_xml():
+ """Test extracting the xml of a saved yjs document."""
+ content = utils.base64_yjs_to_xml(TEST_BASE64_STRING)
+
+ # Only the words are looked up in the serialized xml: the assertions do
+ # not depend on the exact markup generated around them.
+ assert "Hello" in content
+ assert "world" in content
diff --git a/src/backend/core/tests/test_utils_base64_yjs_to_text.py b/src/backend/core/tests/test_utils_base64_yjs_to_text.py
new file mode 100644
index 00000000..376bb85d
--- /dev/null
+++ b/src/backend/core/tests/test_utils_base64_yjs_to_text.py
@@ -0,0 +1,29 @@
+"""Test util base64_yjs_to_text."""
+
+from core.utils import base64_yjs_to_text
+
+
+def test_base64_yjs_to_text():
+ """
+ Test base64_yjs_to_text on a saved yjs document.
+ This base64 string is an example of what is saved in the database.
+ It was generated from the BlockNote editor and contains a heading
+ "*Hello*" followed by a bullet list item "w**or**ld".
+ """
+ base64_string = (
+ "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
+ "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
+ "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
+ "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
+ "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
+ "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
+ "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
+ "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
+ "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
+ "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
+ "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
+ "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
+ "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
+ )
+ # NOTE(review): appears to duplicate the test in core/tests/test_utils.py - confirm.
+ assert base64_yjs_to_text(base64_string) == "Hello world"
diff --git a/src/backend/core/utils.py b/src/backend/core/utils.py
new file mode 100644
index 00000000..bd2e0170
--- /dev/null
+++ b/src/backend/core/utils.py
@@ -0,0 +1,25 @@
+"""Utils for the core app."""
+
+import base64
+
+import y_py as Y
+from bs4 import BeautifulSoup
+
+
+def base64_yjs_to_xml(base64_string):
+ """Return the xml of the "document-store" element of a base64 yjs document."""
+ # The base64 payload is decoded into a mutable byte buffer.
+ update = bytearray(base64.b64decode(base64_string))
+
+ # The update is applied to an empty document before reading its xml element.
+ ydoc = Y.YDoc()  # pylint: disable=E1101
+ Y.apply_update(ydoc, update)  # pylint: disable=E1101
+ return str(ydoc.get_xml_element("document-store"))
+
+
+def base64_yjs_to_text(base64_string):
+ """Return the text of a base64 yjs document, without its xml markup."""
+ # Text nodes are joined with a space, then outer whitespace is trimmed.
+ parsed = BeautifulSoup(base64_yjs_to_xml(base64_string), "html.parser")
+ text = parsed.get_text(separator=" ")
+ return text.strip()
diff --git a/src/backend/pyproject.toml b/src/backend/pyproject.toml
index c742af5f..17cb0049 100644
--- a/src/backend/pyproject.toml
+++ b/src/backend/pyproject.toml
@@ -25,6 +25,7 @@ license = { file = "LICENSE" }
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
+ "beautifulsoup4==4.12.3",
"boto3==1.37.18",
"Brotli==1.1.0",
"celery[redis]==5.4.0",
@@ -47,6 +48,7 @@ dependencies = [
"gunicorn==23.0.0",
"jsonschema==4.23.0",
"markdown==3.7",
+ "mozilla-django-oidc==4.0.1",
"nested-multipart-parser==1.5.0",
"openai==1.68.2",
"psycopg[binary]==3.2.6",
@@ -55,8 +57,8 @@ dependencies = [
"requests==2.32.3",
"sentry-sdk==2.24.0",
"url-normalize==1.4.3",
"whitenoise==6.9.0",
- "mozilla-django-oidc==4.0.1",
+ "y-py==0.6.2",
]
[project.urls]