diff --git a/src/backend/core/tests/documents/test_api_documents_duplicate.py b/src/backend/core/tests/documents/test_api_documents_duplicate.py
index ec13d878..82acfa98 100644
--- a/src/backend/core/tests/documents/test_api_documents_duplicate.py
+++ b/src/backend/core/tests/documents/test_api_documents_duplicate.py
@@ -11,9 +11,9 @@ from django.conf import settings
from django.core.files.storage import default_storage
from django.utils import timezone
+import pycrdt
import pytest
import requests
-import y_py
from rest_framework.test import APIClient
from core import factories, models
@@ -84,13 +84,14 @@ def test_api_documents_duplicate_success(index):
image_refs = [get_image_refs(doc_id) for doc_id in document_ids]
# Create document content with the first image only
- ydoc = y_py.YDoc() # pylint: disable=no-member
- with ydoc.begin_transaction() as txn:
- xml_fragment = ydoc.get_xml_element("document-store")
- xml_fragment.push_xml_element(txn, "image").set_attribute(
- txn, "src", image_refs[0][1]
- )
- update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member
+ ydoc = pycrdt.Doc()
+ fragment = pycrdt.XmlFragment(
+ [
+ pycrdt.XmlElement("img", {"src": image_refs[0][1]}),
+ ]
+ )
+ ydoc["document-store"] = fragment
+ update = ydoc.get_update()
base64_content = base64.b64encode(update).decode("utf-8")
# Create documents
diff --git a/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py b/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py
index f18c73db..b52f83e3 100644
--- a/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py
+++ b/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py
@@ -5,8 +5,8 @@ Test extract-attachments on document update in docs core app.
import base64
from uuid import uuid4
+import pycrdt
import pytest
-import y_py
from rest_framework.test import APIClient
from core import factories
@@ -16,14 +16,15 @@ pytestmark = pytest.mark.django_db
def get_ydoc_with_mages(image_keys):
"""Return a ydoc from text for testing purposes."""
- ydoc = y_py.YDoc() # pylint: disable=no-member
- with ydoc.begin_transaction() as txn:
- xml_fragment = ydoc.get_xml_element("document-store")
- for key in image_keys:
- xml_image = xml_fragment.push_xml_element(txn, "image")
- xml_image.set_attribute(txn, "src", f"http://localhost/media/{key:s}")
-
- update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member
+ ydoc = pycrdt.Doc()
+ fragment = pycrdt.XmlFragment(
+ [
+ pycrdt.XmlElement("img", {"src": f"http://localhost/media/{key:s}"})
+ for key in image_keys
+ ]
+ )
+ ydoc["document-store"] = fragment  # root key read back by utils.base64_yjs_to_xml
+ update = ydoc.get_update()  # bytes, base64-encoded below
return base64.b64encode(update).decode("utf-8")
diff --git a/src/backend/core/tests/migrations/test_migrations_0020_remove_is_public_add_field_attachments_and_duplicated_from.py b/src/backend/core/tests/migrations/test_migrations_0020_remove_is_public_add_field_attachments_and_duplicated_from.py
index 94ce5375..f94e2a1e 100644
--- a/src/backend/core/tests/migrations/test_migrations_0020_remove_is_public_add_field_attachments_and_duplicated_from.py
+++ b/src/backend/core/tests/migrations/test_migrations_0020_remove_is_public_add_field_attachments_and_duplicated_from.py
@@ -4,8 +4,8 @@ import uuid
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
+import pycrdt
import pytest
-import y_py
from core import models
@@ -27,14 +27,13 @@ def test_populate_attachments_on_all_documents(migrator):
# Create document content with an image
file_key = f"{old_doc_with_attachments.id!s}/file"
- ydoc = y_py.YDoc() # pylint: disable=no-member
image_key = f"{old_doc_with_attachments.id!s}/attachments/{uuid.uuid4()!s}.png"
- with ydoc.begin_transaction() as txn:
- xml_fragment = ydoc.get_xml_element("document-store")
- xml_fragment.push_xml_element(txn, "image").set_attribute(
- txn, "src", f"http://localhost/media/{image_key:s}"
- )
- update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member
+ ydoc = pycrdt.Doc()
+ fragment = pycrdt.XmlFragment(
+ [pycrdt.XmlElement("img", {"src": f"http://localhost/media/{image_key:s}"})]
+ )
+ ydoc["document-store"] = fragment
+ update = ydoc.get_update()
base64_content = base64.b64encode(update).decode("utf-8")
bytes_content = base64_content.encode("utf-8")
content_file = ContentFile(bytes_content)
diff --git a/src/backend/core/tests/test_utils.py b/src/backend/core/tests/test_utils.py
index 3fea93ed..37b2e32d 100644
--- a/src/backend/core/tests/test_utils.py
+++ b/src/backend/core/tests/test_utils.py
@@ -3,7 +3,7 @@
import base64
import uuid
-import y_py
+import pycrdt
from core import utils
@@ -29,17 +29,22 @@ TEST_BASE64_STRING = (
def test_utils_base64_yjs_to_text():
"""Test extract text from saved yjs document"""
- assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello world"
+ assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello w or ld"
def test_utils_base64_yjs_to_xml():
"""Test extract xml from saved yjs document"""
content = utils.base64_yjs_to_xml(TEST_BASE64_STRING)
assert (
- 'Hello' in content
- or 'Hello' in content
+ 'Hello'
+ in content
+ or 'Hello'
+ in content
+ )
+ assert (
+ 'world'
+ in content
)
- assert 'world' in content
def test_utils_extract_attachments():
@@ -56,22 +61,17 @@ def test_utils_extract_attachments():
image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
image_url3 = f"http://localhost/media/{image_key3:s}"
- ydoc = y_py.YDoc() # pylint: disable=no-member
- with ydoc.begin_transaction() as txn:
- xml_fragment = ydoc.get_xml_element("document-store")
+ ydoc = pycrdt.Doc()
+ fragment = pycrdt.XmlFragment(
+ [
+ pycrdt.XmlElement("img", {"src": image_url1}),
+ pycrdt.XmlElement("img", {"src": image_url2}),
+ pycrdt.XmlElement("p", {}, [pycrdt.XmlText(image_url3)]),
+ ]
+ )
+ ydoc["document-store"] = fragment
- xml_image = xml_fragment.push_xml_element(txn, "image")
- xml_image.set_attribute(txn, "src", image_url1)
-
- xml_image = xml_fragment.push_xml_element(txn, "image")
- xml_image.set_attribute(txn, "src", image_url2)
-
- xml_paragraph = xml_fragment.push_xml_element(txn, "paragraph")
- xml_text = xml_paragraph.push_xml_text(txn)
- xml_text.push(txn, image_url3)
-
- update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member
+ update = ydoc.get_update()
base64_string = base64.b64encode(update).decode("utf-8")
-
# image_key2 is missing the "/media/" part and shouldn't get extracted
assert utils.extract_attachments(base64_string) == [image_key1, image_key3]
diff --git a/src/backend/core/tests/test_utils_base64_yjs_to_text.py b/src/backend/core/tests/test_utils_base64_yjs_to_text.py
deleted file mode 100644
index 2ffa168d..00000000
--- a/src/backend/core/tests/test_utils_base64_yjs_to_text.py
+++ /dev/null
@@ -1,70 +0,0 @@
-"""Test util base64_yjs_to_text."""
-
-import base64
-import uuid
-
-import y_py
-
-from core import utils
-from core.utils import base64_yjs_to_text
-
-
-def test_utils_base64_yjs_to_text():
- """
- Test extract_text_from_saved_yjs_document
- This base64 string is an example of what is saved in the database.
- This base64 is generated from the blocknote editor, it contains
- the text \n# *Hello* \n- w**or**ld
- """
- base64_string = (
- "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
- "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
- "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
- "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
- "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
- "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
- "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
- "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
- "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
- "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
- "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
- "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
- "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
- )
-
- assert base64_yjs_to_text(base64_string) == "Hello world"
-
-
-def test_utils_extract_attachments():
- """
- All attachment keys in the document content should be extracted.
- """
- document_id = uuid.uuid4()
- image_key1 = f"{document_id!s}/attachments/{uuid.uuid4()!s}.png"
- image_url1 = f"http://localhost/media/{image_key1:s}"
-
- image_key2 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
- image_url2 = f"http://localhost/{image_key2:s}"
-
- image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
- image_url3 = f"http://localhost/media/{image_key3:s}"
-
- ydoc = y_py.YDoc() # pylint: disable=no-member
- with ydoc.begin_transaction() as txn:
- xml_fragment = ydoc.get_xml_element("document-store")
-
- xml_image = xml_fragment.push_xml_element(txn, "image")
- xml_image.set_attribute(txn, "src", image_url1)
-
- xml_image = xml_fragment.push_xml_element(txn, "image")
- xml_image.set_attribute(txn, "src", image_url2)
-
- xml_paragraph = xml_fragment.push_xml_element(txn, "paragraph")
- xml_text = xml_paragraph.push_xml_text(txn)
- xml_text.push(txn, image_url3)
-
- update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member
- base64_string = base64.b64encode(update).decode("utf-8")
-
- # image_url3 is missing the "/media/" part and shouldn't get extracted
- assert utils.extract_attachments(base64_string) == [image_key1, image_key3]
diff --git a/src/backend/core/utils.py b/src/backend/core/utils.py
index 0378c323..780431f4 100644
--- a/src/backend/core/utils.py
+++ b/src/backend/core/utils.py
@@ -3,7 +3,7 @@
import base64
import re
-import y_py as Y
+import pycrdt
from bs4 import BeautifulSoup
from core import enums
@@ -52,19 +52,18 @@ def base64_yjs_to_xml(base64_string):
"""Extract xml from base64 yjs document."""
decoded_bytes = base64.b64decode(base64_string)
- uint8_array = bytearray(decoded_bytes)
- doc = Y.YDoc() # pylint: disable=E1101
- Y.apply_update(doc, uint8_array) # pylint: disable=E1101
- return str(doc.get_xml_element("document-store"))
+ doc = pycrdt.Doc()
+ doc.apply_update(decoded_bytes)
+ return str(doc.get("document-store", type=pycrdt.XmlFragment))
def base64_yjs_to_text(base64_string):
"""Extract text from base64 yjs document."""
blocknote_structure = base64_yjs_to_xml(base64_string)
- soup = BeautifulSoup(blocknote_structure, "html.parser")
- return soup.get_text(separator=" ").strip()
+ soup = BeautifulSoup(blocknote_structure, "lxml-xml")
+ return soup.get_text(separator=" ", strip=True)
def extract_attachments(content):
diff --git a/src/backend/pyproject.toml b/src/backend/pyproject.toml
index e25f433a..20fbfc15 100644
--- a/src/backend/pyproject.toml
+++ b/src/backend/pyproject.toml
@@ -47,18 +47,19 @@ dependencies = [
"factory_boy==3.3.3",
"gunicorn==23.0.0",
"jsonschema==4.23.0",
+ "lxml==5.3.1",  # backs BeautifulSoup's "lxml-xml" parser used in core/utils.py
"markdown==3.7",
"mozilla-django-oidc==4.0.1",
"nested-multipart-parser==1.5.0",
"openai==1.68.2",
"psycopg[binary]==3.2.6",
+ "pycrdt==0.12.10",
"PyJWT==2.10.1",
"python-magic==0.4.27",
"requests==2.32.3",
"sentry-sdk==2.24.0",
"url-normalize==1.4.3",
"whitenoise==6.9.0",
- "y-py==0.6.2",
]
[project.urls]