♻️(back) replace Ypy by pycrdt

Ypy is deprecated and unmaintained, and we have problems parsing existing documents with it. We replace it with pycrdt, a library that is actively maintained and does not have the issues we have with Ypy.
2025-03-26 23:23:59 +01:00
parent c0dfb4b6b3
commit a5b9169eb6
7 changed files with 55 additions and 123 deletions
--- a/src/backend/core/tests/documents/test_api_documents_duplicate.py
+++ b/src/backend/core/tests/documents/test_api_documents_duplicate.py
@@ -11,9 +11,9 @@ from django.conf import settings
 from django.core.files.storage import default_storage
 from django.utils import timezone
 import pycrdt
 import pytest
 import requests
 import y_py
 from rest_framework.test import APIClient
 from core import factories, models
@@ -84,13 +84,14 @@ def test_api_documents_duplicate_success(index):
    image_refs = [get_image_refs(doc_id) for doc_id in document_ids]
    # Create document content with the first image only
-    ydoc = y_py.YDoc()  # pylint: disable=no-member
+    ydoc = pycrdt.Doc()
-    with ydoc.begin_transaction() as txn:
+    fragment = pycrdt.XmlFragment(
-        xml_fragment = ydoc.get_xml_element("document-store")
+        [
-        xml_fragment.push_xml_element(txn, "image").set_attribute(
+            pycrdt.XmlElement("img", {"src": image_refs[0][1]}),
-            txn, "src", image_refs[0][1]
+        ]
-        )
+    )
-    update = y_py.encode_state_as_update(ydoc)  # pylint: disable=no-member
+    ydoc["document-store"] = fragment
    update = ydoc.get_update()
    base64_content = base64.b64encode(update).decode("utf-8")
    # Create documents
--- a/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py
+++ b/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py
@@ -5,8 +5,8 @@ Test extract-attachments on document update in docs core app.
 import base64
 from uuid import uuid4
 import pycrdt
 import pytest
 import y_py
 from rest_framework.test import APIClient
 from core import factories
@@ -16,14 +16,15 @@ pytestmark = pytest.mark.django_db
 def get_ydoc_with_mages(image_keys):
    """Return a ydoc from text for testing purposes."""
-    ydoc = y_py.YDoc()  # pylint: disable=no-member
+    ydoc = pycrdt.Doc()
-    with ydoc.begin_transaction() as txn:
+    fragment = pycrdt.XmlFragment(
-        xml_fragment = ydoc.get_xml_element("document-store")
+        [
-        for key in image_keys:
+            pycrdt.XmlElement("img", {"src": f"http://localhost/media/{key:s}"})
-            xml_image = xml_fragment.push_xml_element(txn, "image")
+            for key in image_keys
-            xml_image.set_attribute(txn, "src", f"http://localhost/media/{key:s}")
+        ]
-
+    )
-    update = y_py.encode_state_as_update(ydoc)  # pylint: disable=no-member
+    ydoc["document-store"] = fragment
    update = ydoc.get_update()
    return base64.b64encode(update).decode("utf-8")
--- a/src/backend/core/tests/migrations/test_migrations_0020_remove_is_public_add_field_attachments_and_duplicated_from.py
+++ b/src/backend/core/tests/migrations/test_migrations_0020_remove_is_public_add_field_attachments_and_duplicated_from.py
@@ -4,8 +4,8 @@ import uuid
 from django.core.files.base import ContentFile
 from django.core.files.storage import default_storage
 import pycrdt
 import pytest
 import y_py
 from core import models
@@ -27,14 +27,13 @@ def test_populate_attachments_on_all_documents(migrator):
    # Create document content with an image
    file_key = f"{old_doc_with_attachments.id!s}/file"
    ydoc = y_py.YDoc()  # pylint: disable=no-member
    image_key = f"{old_doc_with_attachments.id!s}/attachments/{uuid.uuid4()!s}.png"
-    with ydoc.begin_transaction() as txn:
+    ydoc = pycrdt.Doc()
-        xml_fragment = ydoc.get_xml_element("document-store")
+    fragment = pycrdt.XmlFragment(
-        xml_fragment.push_xml_element(txn, "image").set_attribute(
+        [pycrdt.XmlElement("img", {"src": f"http://localhost/media/{image_key:s}"})]
-            txn, "src", f"http://localhost/media/{image_key:s}"
+    )
-        )
+    ydoc["document-store"] = fragment
-    update = y_py.encode_state_as_update(ydoc)  # pylint: disable=no-member
+    update = ydoc.get_update()
    base64_content = base64.b64encode(update).decode("utf-8")
    bytes_content = base64_content.encode("utf-8")
    content_file = ContentFile(bytes_content)
--- a/src/backend/core/tests/test_utils.py
+++ b/src/backend/core/tests/test_utils.py
@@ -3,7 +3,7 @@
 import base64
 import uuid
-import y_py
+import pycrdt
 from core import utils
@@ -29,17 +29,22 @@ TEST_BASE64_STRING = (
 def test_utils_base64_yjs_to_text():
    """Test extract text from saved yjs document"""
-    assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello world"
+    assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello w or ld"
 def test_utils_base64_yjs_to_xml():
    """Test extract xml from saved yjs document"""
    content = utils.base64_yjs_to_xml(TEST_BASE64_STRING)
    assert (
-        '<heading "level"="1" "textAlignment"="left">Hello</heading>' in content
+        '<heading textAlignment="left" level="1"><italic>Hello</italic></heading>'
-        or '<heading "textAlignment"="left" "level"="1">Hello</heading>' in content
+        in content
        or '<heading level="1" textAlignment="left"><italic>Hello</italic></heading>'
        in content
    )
    assert (
        '<bulletListItem textAlignment="left">w<bold>or</bold>ld</bulletListItem>'
        in content
    )
    assert '<bulletListItem "textAlignment"="left">world</bulletListItem>' in content
 def test_utils_extract_attachments():
@@ -56,22 +61,17 @@ def test_utils_extract_attachments():
    image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
    image_url3 = f"http://localhost/media/{image_key3:s}"
-    ydoc = y_py.YDoc()  # pylint: disable=no-member
+    ydoc = pycrdt.Doc()
-    with ydoc.begin_transaction() as txn:
+    frag = pycrdt.XmlFragment(
-        xml_fragment = ydoc.get_xml_element("document-store")
+        [
            pycrdt.XmlElement("img", {"src": image_url1}),
            pycrdt.XmlElement("img", {"src": image_url2}),
            pycrdt.XmlElement("p", {}, [pycrdt.XmlText(image_url3)]),
        ]
    )
    ydoc["document-store"] = frag
-        xml_image = xml_fragment.push_xml_element(txn, "image")
+    update = ydoc.get_update()
        xml_image.set_attribute(txn, "src", image_url1)
        xml_image = xml_fragment.push_xml_element(txn, "image")
        xml_image.set_attribute(txn, "src", image_url2)
        xml_paragraph = xml_fragment.push_xml_element(txn, "paragraph")
        xml_text = xml_paragraph.push_xml_text(txn)
        xml_text.push(txn, image_url3)
    update = y_py.encode_state_as_update(ydoc)  # pylint: disable=no-member
    base64_string = base64.b64encode(update).decode("utf-8")
    # image_key2 is missing the "/media/" part and shouldn't get extracted
    assert utils.extract_attachments(base64_string) == [image_key1, image_key3]
--- a/src/backend/core/tests/test_utils_base64_yjs_to_text.py
+++ b/src/backend/core/tests/test_utils_base64_yjs_to_text.py
@@ -1,70 +0,0 @@
 """Test util base64_yjs_to_text."""
 import base64
 import uuid
 import y_py
 from core import utils
 from core.utils import base64_yjs_to_text
 def test_utils_base64_yjs_to_text():
    """
    Test extract_text_from_saved_yjs_document
    This base64 string is an example of what is saved in the database.
    This base64 is generated from the blocknote editor, it contains
    the text \n# *Hello* \n- w**or**ld
    """
    base64_string = (
        "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
        "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
        "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
        "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
        "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
        "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
        "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
        "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
        "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
        "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
        "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
        "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
        "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
    )
    assert base64_yjs_to_text(base64_string) == "Hello world"
 def test_utils_extract_attachments():
    """
    All attachment keys in the document content should be extracted.
    """
    document_id = uuid.uuid4()
    image_key1 = f"{document_id!s}/attachments/{uuid.uuid4()!s}.png"
    image_url1 = f"http://localhost/media/{image_key1:s}"
    image_key2 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
    image_url2 = f"http://localhost/{image_key2:s}"
    image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
    image_url3 = f"http://localhost/media/{image_key3:s}"
    ydoc = y_py.YDoc()  # pylint: disable=no-member
    with ydoc.begin_transaction() as txn:
        xml_fragment = ydoc.get_xml_element("document-store")
        xml_image = xml_fragment.push_xml_element(txn, "image")
        xml_image.set_attribute(txn, "src", image_url1)
        xml_image = xml_fragment.push_xml_element(txn, "image")
        xml_image.set_attribute(txn, "src", image_url2)
        xml_paragraph = xml_fragment.push_xml_element(txn, "paragraph")
        xml_text = xml_paragraph.push_xml_text(txn)
        xml_text.push(txn, image_url3)
    update = y_py.encode_state_as_update(ydoc)  # pylint: disable=no-member
    base64_string = base64.b64encode(update).decode("utf-8")
    # image_url3 is missing the "/media/" part and shouldn't get extracted
    assert utils.extract_attachments(base64_string) == [image_key1, image_key3]
--- a/src/backend/core/utils.py
+++ b/src/backend/core/utils.py
@@ -3,7 +3,7 @@
 import base64
 import re
-import y_py as Y
+import pycrdt
 from bs4 import BeautifulSoup
 from core import enums
@@ -52,19 +52,19 @@ def base64_yjs_to_xml(base64_string):
    """Extract xml from base64 yjs document."""
    decoded_bytes = base64.b64decode(base64_string)
-    uint8_array = bytearray(decoded_bytes)
+    # uint8_array = bytearray(decoded_bytes)
-    doc = Y.YDoc()  # pylint: disable=E1101
+    doc = pycrdt.Doc()
-    Y.apply_update(doc, uint8_array)  # pylint: disable=E1101
+    doc.apply_update(decoded_bytes)
-    return str(doc.get_xml_element("document-store"))
+    return str(doc.get("document-store", type=pycrdt.XmlFragment))
 def base64_yjs_to_text(base64_string):
    """Extract text from base64 yjs document."""
    blocknote_structure = base64_yjs_to_xml(base64_string)
-    soup = BeautifulSoup(blocknote_structure, "html.parser")
+    soup = BeautifulSoup(blocknote_structure, "lxml-xml")
-    return soup.get_text(separator=" ").strip()
+    return soup.get_text(separator=" ", strip=True)
 def extract_attachments(content):
--- a/src/backend/pyproject.toml
+++ b/src/backend/pyproject.toml
@@ -47,18 +47,19 @@ dependencies = [
    "factory_boy==3.3.3",
    "gunicorn==23.0.0",
    "jsonschema==4.23.0",
    "lxml==5.3.1",
    "markdown==3.7",
    "mozilla-django-oidc==4.0.1",
    "nested-multipart-parser==1.5.0",
    "openai==1.68.2",
    "psycopg[binary]==3.2.6",
    "pycrdt==0.12.10", 
    "PyJWT==2.10.1",
    "python-magic==0.4.27",
    "requests==2.32.3",
    "sentry-sdk==2.24.0",
    "url-normalize==1.4.3",
    "whitenoise==6.9.0",
    "y-py==0.6.2",
 ]
 [project.urls]