♻️(back) replace Ypy by pycrdt

Ypy is deprecated and unmaintained, and we have problems parsing
existing documents with it. We replace it with pycrdt, a library that is
actively maintained and does not have the issues we have with Ypy.
This commit is contained in:
Manuel Raynaud
2025-03-26 23:23:59 +01:00
parent c0dfb4b6b3
commit a5b9169eb6
7 changed files with 55 additions and 123 deletions

View File

@@ -11,9 +11,9 @@ from django.conf import settings
from django.core.files.storage import default_storage from django.core.files.storage import default_storage
from django.utils import timezone from django.utils import timezone
import pycrdt
import pytest import pytest
import requests import requests
import y_py
from rest_framework.test import APIClient from rest_framework.test import APIClient
from core import factories, models from core import factories, models
@@ -84,13 +84,14 @@ def test_api_documents_duplicate_success(index):
image_refs = [get_image_refs(doc_id) for doc_id in document_ids] image_refs = [get_image_refs(doc_id) for doc_id in document_ids]
# Create document content with the first image only # Create document content with the first image only
ydoc = y_py.YDoc() # pylint: disable=no-member ydoc = pycrdt.Doc()
with ydoc.begin_transaction() as txn: fragment = pycrdt.XmlFragment(
xml_fragment = ydoc.get_xml_element("document-store") [
xml_fragment.push_xml_element(txn, "image").set_attribute( pycrdt.XmlElement("img", {"src": image_refs[0][1]}),
txn, "src", image_refs[0][1] ]
) )
update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member ydoc["document-store"] = fragment
update = ydoc.get_update()
base64_content = base64.b64encode(update).decode("utf-8") base64_content = base64.b64encode(update).decode("utf-8")
# Create documents # Create documents

View File

@@ -5,8 +5,8 @@ Test extract-attachments on document update in docs core app.
import base64 import base64
from uuid import uuid4 from uuid import uuid4
import pycrdt
import pytest import pytest
import y_py
from rest_framework.test import APIClient from rest_framework.test import APIClient
from core import factories from core import factories
@@ -16,14 +16,15 @@ pytestmark = pytest.mark.django_db
def get_ydoc_with_mages(image_keys): def get_ydoc_with_mages(image_keys):
"""Return a ydoc from text for testing purposes.""" """Return a ydoc from text for testing purposes."""
ydoc = y_py.YDoc() # pylint: disable=no-member ydoc = pycrdt.Doc()
with ydoc.begin_transaction() as txn: fragment = pycrdt.XmlFragment(
xml_fragment = ydoc.get_xml_element("document-store") [
for key in image_keys: pycrdt.XmlElement("img", {"src": f"http://localhost/media/{key:s}"})
xml_image = xml_fragment.push_xml_element(txn, "image") for key in image_keys
xml_image.set_attribute(txn, "src", f"http://localhost/media/{key:s}") ]
)
update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member ydoc["document-store"] = fragment
update = ydoc.get_update()
return base64.b64encode(update).decode("utf-8") return base64.b64encode(update).decode("utf-8")

View File

@@ -4,8 +4,8 @@ import uuid
from django.core.files.base import ContentFile from django.core.files.base import ContentFile
from django.core.files.storage import default_storage from django.core.files.storage import default_storage
import pycrdt
import pytest import pytest
import y_py
from core import models from core import models
@@ -27,14 +27,13 @@ def test_populate_attachments_on_all_documents(migrator):
# Create document content with an image # Create document content with an image
file_key = f"{old_doc_with_attachments.id!s}/file" file_key = f"{old_doc_with_attachments.id!s}/file"
ydoc = y_py.YDoc() # pylint: disable=no-member
image_key = f"{old_doc_with_attachments.id!s}/attachments/{uuid.uuid4()!s}.png" image_key = f"{old_doc_with_attachments.id!s}/attachments/{uuid.uuid4()!s}.png"
with ydoc.begin_transaction() as txn: ydoc = pycrdt.Doc()
xml_fragment = ydoc.get_xml_element("document-store") fragment = pycrdt.XmlFragment(
xml_fragment.push_xml_element(txn, "image").set_attribute( [pycrdt.XmlElement("img", {"src": f"http://localhost/media/{image_key:s}"})]
txn, "src", f"http://localhost/media/{image_key:s}" )
) ydoc["document-store"] = fragment
update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member update = ydoc.get_update()
base64_content = base64.b64encode(update).decode("utf-8") base64_content = base64.b64encode(update).decode("utf-8")
bytes_content = base64_content.encode("utf-8") bytes_content = base64_content.encode("utf-8")
content_file = ContentFile(bytes_content) content_file = ContentFile(bytes_content)

View File

@@ -3,7 +3,7 @@
import base64 import base64
import uuid import uuid
import y_py import pycrdt
from core import utils from core import utils
@@ -29,17 +29,22 @@ TEST_BASE64_STRING = (
def test_utils_base64_yjs_to_text(): def test_utils_base64_yjs_to_text():
"""Test extract text from saved yjs document""" """Test extract text from saved yjs document"""
assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello world" assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello w or ld"
def test_utils_base64_yjs_to_xml(): def test_utils_base64_yjs_to_xml():
"""Test extract xml from saved yjs document""" """Test extract xml from saved yjs document"""
content = utils.base64_yjs_to_xml(TEST_BASE64_STRING) content = utils.base64_yjs_to_xml(TEST_BASE64_STRING)
assert ( assert (
'<heading "level"="1" "textAlignment"="left">Hello</heading>' in content '<heading textAlignment="left" level="1"><italic>Hello</italic></heading>'
or '<heading "textAlignment"="left" "level"="1">Hello</heading>' in content in content
or '<heading level="1" textAlignment="left"><italic>Hello</italic></heading>'
in content
)
assert (
'<bulletListItem textAlignment="left">w<bold>or</bold>ld</bulletListItem>'
in content
) )
assert '<bulletListItem "textAlignment"="left">world</bulletListItem>' in content
def test_utils_extract_attachments(): def test_utils_extract_attachments():
@@ -56,22 +61,17 @@ def test_utils_extract_attachments():
image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png" image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
image_url3 = f"http://localhost/media/{image_key3:s}" image_url3 = f"http://localhost/media/{image_key3:s}"
ydoc = y_py.YDoc() # pylint: disable=no-member ydoc = pycrdt.Doc()
with ydoc.begin_transaction() as txn: frag = pycrdt.XmlFragment(
xml_fragment = ydoc.get_xml_element("document-store") [
pycrdt.XmlElement("img", {"src": image_url1}),
pycrdt.XmlElement("img", {"src": image_url2}),
pycrdt.XmlElement("p", {}, [pycrdt.XmlText(image_url3)]),
]
)
ydoc["document-store"] = frag
xml_image = xml_fragment.push_xml_element(txn, "image") update = ydoc.get_update()
xml_image.set_attribute(txn, "src", image_url1)
xml_image = xml_fragment.push_xml_element(txn, "image")
xml_image.set_attribute(txn, "src", image_url2)
xml_paragraph = xml_fragment.push_xml_element(txn, "paragraph")
xml_text = xml_paragraph.push_xml_text(txn)
xml_text.push(txn, image_url3)
update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member
base64_string = base64.b64encode(update).decode("utf-8") base64_string = base64.b64encode(update).decode("utf-8")
# image_key2 is missing the "/media/" part and shouldn't get extracted # image_key2 is missing the "/media/" part and shouldn't get extracted
assert utils.extract_attachments(base64_string) == [image_key1, image_key3] assert utils.extract_attachments(base64_string) == [image_key1, image_key3]

View File

@@ -1,70 +0,0 @@
"""Test util base64_yjs_to_text."""
import base64
import uuid
import y_py
from core import utils
from core.utils import base64_yjs_to_text
def test_utils_base64_yjs_to_text():
"""
Test extract_text_from_saved_yjs_document
This base64 string is an example of what is saved in the database.
This base64 is generated from the blocknote editor, it contains
the text \n# *Hello* \n- w**or**ld
"""
base64_string = (
"AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
"aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
"ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
"1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
"YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
"AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
"BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
"bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
"ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
"ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
"bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
"9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
"dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
)
assert base64_yjs_to_text(base64_string) == "Hello world"
def test_utils_extract_attachments():
"""
All attachment keys in the document content should be extracted.
"""
document_id = uuid.uuid4()
image_key1 = f"{document_id!s}/attachments/{uuid.uuid4()!s}.png"
image_url1 = f"http://localhost/media/{image_key1:s}"
image_key2 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
image_url2 = f"http://localhost/{image_key2:s}"
image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
image_url3 = f"http://localhost/media/{image_key3:s}"
ydoc = y_py.YDoc() # pylint: disable=no-member
with ydoc.begin_transaction() as txn:
xml_fragment = ydoc.get_xml_element("document-store")
xml_image = xml_fragment.push_xml_element(txn, "image")
xml_image.set_attribute(txn, "src", image_url1)
xml_image = xml_fragment.push_xml_element(txn, "image")
xml_image.set_attribute(txn, "src", image_url2)
xml_paragraph = xml_fragment.push_xml_element(txn, "paragraph")
xml_text = xml_paragraph.push_xml_text(txn)
xml_text.push(txn, image_url3)
update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member
base64_string = base64.b64encode(update).decode("utf-8")
# image_url3 is missing the "/media/" part and shouldn't get extracted
assert utils.extract_attachments(base64_string) == [image_key1, image_key3]

View File

@@ -3,7 +3,7 @@
import base64 import base64
import re import re
import y_py as Y import pycrdt
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from core import enums from core import enums
@@ -52,19 +52,19 @@ def base64_yjs_to_xml(base64_string):
"""Extract xml from base64 yjs document.""" """Extract xml from base64 yjs document."""
decoded_bytes = base64.b64decode(base64_string) decoded_bytes = base64.b64decode(base64_string)
uint8_array = bytearray(decoded_bytes) # uint8_array = bytearray(decoded_bytes)
doc = Y.YDoc() # pylint: disable=E1101 doc = pycrdt.Doc()
Y.apply_update(doc, uint8_array) # pylint: disable=E1101 doc.apply_update(decoded_bytes)
return str(doc.get_xml_element("document-store")) return str(doc.get("document-store", type=pycrdt.XmlFragment))
def base64_yjs_to_text(base64_string): def base64_yjs_to_text(base64_string):
"""Extract text from base64 yjs document.""" """Extract text from base64 yjs document."""
blocknote_structure = base64_yjs_to_xml(base64_string) blocknote_structure = base64_yjs_to_xml(base64_string)
soup = BeautifulSoup(blocknote_structure, "html.parser") soup = BeautifulSoup(blocknote_structure, "lxml-xml")
return soup.get_text(separator=" ").strip() return soup.get_text(separator=" ", strip=True)
def extract_attachments(content): def extract_attachments(content):

View File

@@ -47,18 +47,19 @@ dependencies = [
"factory_boy==3.3.3", "factory_boy==3.3.3",
"gunicorn==23.0.0", "gunicorn==23.0.0",
"jsonschema==4.23.0", "jsonschema==4.23.0",
"lxml==5.3.1",
"markdown==3.7", "markdown==3.7",
"mozilla-django-oidc==4.0.1", "mozilla-django-oidc==4.0.1",
"nested-multipart-parser==1.5.0", "nested-multipart-parser==1.5.0",
"openai==1.68.2", "openai==1.68.2",
"psycopg[binary]==3.2.6", "psycopg[binary]==3.2.6",
"pycrdt==0.12.10",
"PyJWT==2.10.1", "PyJWT==2.10.1",
"python-magic==0.4.27", "python-magic==0.4.27",
"requests==2.32.3", "requests==2.32.3",
"sentry-sdk==2.24.0", "sentry-sdk==2.24.0",
"url-normalize==1.4.3", "url-normalize==1.4.3",
"whitenoise==6.9.0", "whitenoise==6.9.0",
"y-py==0.6.2",
] ]
[project.urls] [project.urls]