♻️(back) replace Ypy by pycrdt

Ypy is deprecated and unmaintained, and we have problems parsing
existing documents with it. We replace it with pycrdt, an actively
maintained library that does not have the issues we had with Ypy.
This commit is contained in:
Manuel Raynaud
2025-03-26 23:23:59 +01:00
parent c0dfb4b6b3
commit a5b9169eb6
7 changed files with 55 additions and 123 deletions

View File

@@ -11,9 +11,9 @@ from django.conf import settings
from django.core.files.storage import default_storage
from django.utils import timezone
import pycrdt
import pytest
import requests
import y_py
from rest_framework.test import APIClient
from core import factories, models
@@ -84,13 +84,14 @@ def test_api_documents_duplicate_success(index):
image_refs = [get_image_refs(doc_id) for doc_id in document_ids]
# Create document content with the first image only
ydoc = y_py.YDoc() # pylint: disable=no-member
with ydoc.begin_transaction() as txn:
xml_fragment = ydoc.get_xml_element("document-store")
xml_fragment.push_xml_element(txn, "image").set_attribute(
txn, "src", image_refs[0][1]
)
update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member
ydoc = pycrdt.Doc()
fragment = pycrdt.XmlFragment(
[
pycrdt.XmlElement("img", {"src": image_refs[0][1]}),
]
)
ydoc["document-store"] = fragment
update = ydoc.get_update()
base64_content = base64.b64encode(update).decode("utf-8")
# Create documents

View File

@@ -5,8 +5,8 @@ Test extract-attachments on document update in docs core app.
import base64
from uuid import uuid4
import pycrdt
import pytest
import y_py
from rest_framework.test import APIClient
from core import factories
@@ -16,14 +16,15 @@ pytestmark = pytest.mark.django_db
def get_ydoc_with_mages(image_keys):
"""Return a ydoc from text for testing purposes."""
ydoc = y_py.YDoc() # pylint: disable=no-member
with ydoc.begin_transaction() as txn:
xml_fragment = ydoc.get_xml_element("document-store")
for key in image_keys:
xml_image = xml_fragment.push_xml_element(txn, "image")
xml_image.set_attribute(txn, "src", f"http://localhost/media/{key:s}")
update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member
ydoc = pycrdt.Doc()
fragment = pycrdt.XmlFragment(
[
pycrdt.XmlElement("img", {"src": f"http://localhost/media/{key:s}"})
for key in image_keys
]
)
ydoc["document-store"] = fragment
update = ydoc.get_update()
return base64.b64encode(update).decode("utf-8")

View File

@@ -4,8 +4,8 @@ import uuid
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
import pycrdt
import pytest
import y_py
from core import models
@@ -27,14 +27,13 @@ def test_populate_attachments_on_all_documents(migrator):
# Create document content with an image
file_key = f"{old_doc_with_attachments.id!s}/file"
ydoc = y_py.YDoc() # pylint: disable=no-member
image_key = f"{old_doc_with_attachments.id!s}/attachments/{uuid.uuid4()!s}.png"
with ydoc.begin_transaction() as txn:
xml_fragment = ydoc.get_xml_element("document-store")
xml_fragment.push_xml_element(txn, "image").set_attribute(
txn, "src", f"http://localhost/media/{image_key:s}"
)
update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member
ydoc = pycrdt.Doc()
fragment = pycrdt.XmlFragment(
[pycrdt.XmlElement("img", {"src": f"http://localhost/media/{image_key:s}"})]
)
ydoc["document-store"] = fragment
update = ydoc.get_update()
base64_content = base64.b64encode(update).decode("utf-8")
bytes_content = base64_content.encode("utf-8")
content_file = ContentFile(bytes_content)

View File

@@ -3,7 +3,7 @@
import base64
import uuid
import y_py
import pycrdt
from core import utils
@@ -29,17 +29,22 @@ TEST_BASE64_STRING = (
def test_utils_base64_yjs_to_text():
"""Test extract text from saved yjs document"""
assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello world"
assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello w or ld"
def test_utils_base64_yjs_to_xml():
"""Test extract xml from saved yjs document"""
content = utils.base64_yjs_to_xml(TEST_BASE64_STRING)
assert (
'<heading "level"="1" "textAlignment"="left">Hello</heading>' in content
or '<heading "textAlignment"="left" "level"="1">Hello</heading>' in content
'<heading textAlignment="left" level="1"><italic>Hello</italic></heading>'
in content
or '<heading level="1" textAlignment="left"><italic>Hello</italic></heading>'
in content
)
assert (
'<bulletListItem textAlignment="left">w<bold>or</bold>ld</bulletListItem>'
in content
)
assert '<bulletListItem "textAlignment"="left">world</bulletListItem>' in content
def test_utils_extract_attachments():
@@ -56,22 +61,17 @@ def test_utils_extract_attachments():
image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
image_url3 = f"http://localhost/media/{image_key3:s}"
ydoc = y_py.YDoc() # pylint: disable=no-member
with ydoc.begin_transaction() as txn:
xml_fragment = ydoc.get_xml_element("document-store")
ydoc = pycrdt.Doc()
frag = pycrdt.XmlFragment(
[
pycrdt.XmlElement("img", {"src": image_url1}),
pycrdt.XmlElement("img", {"src": image_url2}),
pycrdt.XmlElement("p", {}, [pycrdt.XmlText(image_url3)]),
]
)
ydoc["document-store"] = frag
xml_image = xml_fragment.push_xml_element(txn, "image")
xml_image.set_attribute(txn, "src", image_url1)
xml_image = xml_fragment.push_xml_element(txn, "image")
xml_image.set_attribute(txn, "src", image_url2)
xml_paragraph = xml_fragment.push_xml_element(txn, "paragraph")
xml_text = xml_paragraph.push_xml_text(txn)
xml_text.push(txn, image_url3)
update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member
update = ydoc.get_update()
base64_string = base64.b64encode(update).decode("utf-8")
# image_key2 is missing the "/media/" part and shouldn't get extracted
assert utils.extract_attachments(base64_string) == [image_key1, image_key3]

View File

@@ -1,70 +0,0 @@
"""Test util base64_yjs_to_text."""
import base64
import uuid
import y_py
from core import utils
from core.utils import base64_yjs_to_text
def test_utils_base64_yjs_to_text():
"""
Test extract_text_from_saved_yjs_document
This base64 string is an example of what is saved in the database.
This base64 is generated from the blocknote editor, it contains
the text \n# *Hello* \n- w**or**ld
"""
base64_string = (
"AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
"aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
"ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
"1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
"YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
"AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
"BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
"bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
"ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
"ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
"bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
"9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
"dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
)
assert base64_yjs_to_text(base64_string) == "Hello world"
def test_utils_extract_attachments():
"""
All attachment keys in the document content should be extracted.
"""
document_id = uuid.uuid4()
image_key1 = f"{document_id!s}/attachments/{uuid.uuid4()!s}.png"
image_url1 = f"http://localhost/media/{image_key1:s}"
image_key2 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
image_url2 = f"http://localhost/{image_key2:s}"
image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
image_url3 = f"http://localhost/media/{image_key3:s}"
ydoc = y_py.YDoc() # pylint: disable=no-member
with ydoc.begin_transaction() as txn:
xml_fragment = ydoc.get_xml_element("document-store")
xml_image = xml_fragment.push_xml_element(txn, "image")
xml_image.set_attribute(txn, "src", image_url1)
xml_image = xml_fragment.push_xml_element(txn, "image")
xml_image.set_attribute(txn, "src", image_url2)
xml_paragraph = xml_fragment.push_xml_element(txn, "paragraph")
xml_text = xml_paragraph.push_xml_text(txn)
xml_text.push(txn, image_url3)
update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member
base64_string = base64.b64encode(update).decode("utf-8")
# image_url2 is missing the "/media/" part and shouldn't get extracted
assert utils.extract_attachments(base64_string) == [image_key1, image_key3]

View File

@@ -3,7 +3,7 @@
import base64
import re
import y_py as Y
import pycrdt
from bs4 import BeautifulSoup
from core import enums
@@ -52,19 +52,19 @@ def base64_yjs_to_xml(base64_string):
"""Extract xml from base64 yjs document."""
decoded_bytes = base64.b64decode(base64_string)
uint8_array = bytearray(decoded_bytes)
# uint8_array = bytearray(decoded_bytes)
doc = Y.YDoc() # pylint: disable=E1101
Y.apply_update(doc, uint8_array) # pylint: disable=E1101
return str(doc.get_xml_element("document-store"))
doc = pycrdt.Doc()
doc.apply_update(decoded_bytes)
return str(doc.get("document-store", type=pycrdt.XmlFragment))
def base64_yjs_to_text(base64_string):
"""Extract text from base64 yjs document."""
blocknote_structure = base64_yjs_to_xml(base64_string)
soup = BeautifulSoup(blocknote_structure, "html.parser")
return soup.get_text(separator=" ").strip()
soup = BeautifulSoup(blocknote_structure, "lxml-xml")
return soup.get_text(separator=" ", strip=True)
def extract_attachments(content):