diff --git a/src/backend/core/tests/documents/test_api_documents_duplicate.py b/src/backend/core/tests/documents/test_api_documents_duplicate.py index ec13d878..82acfa98 100644 --- a/src/backend/core/tests/documents/test_api_documents_duplicate.py +++ b/src/backend/core/tests/documents/test_api_documents_duplicate.py @@ -11,9 +11,9 @@ from django.conf import settings from django.core.files.storage import default_storage from django.utils import timezone +import pycrdt import pytest import requests -import y_py from rest_framework.test import APIClient from core import factories, models @@ -84,13 +84,14 @@ def test_api_documents_duplicate_success(index): image_refs = [get_image_refs(doc_id) for doc_id in document_ids] # Create document content with the first image only - ydoc = y_py.YDoc() # pylint: disable=no-member - with ydoc.begin_transaction() as txn: - xml_fragment = ydoc.get_xml_element("document-store") - xml_fragment.push_xml_element(txn, "image").set_attribute( - txn, "src", image_refs[0][1] - ) - update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member + ydoc = pycrdt.Doc() + fragment = pycrdt.XmlFragment( + [ + pycrdt.XmlElement("img", {"src": image_refs[0][1]}), + ] + ) + ydoc["document-store"] = fragment + update = ydoc.get_update() base64_content = base64.b64encode(update).decode("utf-8") # Create documents diff --git a/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py b/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py index f18c73db..b52f83e3 100644 --- a/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py +++ b/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py @@ -5,8 +5,8 @@ Test extract-attachments on document update in docs core app. import base64 from uuid import uuid4 +import pycrdt import pytest -import y_py from rest_framework.test import APIClient from core import factories @@ -16,14 +16,15 @@ pytestmark = pytest.mark.django_db def get_ydoc_with_mages(image_keys): """Return a ydoc from text for testing purposes.""" - ydoc = y_py.YDoc() # pylint: disable=no-member - with ydoc.begin_transaction() as txn: - xml_fragment = ydoc.get_xml_element("document-store") - for key in image_keys: - xml_image = xml_fragment.push_xml_element(txn, "image") - xml_image.set_attribute(txn, "src", f"http://localhost/media/{key:s}") - - update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member + ydoc = pycrdt.Doc() + fragment = pycrdt.XmlFragment( + [ + pycrdt.XmlElement("img", {"src": f"http://localhost/media/{key:s}"}) + for key in image_keys + ] + ) + ydoc["document-store"] = fragment + update = ydoc.get_update() return base64.b64encode(update).decode("utf-8") diff --git a/src/backend/core/tests/migrations/test_migrations_0020_remove_is_public_add_field_attachments_and_duplicated_from.py b/src/backend/core/tests/migrations/test_migrations_0020_remove_is_public_add_field_attachments_and_duplicated_from.py index 94ce5375..f94e2a1e 100644 --- a/src/backend/core/tests/migrations/test_migrations_0020_remove_is_public_add_field_attachments_and_duplicated_from.py +++ b/src/backend/core/tests/migrations/test_migrations_0020_remove_is_public_add_field_attachments_and_duplicated_from.py @@ -4,8 +4,8 @@ import uuid from django.core.files.base import ContentFile from django.core.files.storage import default_storage +import pycrdt import pytest -import y_py from core import models @@ -27,14 +27,13 @@ def test_populate_attachments_on_all_documents(migrator): # Create document content with an image file_key = f"{old_doc_with_attachments.id!s}/file" - ydoc = y_py.YDoc() # pylint: disable=no-member image_key = f"{old_doc_with_attachments.id!s}/attachments/{uuid.uuid4()!s}.png" - with ydoc.begin_transaction() as txn: - xml_fragment = ydoc.get_xml_element("document-store") - xml_fragment.push_xml_element(txn, "image").set_attribute( - txn, "src", f"http://localhost/media/{image_key:s}" - ) - update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member + ydoc = pycrdt.Doc() + fragment = pycrdt.XmlFragment( + [pycrdt.XmlElement("img", {"src": f"http://localhost/media/{image_key:s}"})] + ) + ydoc["document-store"] = fragment + update = ydoc.get_update() base64_content = base64.b64encode(update).decode("utf-8") bytes_content = base64_content.encode("utf-8") content_file = ContentFile(bytes_content) diff --git a/src/backend/core/tests/test_utils.py b/src/backend/core/tests/test_utils.py index 3fea93ed..37b2e32d 100644 --- a/src/backend/core/tests/test_utils.py +++ b/src/backend/core/tests/test_utils.py @@ -3,7 +3,7 @@ import base64 import uuid -import y_py +import pycrdt from core import utils @@ -29,17 +29,22 @@ TEST_BASE64_STRING = ( def test_utils_base64_yjs_to_text(): """Test extract text from saved yjs document""" - assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello world" + assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello w or ld" def test_utils_base64_yjs_to_xml(): """Test extract xml from saved yjs document""" content = utils.base64_yjs_to_xml(TEST_BASE64_STRING) assert ( - 'Hello' in content - or 'Hello' in content + 'Hello' + in content + or 'Hello' + in content + ) + assert ( + 'world' + in content ) - assert 'world' in content def test_utils_extract_attachments(): @@ -56,22 +61,17 @@ def test_utils_extract_attachments(): image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png" image_url3 = f"http://localhost/media/{image_key3:s}" - ydoc = y_py.YDoc() # pylint: disable=no-member - with ydoc.begin_transaction() as txn: - xml_fragment = ydoc.get_xml_element("document-store") + ydoc = pycrdt.Doc() + frag = pycrdt.XmlFragment( + [ + pycrdt.XmlElement("img", {"src": image_url1}), + pycrdt.XmlElement("img", {"src": image_url2}), + pycrdt.XmlElement("p", {}, [pycrdt.XmlText(image_url3)]), + ] + ) + ydoc["document-store"] = frag - xml_image = xml_fragment.push_xml_element(txn, "image") - xml_image.set_attribute(txn, "src", image_url1) - - xml_image = xml_fragment.push_xml_element(txn, "image") - xml_image.set_attribute(txn, "src", image_url2) - - xml_paragraph = xml_fragment.push_xml_element(txn, "paragraph") - xml_text = xml_paragraph.push_xml_text(txn) - xml_text.push(txn, image_url3) - - update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member + update = ydoc.get_update() base64_string = base64.b64encode(update).decode("utf-8") - # image_key2 is missing the "/media/" part and shouldn't get extracted assert utils.extract_attachments(base64_string) == [image_key1, image_key3] diff --git a/src/backend/core/tests/test_utils_base64_yjs_to_text.py b/src/backend/core/tests/test_utils_base64_yjs_to_text.py deleted file mode 100644 index 2ffa168d..00000000 --- a/src/backend/core/tests/test_utils_base64_yjs_to_text.py +++ /dev/null @@ -1,70 +0,0 @@ -"""Test util base64_yjs_to_text.""" - -import base64 -import uuid - -import y_py - -from core import utils -from core.utils import base64_yjs_to_text - - -def test_utils_base64_yjs_to_text(): - """ - Test extract_text_from_saved_yjs_document - This base64 string is an example of what is saved in the database. - This base64 is generated from the blocknote editor, it contains - the text \n# *Hello* \n- w**or**ld - """ - base64_string = ( - "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh" - "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI" - "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y" - "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm" - "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y" - "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt" - "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE" - "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck" - "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH" - "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv" - "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA" - "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J" - "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA" - ) - - assert base64_yjs_to_text(base64_string) == "Hello world" - - -def test_utils_extract_attachments(): - """ - All attachment keys in the document content should be extracted. - """ - document_id = uuid.uuid4() - image_key1 = f"{document_id!s}/attachments/{uuid.uuid4()!s}.png" - image_url1 = f"http://localhost/media/{image_key1:s}" - - image_key2 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png" - image_url2 = f"http://localhost/{image_key2:s}" - - image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png" - image_url3 = f"http://localhost/media/{image_key3:s}" - - ydoc = y_py.YDoc() # pylint: disable=no-member - with ydoc.begin_transaction() as txn: - xml_fragment = ydoc.get_xml_element("document-store") - - xml_image = xml_fragment.push_xml_element(txn, "image") - xml_image.set_attribute(txn, "src", image_url1) - - xml_image = xml_fragment.push_xml_element(txn, "image") - xml_image.set_attribute(txn, "src", image_url2) - - xml_paragraph = xml_fragment.push_xml_element(txn, "paragraph") - xml_text = xml_paragraph.push_xml_text(txn) - xml_text.push(txn, image_url3) - - update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member - base64_string = base64.b64encode(update).decode("utf-8") - - # image_url3 is missing the "/media/" part and shouldn't get extracted - assert utils.extract_attachments(base64_string) == [image_key1, image_key3] diff --git a/src/backend/core/utils.py b/src/backend/core/utils.py index 0378c323..780431f4 100644 --- a/src/backend/core/utils.py +++ b/src/backend/core/utils.py @@ -3,7 +3,7 @@ import base64 import re -import y_py as Y +import pycrdt from bs4 import BeautifulSoup from core import enums @@ -52,19 +52,19 @@ def base64_yjs_to_xml(base64_string): """Extract xml from base64 yjs document.""" decoded_bytes = base64.b64decode(base64_string) - uint8_array = bytearray(decoded_bytes) + # uint8_array = bytearray(decoded_bytes) - doc = Y.YDoc() # pylint: disable=E1101 - Y.apply_update(doc, uint8_array) # pylint: disable=E1101 - return str(doc.get_xml_element("document-store")) + doc = pycrdt.Doc() + doc.apply_update(decoded_bytes) + return str(doc.get("document-store", type=pycrdt.XmlFragment)) def base64_yjs_to_text(base64_string): """Extract text from base64 yjs document.""" blocknote_structure = base64_yjs_to_xml(base64_string) - soup = BeautifulSoup(blocknote_structure, "html.parser") - return soup.get_text(separator=" ").strip() + soup = BeautifulSoup(blocknote_structure, "lxml-xml") + return soup.get_text(separator=" ", strip=True) def extract_attachments(content): diff --git a/src/backend/pyproject.toml b/src/backend/pyproject.toml index e25f433a..20fbfc15 100644 --- a/src/backend/pyproject.toml +++ b/src/backend/pyproject.toml @@ -47,18 +47,19 @@ dependencies = [ "factory_boy==3.3.3", "gunicorn==23.0.0", "jsonschema==4.23.0", + "lxml==5.3.1", "markdown==3.7", "mozilla-django-oidc==4.0.1", "nested-multipart-parser==1.5.0", "openai==1.68.2", "psycopg[binary]==3.2.6", + "pycrdt==0.12.10", "PyJWT==2.10.1", "python-magic==0.4.27", "requests==2.32.3", "sentry-sdk==2.24.0", "url-normalize==1.4.3", "whitenoise==6.9.0", - "y-py==0.6.2", ] [project.urls]