From 710bbf512ce588222e4f5c044fdb63f35076e77e Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Fri, 27 Dec 2024 10:19:16 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8(backend)=20add=20util=20to=20extract?= =?UTF-8?q?=20text=20from=20Ydoc=20content?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document content is stored in the Ydoc format. We need a util to extract it as xml/text. --- CHANGELOG.md | 1 + Dockerfile | 7 ++++ src/backend/core/factories.py | 18 ++++++++- src/backend/core/tests/test_utils.py | 37 +++++++++++++++++++ .../tests/test_utils_base64_yjs_to_text.py | 29 +++++++++++++++ src/backend/core/utils.py | 25 +++++++++++++ src/backend/pyproject.toml | 7 ++++ 7 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 src/backend/core/tests/test_utils.py create mode 100644 src/backend/core/tests/test_utils_base64_yjs_to_text.py create mode 100644 src/backend/core/utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f2e66522..d4f738fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -132,6 +132,7 @@ and this project adheres to ## Added +- ⚗️(backend) add util to extract text from base64 yjs document - ✨(backend) add soft delete and restore API endpoints to documents #516 - ✨(backend) allow organizing documents in a tree structure #516 - ✨(backend) add "excerpt" field to document list serializer #516 diff --git a/Dockerfile b/Dockerfile index 60f464ee..23f26b70 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,6 +15,13 @@ FROM base AS back-builder WORKDIR /builder +# Install build-base, libffi-dev, Rust and Cargo using Alpine's package manager +RUN apk add --no-cache \ + build-base \ + libffi-dev \ + rust \ + cargo + # Copy required python dependencies COPY ./src/backend /builder diff --git a/src/backend/core/factories.py b/src/backend/core/factories.py index 3f2c085f..d0a641d8 100644 --- a/src/backend/core/factories.py +++ b/src/backend/core/factories.py @@ -13,6 +13,22 @@ from core import models fake = Faker() 
+YDOC_HELLO_WORLD_BASE64 = ( + "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh" + "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI" + "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y" + "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm" + "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y" + "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt" + "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE" + "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck" + "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH" + "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv" + "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA" + "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J" + "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA" +) + class UserFactory(factory.django.DjangoModelFactory): """A factory to random users for testing purposes.""" @@ -75,7 +91,7 @@ class DocumentFactory(factory.django.DjangoModelFactory): title = factory.Sequence(lambda n: f"document{n}") excerpt = factory.Sequence(lambda n: f"excerpt{n}") - content = factory.Sequence(lambda n: f"content{n}") + content = YDOC_HELLO_WORLD_BASE64 creator = factory.SubFactory(UserFactory) deleted_at = None link_reach = factory.fuzzy.FuzzyChoice( diff --git a/src/backend/core/tests/test_utils.py b/src/backend/core/tests/test_utils.py new file mode 100644 index 00000000..4fa33e1e --- /dev/null +++ b/src/backend/core/tests/test_utils.py @@ -0,0 +1,37 @@ +"""Test util base64_yjs_to_text.""" + +from core import utils + +# This base64 string is an example of what is saved in the database. 
+# This base64 is generated from the blocknote editor, it contains +# the text \n# *Hello* \n- w**or**ld +TEST_BASE64_STRING = ( + "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh" + "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI" + "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y" + "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm" + "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y" + "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt" + "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE" + "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck" + "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH" + "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv" + "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA" + "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J" + "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA" +) + + +def test_utils_base64_yjs_to_text(): + """Test extract text from saved yjs document""" + assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello world" + + +def test_utils_base64_yjs_to_xml(): + """Test extract xml from saved yjs document""" + content = utils.base64_yjs_to_xml(TEST_BASE64_STRING) + assert ( + 'Hello' in content + or 'Hello' in content + ) + assert 'world' in content diff --git a/src/backend/core/tests/test_utils_base64_yjs_to_text.py b/src/backend/core/tests/test_utils_base64_yjs_to_text.py new file mode 100644 index 00000000..376bb85d --- /dev/null +++ b/src/backend/core/tests/test_utils_base64_yjs_to_text.py @@ -0,0 +1,29 @@ +"""Test util base64_yjs_to_text.""" + +from core.utils import base64_yjs_to_text + + +def test_base64_yjs_to_text(): + """ + Test 
extract_text_from_saved_yjs_document + This base64 string is an example of what is saved in the database. + This base64 is generated from the blocknote editor, it contains + the text \n# *Hello* \n- w**or**ld + """ + base64_string = ( + "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh" + "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI" + "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y" + "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm" + "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y" + "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt" + "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE" + "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck" + "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH" + "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv" + "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA" + "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J" + "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA" + ) + + assert base64_yjs_to_text(base64_string) == "Hello world" diff --git a/src/backend/core/utils.py b/src/backend/core/utils.py new file mode 100644 index 00000000..bd2e0170 --- /dev/null +++ b/src/backend/core/utils.py @@ -0,0 +1,25 @@ +"""Utils for the core app.""" + +import base64 + +import y_py as Y +from bs4 import BeautifulSoup + + +def base64_yjs_to_xml(base64_string): + """Extract xml from base64 yjs document.""" + + decoded_bytes = base64.b64decode(base64_string) + uint8_array = bytearray(decoded_bytes) + + doc = Y.YDoc() # pylint: disable=E1101 + Y.apply_update(doc, uint8_array) # pylint: disable=E1101 + return str(doc.get_xml_element("document-store")) + + +def 
base64_yjs_to_text(base64_string): + """Extract text from base64 yjs document.""" + + blocknote_structure = base64_yjs_to_xml(base64_string) + soup = BeautifulSoup(blocknote_structure, "html.parser") + return soup.get_text(separator=" ").strip() diff --git a/src/backend/pyproject.toml b/src/backend/pyproject.toml index c742af5f..17cb0049 100644 --- a/src/backend/pyproject.toml +++ b/src/backend/pyproject.toml @@ -25,6 +25,7 @@ license = { file = "LICENSE" } readme = "README.md" requires-python = ">=3.12" dependencies = [ + "beautifulsoup4==4.12.3", "boto3==1.37.18", "Brotli==1.1.0", "celery[redis]==5.4.0", @@ -47,6 +48,7 @@ dependencies = [ "gunicorn==23.0.0", "jsonschema==4.23.0", "markdown==3.7", + "mozilla-django-oidc==4.0.1", "nested-multipart-parser==1.5.0", "openai==1.68.2", "psycopg[binary]==3.2.6", @@ -55,8 +57,9 @@ dependencies = [ "requests==2.32.3", "sentry-sdk==2.24.0", "url-normalize==1.4.3", "whitenoise==6.9.0", "mozilla-django-oidc==4.0.1", + "y-py==0.6.2", ] [project.urls]