diff --git a/CHANGELOG.md b/CHANGELOG.md index d4f738fd..c979a118 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -132,6 +132,7 @@ and this project adheres to ## Added +- ✨(backend) add duplicate action to the document API endpoint - ⚗️(backend) add util to extract text from base64 yjs document - ✨(backend) add soft delete and restore API endpoints to documents #516 - ✨(backend) allow organizing documents in a tree structure #516 diff --git a/src/backend/core/admin.py b/src/backend/core/admin.py index 080b492a..e096b020 100644 --- a/src/backend/core/admin.py +++ b/src/backend/core/admin.py @@ -151,6 +151,8 @@ class DocumentAdmin(TreeAdmin): "path", "depth", "numchild", + "duplicated_from", + "attachments", ) }, ), @@ -166,8 +168,10 @@ class DocumentAdmin(TreeAdmin): "updated_at", ) readonly_fields = ( + "attachments", "creator", "depth", + "duplicated_from", "id", "numchild", "path", diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py index 82044fe2..89b835a9 100644 --- a/src/backend/core/api/serializers.py +++ b/src/backend/core/api/serializers.py @@ -381,6 +381,27 @@ class LinkDocumentSerializer(serializers.ModelSerializer): ] +class DocumentDuplicationSerializer(serializers.Serializer): + """ + Serializer for duplicating a document. + Allows specifying whether to keep access permissions. + """ + + with_accesses = serializers.BooleanField(default=False) + + def create(self, validated_data): + """ + This serializer is not intended to create objects. + """ + raise NotImplementedError("This serializer does not support creation.") + + def update(self, instance, validated_data): + """ + This serializer is not intended to update objects. 
+ """ + raise NotImplementedError("This serializer does not support updating.") + + # Suppress the warning about not implementing `create` and `update` methods # since we don't use a model and only rely on the serializer for validation # pylint: disable=abstract-method diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index e4d39918..8544aed9 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -16,6 +16,8 @@ from django.db import transaction from django.db.models.expressions import RawSQL from django.db.models.functions import Left, Length from django.http import Http404, StreamingHttpResponse +from django.utils.text import capfirst +from django.utils.translation import gettext_lazy as _ import requests import rest_framework as drf @@ -28,26 +30,13 @@ from rest_framework.throttling import UserRateThrottle from core import authentication, enums, models from core.services.ai_services import AIService from core.services.collaboration_services import CollaborationService +from core.utils import extract_attachments, filter_descendants from . 
import permissions, serializers, utils from .filters import DocumentFilter, ListDocumentFilter logger = logging.getLogger(__name__) -<<<<<<< HEAD -ATTACHMENTS_FOLDER = "attachments" -UUID_REGEX = ( - r"[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}" -) -FILE_EXT_REGEX = r"\.[a-zA-Z0-9]{1,10}" -MEDIA_STORAGE_URL_PATTERN = re.compile( - f"{settings.MEDIA_URL:s}(?P{UUID_REGEX:s})/" - f"(?P{ATTACHMENTS_FOLDER:s}/{UUID_REGEX:s}(?:-unsafe)?{FILE_EXT_REGEX:s})$" -) -COLLABORATION_WS_URL_PATTERN = re.compile(rf"(?:^|&)room=(?P{UUID_REGEX})(?:&|$)") -======= ->>>>>>> 8076486a (✅(backend) add missing test on media-auth and collaboration-auth) - # pylint: disable=too-many-ancestors @@ -904,6 +893,82 @@ class DocumentViewSet( utils.nest_tree(serializer.data, self.queryset.model.steplen) ) + @drf.decorators.action( + detail=True, + methods=["post"], + permission_classes=[permissions.IsAuthenticated, permissions.AccessPermission], + url_path="duplicate", + ) + @transaction.atomic + def duplicate(self, request, *args, **kwargs): + """ + Duplicate a document and store the links to attached files in the duplicated + document to allow cross-access. + + Optionally duplicates accesses if `with_accesses` is set to true + in the payload. 
+ """ + # Get document while checking permissions + document = self.get_object() + + serializer = serializers.DocumentDuplicationSerializer( + data=request.data, partial=True + ) + serializer.is_valid(raise_exception=True) + with_accesses = serializer.validated_data.get("with_accesses", False) + + base64_yjs_content = document.content + + # Duplicate the document instance + link_kwargs = ( + {"link_reach": document.link_reach, "link_role": document.link_role} + if with_accesses + else {} + ) + extracted_attachments = set(extract_attachments(document.content)) + attachments = list(extracted_attachments & set(document.attachments)) + duplicated_document = document.add_sibling( + "right", + title=capfirst(_("copy of {title}").format(title=document.title)), + content=base64_yjs_content, + attachments=attachments, + duplicated_from=document, + creator=request.user, + **link_kwargs, + ) + + # Always add the logged-in user as OWNER + accesses_to_create = [ + models.DocumentAccess( + document=duplicated_document, + user=request.user, + role=models.RoleChoices.OWNER, + ) + ] + + # If accesses should be duplicated, add other users' accesses as per original document + if with_accesses: + original_accesses = models.DocumentAccess.objects.filter( + document=document + ).exclude(user=request.user) + + accesses_to_create.extend( + models.DocumentAccess( + document=duplicated_document, + user_id=access.user_id, + team=access.team, + role=access.role, + ) + for access in original_accesses + ) + + # Bulk create all the duplicated accesses + models.DocumentAccess.objects.bulk_create(accesses_to_create) + + return drf_response.Response( + {"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED + ) + @drf.decorators.action(detail=True, methods=["get"], url_path="versions") def versions_list(self, request, *args, **kwargs): """ @@ -1053,7 +1118,7 @@ class DocumentViewSet( # Generate a generic yet unique filename to store the image in object storage file_id = uuid.uuid4() - 
extension = serializer.validated_data["expected_extension"] + ext = serializer.validated_data["expected_extension"] # Prepare metadata for storage extra_args = { @@ -1065,7 +1130,7 @@ class DocumentViewSet( extra_args["Metadata"]["is_unsafe"] = "true" file_unsafe = "-unsafe" - key = f"{document.key_base}/{ATTACHMENTS_FOLDER:s}/{file_id!s}{file_unsafe}.{extension:s}" + key = f"{document.key_base}/{enums.ATTACHMENTS_FOLDER:s}/{file_id!s}{file_unsafe}.{ext:s}" file_name = serializer.validated_data["file_name"] if ( @@ -1085,6 +1150,10 @@ class DocumentViewSet( file, default_storage.bucket_name, key, ExtraArgs=extra_args ) + # Make the attachment readable by document readers + document.attachments.append(key) + document.save() + return drf.response.Response( {"file": f"{settings.MEDIA_URL:s}{key:s}"}, status=drf.status.HTTP_201_CREATED, @@ -1152,20 +1221,35 @@ class DocumentViewSet( url_params = self._auth_get_url_params( enums.MEDIA_STORAGE_URL_PATTERN, parsed_url.path ) - document = self._auth_get_document(url_params["pk"]) - if not document.get_abilities(request.user).get(self.action, False): - logger.debug( - "User '%s' lacks permission for document '%s'", - request.user, - document.pk, - ) + user = request.user + key = f"{url_params['pk']:s}/{url_params['attachment']:s}" + + # Look for a document to which the user has access and that includes this attachment + # We must look into all descendants of any document to which the user has access per se + readable_per_se_paths = ( + self.queryset.readable_per_se(user) + .order_by("path") + .values_list("path", flat=True) + ) + + attachments_documents = ( + self.queryset.filter(attachments__contains=[key]) + .only("path") + .order_by("path") + ) + readable_attachments_paths = filter_descendants( + [doc.path for doc in attachments_documents], + readable_per_se_paths, + skip_sorting=True, + ) + + if not readable_attachments_paths: + logger.debug("User '%s' lacks permission for attachment", user) raise 
drf.exceptions.PermissionDenied() # Generate S3 authorization headers using the extracted URL parameters - request = utils.generate_s3_authorization_headers( - f"{url_params['pk']:s}/{url_params['key']:s}" - ) + request = utils.generate_s3_authorization_headers(key) return drf.response.Response("authorized", headers=request.headers, status=200) diff --git a/src/backend/core/enums.py b/src/backend/core/enums.py index 8cd50baf..a9bdd898 100644 --- a/src/backend/core/enums.py +++ b/src/backend/core/enums.py @@ -12,10 +12,13 @@ ATTACHMENTS_FOLDER = "attachments" UUID_REGEX = ( r"[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}" ) -FILE_EXT_REGEX = r"\.[a-zA-Z]{3,4}" +FILE_EXT_REGEX = r"\.[a-zA-Z0-9]{1,10}" MEDIA_STORAGE_URL_PATTERN = re.compile( f"{settings.MEDIA_URL:s}(?P{UUID_REGEX:s})/" - f"(?P{ATTACHMENTS_FOLDER:s}/{UUID_REGEX:s}{FILE_EXT_REGEX:s})$" + f"(?P{ATTACHMENTS_FOLDER:s}/{UUID_REGEX:s}(?:-unsafe)?{FILE_EXT_REGEX:s})$" +) +MEDIA_STORAGE_URL_EXTRACT = re.compile( + f"{settings.MEDIA_URL:s}({UUID_REGEX}/{ATTACHMENTS_FOLDER}/{UUID_REGEX}{FILE_EXT_REGEX})" ) COLLABORATION_WS_URL_PATTERN = re.compile(rf"(?:^|&)room=(?P{UUID_REGEX})(?:&|$)") diff --git a/src/backend/core/migrations/0020_remove_is_public_add_field_attachments_and_duplicated_from.py b/src/backend/core/migrations/0020_remove_is_public_add_field_attachments_and_duplicated_from.py new file mode 100644 index 00000000..cc8700b0 --- /dev/null +++ b/src/backend/core/migrations/0020_remove_is_public_add_field_attachments_and_duplicated_from.py @@ -0,0 +1,77 @@ +# Generated by Django 5.1.4 on 2025-01-18 11:53 +import re + +import django.contrib.postgres.fields +import django.db.models.deletion +from django.core.files.storage import default_storage +from django.db import migrations, models + +from botocore.exceptions import ClientError + +import core.models +from core.utils import extract_attachments + + +def populate_attachments_on_all_documents(apps, schema_editor): + """Populate 
"attachments" field on all existing documents in the database.""" + Document = apps.get_model("core", "Document") + + for document in Document.objects.all(): + try: + response = default_storage.connection.meta.client.get_object( + Bucket=default_storage.bucket_name, Key=f"{document.pk!s}/file" + ) + except (FileNotFoundError, ClientError): + pass + else: + content = response["Body"].read().decode("utf-8") + document.attachments = extract_attachments(content) + document.save(update_fields=["attachments"]) + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0019_alter_user_language_default_to_null"), + ] + + operations = [ + # v2.0.0 was released so we can now remove BC field "is_public" + migrations.RemoveField( + model_name="document", + name="is_public", + ), + migrations.AlterModelManagers( + name="user", + managers=[ + ("objects", core.models.UserManager()), + ], + ), + migrations.AddField( + model_name="document", + name="attachments", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField(max_length=255), + blank=True, + default=list, + editable=False, + null=True, + size=None, + ), + ), + migrations.AddField( + model_name="document", + name="duplicated_from", + field=models.ForeignKey( + blank=True, + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="duplicates", + to="core.document", + ), + ), + migrations.RunPython( + populate_attachments_on_all_documents, + reverse_code=migrations.RunPython.noop, + ), + ] diff --git a/src/backend/core/models.py b/src/backend/core/models.py index 4edd40e8..83405bae 100644 --- a/src/backend/core/models.py +++ b/src/backend/core/models.py @@ -13,6 +13,7 @@ from logging import getLogger from django.conf import settings from django.contrib.auth import models as auth_models from django.contrib.auth.base_user import AbstractBaseUser +from django.contrib.postgres.fields import ArrayField from django.contrib.sites.models import Site from 
django.core import mail, validators from django.core.cache import cache @@ -427,10 +428,12 @@ class DocumentQuerySet(MP_NodeQuerySet): def readable_per_se(self, user): """ - Filters the queryset to return documents that the given user has - permission to read. + Filters the queryset to return documents on which the given user has + direct access, team access or link access. This will not return all the + documents that a user can read because read access can also be obtained via an ancestor. :param user: The user for whom readable documents are to be fetched. - :return: A queryset of documents readable by the user. + :return: A queryset of documents for which the user has direct access, + team access or link access. """ if user.is_authenticated: return self.filter( @@ -459,7 +462,9 @@ class DocumentManager(MP_NodeManager): """ Filters documents based on user permissions using the custom queryset. :param user: The user for whom readable documents are to be fetched. - :return: A queryset of documents readable by the user. + :return: A queryset of documents for which the user has direct access, + team access or link access. This will not return all the documents + that a user can read because read access can also be obtained via an ancestor. 
""" return self.get_queryset().readable_per_se(user) @@ -486,6 +491,21 @@ class Document(MP_Node, BaseModel): ) deleted_at = models.DateTimeField(null=True, blank=True) ancestors_deleted_at = models.DateTimeField(null=True, blank=True) + duplicated_from = models.ForeignKey( + "self", + on_delete=models.SET_NULL, + related_name="duplicates", + editable=False, + blank=True, + null=True, + ) + attachments = ArrayField( + models.CharField(max_length=255), + default=list, + editable=False, + blank=True, + null=True, + ) _content = None @@ -800,6 +820,7 @@ class Document(MP_Node, BaseModel): "cors_proxy": can_get, "descendants": can_get, "destroy": is_owner, + "duplicate": can_get, "favorite": can_get and user.is_authenticated, "link_configuration": is_owner_or_admin, "invite_owner": is_owner, diff --git a/src/backend/core/tests/documents/test_api_documents_attachment_upload.py b/src/backend/core/tests/documents/test_api_documents_attachment_upload.py index 000d0251..a6324f24 100644 --- a/src/backend/core/tests/documents/test_api_documents_attachment_upload.py +++ b/src/backend/core/tests/documents/test_api_documents_attachment_upload.py @@ -67,10 +67,12 @@ def test_api_documents_attachment_upload_anonymous_success(): file_path = response.json()["file"] match = pattern.search(file_path) file_id = match.group(1) - # Validate that file_id is a valid UUID uuid.UUID(file_id) + document.refresh_from_db() + assert document.attachments == [f"{document.id!s}/attachments/{file_id!s}.png"] + # Now, check the metadata of the uploaded file key = file_path.replace("/media", "") file_head = default_storage.connection.meta.client.head_object( @@ -112,6 +114,9 @@ def test_api_documents_attachment_upload_authenticated_forbidden(reach, role): "detail": "You do not have permission to perform this action." 
} + document.refresh_from_db() + assert document.attachments == [] + @pytest.mark.parametrize( "reach, role", @@ -122,8 +127,8 @@ def test_api_documents_attachment_upload_authenticated_forbidden(reach, role): ) def test_api_documents_attachment_upload_authenticated_success(reach, role): """ - Autenticated who are not related to a document should be able to upload a file - if the link reach and role permit it. + Authenticated users who are not related to a document should be able to upload + a file when the link reach and role permit it. """ user = factories.UserFactory() @@ -145,6 +150,9 @@ def test_api_documents_attachment_upload_authenticated_success(reach, role): # Validate that file_id is a valid UUID uuid.UUID(file_id) + document.refresh_from_db() + assert document.attachments == [f"{document.id!s}/attachments/{file_id!s}.png"] + @pytest.mark.parametrize("via", VIA) def test_api_documents_attachment_upload_reader(via, mock_user_teams): @@ -175,6 +183,9 @@ def test_api_documents_attachment_upload_reader(via, mock_user_teams): "detail": "You do not have permission to perform this action." 
} + document.refresh_from_db() + assert document.attachments == [] + @pytest.mark.parametrize("role", ["editor", "administrator", "owner"]) @pytest.mark.parametrize("via", VIA) @@ -211,6 +222,9 @@ def test_api_documents_attachment_upload_success(via, role, mock_user_teams): # Validate that file_id is a valid UUID uuid.UUID(file_id) + document.refresh_from_db() + assert document.attachments == [f"{document.id!s}/attachments/{file_id!s}.png"] + # Now, check the metadata of the uploaded file key = file_path.replace("/media", "") file_head = default_storage.connection.meta.client.head_object( @@ -236,6 +250,9 @@ def test_api_documents_attachment_upload_invalid(client): assert response.status_code == 400 assert response.json() == {"file": ["No file was submitted."]} + document.refresh_from_db() + assert document.attachments == [] + def test_api_documents_attachment_upload_size_limit_exceeded(settings): """The uploaded file should not exceeed the maximum size in settings.""" @@ -258,6 +275,9 @@ def test_api_documents_attachment_upload_size_limit_exceeded(settings): assert response.status_code == 400 assert response.json() == {"file": ["File size exceeds the maximum limit of 1 MB."]} + document.refresh_from_db() + assert document.attachments == [] + @pytest.mark.parametrize( "name,content,extension,content_type", @@ -293,6 +313,11 @@ def test_api_documents_attachment_upload_fix_extension( match = pattern.search(file_path) file_id = match.group(1) + document.refresh_from_db() + assert document.attachments == [ + f"{document.id!s}/attachments/{file_id!s}.{extension:s}" + ] + assert "-unsafe" in file_id # Validate that file_id is a valid UUID file_id = file_id.replace("-unsafe", "") @@ -323,6 +348,9 @@ def test_api_documents_attachment_upload_empty_file(): assert response.status_code == 400 assert response.json() == {"file": ["The submitted file is empty."]} + document.refresh_from_db() + assert document.attachments == [] + def test_api_documents_attachment_upload_unsafe(): 
"""A file with an unsafe mime type should be tagged as such.""" @@ -345,6 +373,9 @@ def test_api_documents_attachment_upload_unsafe(): match = pattern.search(file_path) file_id = match.group(1) + document.refresh_from_db() + assert document.attachments == [f"{document.id!s}/attachments/{file_id!s}.exe"] + assert "-unsafe" in file_id # Validate that file_id is a valid UUID file_id = file_id.replace("-unsafe", "") diff --git a/src/backend/core/tests/documents/test_api_documents_duplicate.py b/src/backend/core/tests/documents/test_api_documents_duplicate.py new file mode 100644 index 00000000..ec13d878 --- /dev/null +++ b/src/backend/core/tests/documents/test_api_documents_duplicate.py @@ -0,0 +1,206 @@ +""" +Test file uploads API endpoint for users in impress's core app. +""" + +import base64 +import uuid +from io import BytesIO +from urllib.parse import urlparse + +from django.conf import settings +from django.core.files.storage import default_storage +from django.utils import timezone + +import pytest +import requests +import y_py +from rest_framework.test import APIClient + +from core import factories, models + +pytestmark = pytest.mark.django_db + +PIXEL = ( + b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00" + b"\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\xf8\xff\xff?\x00\x05\xfe\x02\xfe" + b"\xa7V\xbd\xfa\x00\x00\x00\x00IEND\xaeB`\x82" +) + + +def get_image_refs(document_id): + """Generate an image key for testing.""" + image_key = f"{document_id!s}/attachments/{uuid.uuid4()!s}.png" + default_storage.connection.meta.client.put_object( + Bucket=default_storage.bucket_name, + Key=image_key, + Body=BytesIO(PIXEL), + ContentType="image/png", + ) + return image_key, f"http://localhost/media/{image_key:s}" + + +def test_api_documents_duplicate_forbidden(): + """A user who doesn't have read access to a document should not be allowed to duplicate it.""" + user = factories.UserFactory() + client = APIClient() + 
client.force_login(user) + + document = factories.DocumentFactory( + link_reach="restricted", + users=[factories.UserFactory()], + title="my document", + ) + + response = client.post(f"/api/v1.0/documents/{document.id!s}/duplicate/") + + assert response.status_code == 403 + assert models.Document.objects.count() == 1 + + +def test_api_documents_duplicate_anonymous(): + """Anonymous users should not be able to duplicate documents even with read access.""" + + document = factories.DocumentFactory(link_reach="public") + + response = APIClient().post(f"/api/v1.0/documents/{document.id!s}/duplicate/") + + assert response.status_code == 401 + assert models.Document.objects.count() == 1 + + +@pytest.mark.parametrize("index", range(3)) +def test_api_documents_duplicate_success(index): + """ + Authenticated users with access to a document should be able to duplicate it. + Accesses should not be duplicated if the user does not request it specifically. + Attachments that are not in the content should not be passed for access in the + duplicated document's "attachments" list. 
+ """ + user = factories.UserFactory() + client = APIClient() + client.force_login(user) + + document_ids = [uuid.uuid4() for _ in range(3)] + image_refs = [get_image_refs(doc_id) for doc_id in document_ids] + + # Create document content with the first image only + ydoc = y_py.YDoc() # pylint: disable=no-member + with ydoc.begin_transaction() as txn: + xml_fragment = ydoc.get_xml_element("document-store") + xml_fragment.push_xml_element(txn, "image").set_attribute( + txn, "src", image_refs[0][1] + ) + update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member + base64_content = base64.b64encode(update).decode("utf-8") + + # Create documents + document = factories.DocumentFactory( + id=document_ids[index], + content=base64_content, + link_reach="restricted", + users=[user, factories.UserFactory()], + title="document with an image", + attachments=[key for key, _ in image_refs], + ) + factories.DocumentFactory(id=document_ids[(index + 1) % 3]) + # Don't create document for third ID to check that it doesn't impact access to attachments + + # Duplicate the document via the API endpoint + response = client.post(f"/api/v1.0/documents/{document.id}/duplicate/") + + assert response.status_code == 201 + + duplicated_document = models.Document.objects.get(id=response.json()["id"]) + assert duplicated_document.title == "Copy of document with an image" + assert duplicated_document.content == document.content + assert duplicated_document.creator == user + assert duplicated_document.link_reach == "restricted" + assert duplicated_document.link_role == "reader" + assert duplicated_document.duplicated_from == document + assert duplicated_document.attachments == [ + image_refs[0][0] + ] # Only the first image key + assert duplicated_document.get_parent() == document.get_parent() + assert duplicated_document.path == document.get_next_sibling().path + + # Check that accesses were not duplicated. 
+ # The user who did the duplicate is forced as owner + assert duplicated_document.accesses.count() == 1 + access = duplicated_document.accesses.first() + assert access.user == user + assert access.role == "owner" + + # Ensure access persists after the owner loses access to the original document + models.DocumentAccess.objects.filter(document=document).delete() + response = client.get( + "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=image_refs[0][1] + ) + + assert response.status_code == 200 + + authorization = response["Authorization"] + assert "AWS4-HMAC-SHA256 Credential=" in authorization + assert ( + "SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=" + in authorization + ) + assert response["X-Amz-Date"] == timezone.now().strftime("%Y%m%dT%H%M%SZ") + + s3_url = urlparse(settings.AWS_S3_ENDPOINT_URL) + response = requests.get( + f"{settings.AWS_S3_ENDPOINT_URL:s}/impress-media-storage/{image_refs[0][0]:s}", + headers={ + "authorization": authorization, + "x-amz-date": response["x-amz-date"], + "x-amz-content-sha256": response["x-amz-content-sha256"], + "Host": f"{s3_url.hostname:s}:{s3_url.port:d}", + }, + timeout=1, + ) + assert response.content == PIXEL + + # Ensure the other images are not accessible + for _, url in image_refs[1:]: + response = client.get( + "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=url + ) + assert response.status_code == 403 + + +def test_api_documents_duplicate_with_accesses(): + """Accesses should be duplicated if the user requests it specifically.""" + user = factories.UserFactory() + client = APIClient() + client.force_login(user) + + document = factories.DocumentFactory( + users=[user], + title="document with accesses", + ) + user_access = factories.UserDocumentAccessFactory(document=document) + team_access = factories.TeamDocumentAccessFactory(document=document) + + # Duplicate the document via the API endpoint requesting to duplicate accesses + response = client.post( + 
f"/api/v1.0/documents/{document.id!s}/duplicate/", + {"with_accesses": True}, + format="json", + ) + + assert response.status_code == 201 + + duplicated_document = models.Document.objects.get(id=response.json()["id"]) + assert duplicated_document.title == "Copy of document with accesses" + assert duplicated_document.content == document.content + assert duplicated_document.link_reach == document.link_reach + assert duplicated_document.link_role == document.link_role + assert duplicated_document.creator == user + assert duplicated_document.duplicated_from == document + assert duplicated_document.attachments == [] + + # Check that accesses were duplicated and the user who did the duplicate is forced as owner + duplicated_accesses = duplicated_document.accesses + assert duplicated_accesses.count() == 3 + assert duplicated_accesses.get(user=user).role == "owner" + assert duplicated_accesses.get(user=user_access.user).role == user_access.role + assert duplicated_accesses.get(team=team_access.team).role == team_access.role diff --git a/src/backend/core/tests/documents/test_api_documents_media_auth.py b/src/backend/core/tests/documents/test_api_documents_media_auth.py index 25e47ed9..e6ed3d00 100644 --- a/src/backend/core/tests/documents/test_api_documents_media_auth.py +++ b/src/backend/core/tests/documents/test_api_documents_media_auth.py @@ -37,11 +37,9 @@ def test_api_documents_media_auth_unkown_document(): def test_api_documents_media_auth_anonymous_public(): """Anonymous users should be able to retrieve attachments linked to a public document""" - document = factories.DocumentFactory(link_reach="public") - + document_id = uuid4() filename = f"{uuid4()!s}.jpg" - key = f"{document.pk!s}/attachments/{filename:s}" - + key = f"{document_id!s}/attachments/{filename:s}" default_storage.connection.meta.client.put_object( Bucket=default_storage.bucket_name, Key=key, @@ -49,6 +47,8 @@ def test_api_documents_media_auth_anonymous_public(): ContentType="text/plain", ) + 
factories.DocumentFactory(id=document_id, link_reach="public", attachments=[key]) + original_url = f"http://localhost/media/{key:s}" response = APIClient().get( "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=original_url @@ -81,8 +81,6 @@ def test_api_documents_media_auth_anonymous_public(): def test_api_documents_media_auth_extensions(): """Files with extensions of any format should work.""" - document = factories.DocumentFactory(link_reach="public") - extensions = [ "c", "go", @@ -91,10 +89,15 @@ def test_api_documents_media_auth_extensions(): "woff2", "appimage", ] + document_id = uuid4() + keys = [] for ext in extensions: - filename = f"{uuid.uuid4()!s}.{ext:s}" - key = f"{document.pk!s}/attachments/{filename:s}" + filename = f"{uuid4()!s}.{ext:s}" + keys.append(f"{document_id!s}/attachments/{filename:s}") + factories.DocumentFactory(link_reach="public", attachments=keys) + + for key in keys: original_url = f"http://localhost/media/{key:s}" response = APIClient().get( "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=original_url @@ -109,10 +112,11 @@ def test_api_documents_media_auth_anonymous_authenticated_or_restricted(reach): Anonymous users should not be allowed to retrieve attachments linked to a document with link reach set to authenticated or restricted. 
""" - document = factories.DocumentFactory(link_reach=reach) - + document_id = uuid4() filename = f"{uuid4()!s}.jpg" - media_url = f"http://localhost/media/{document.pk!s}/attachments/{filename:s}" + media_url = f"http://localhost/media/{document_id!s}/attachments/{filename:s}" + + factories.DocumentFactory(id=document_id, link_reach=reach) response = APIClient().get( "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=media_url @@ -122,20 +126,16 @@ def test_api_documents_media_auth_anonymous_authenticated_or_restricted(reach): assert "Authorization" not in response -@pytest.mark.parametrize("reach", ["public", "authenticated"]) -def test_api_documents_media_auth_authenticated_public_or_authenticated(reach): +def test_api_documents_media_auth_anonymous_attachments(): """ - Authenticated users who are not related to a document should be able to retrieve - attachments related to a document with public or authenticated link reach. + Declaring a media key as original attachment on a document to which + a user has access should give them access to the attachment file + regardless of their access rights on the original document. 
""" - document = factories.DocumentFactory(link_reach=reach) - - user = factories.UserFactory() - client = APIClient() - client.force_login(user) - + document_id = uuid4() filename = f"{uuid4()!s}.jpg" - key = f"{document.pk!s}/attachments/{filename:s}" + key = f"{document_id!s}/attachments/{filename:s}" + media_url = f"http://localhost/media/{key:s}" default_storage.connection.meta.client.put_object( Bucket=default_storage.bucket_name, @@ -144,9 +144,73 @@ def test_api_documents_media_auth_authenticated_public_or_authenticated(reach): ContentType="text/plain", ) - original_url = f"http://localhost/media/{key:s}" + factories.DocumentFactory(id=document_id, link_reach="restricted") + + response = APIClient().get( + "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=media_url + ) + assert response.status_code == 403 + + # Let's now add a document to which the anonymous user has access and + # pointing to the attachment + parent = factories.DocumentFactory(link_reach="public") + factories.DocumentFactory(parent=parent, link_reach="restricted", attachments=[key]) + + response = APIClient().get( + "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=media_url + ) + + assert response.status_code == 200 + + authorization = response["Authorization"] + assert "AWS4-HMAC-SHA256 Credential=" in authorization + assert ( + "SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=" + in authorization + ) + assert response["X-Amz-Date"] == timezone.now().strftime("%Y%m%dT%H%M%SZ") + + s3_url = urlparse(settings.AWS_S3_ENDPOINT_URL) + file_url = f"{settings.AWS_S3_ENDPOINT_URL:s}/impress-media-storage/{key:s}" + response = requests.get( + file_url, + headers={ + "authorization": authorization, + "x-amz-date": response["x-amz-date"], + "x-amz-content-sha256": response["x-amz-content-sha256"], + "Host": f"{s3_url.hostname:s}:{s3_url.port:d}", + }, + timeout=1, + ) + assert response.content.decode("utf-8") == "my prose" + + +@pytest.mark.parametrize("reach", ["public", 
"authenticated"]) +def test_api_documents_media_auth_authenticated_public_or_authenticated(reach): + """ + Authenticated users who are not related to a document should be able to retrieve + attachments related to a document with public or authenticated link reach. + """ + user = factories.UserFactory() + client = APIClient() + client.force_login(user) + + document_id = uuid4() + filename = f"{uuid4()!s}.jpg" + key = f"{document_id!s}/attachments/{filename:s}" + media_url = f"http://localhost/media/{key:s}" + + default_storage.connection.meta.client.put_object( + Bucket=default_storage.bucket_name, + Key=key, + Body=BytesIO(b"my prose"), + ContentType="text/plain", + ) + + factories.DocumentFactory(id=document_id, link_reach=reach, attachments=[key]) + response = client.get( - "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=original_url + "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=media_url ) assert response.status_code == 200 @@ -179,14 +243,18 @@ def test_api_documents_media_auth_authenticated_restricted(): Authenticated users who are not related to a document should not be allowed to retrieve attachments linked to a document that is restricted. 
""" - document = factories.DocumentFactory(link_reach="restricted") - user = factories.UserFactory(with_owned_document=True) client = APIClient() client.force_login(user) + document_id = uuid4() filename = f"{uuid4()!s}.jpg" - media_url = f"http://localhost/media/{document.pk!s}/attachments/{filename:s}" + key = f"{document_id!s}/attachments/{filename:s}" + media_url = f"http://localhost/media/{key:s}" + + factories.DocumentFactory( + id=document_id, link_reach="restricted", attachments=[key] + ) response = client.get( "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=media_url @@ -206,16 +274,10 @@ def test_api_documents_media_auth_related(via, mock_user_teams): client = APIClient() client.force_login(user) - document = factories.DocumentFactory() - if via == USER: - factories.UserDocumentAccessFactory(document=document, user=user) - elif via == TEAM: - mock_user_teams.return_value = ["lasuite", "unknown"] - factories.TeamDocumentAccessFactory(document=document, team="lasuite") - + document_id = uuid4() filename = f"{uuid4()!s}.jpg" - key = f"{document.pk!s}/attachments/{filename:s}" - + key = f"{document_id!s}/attachments/{filename:s}" + media_url = f"http://localhost/media/{key:s}" default_storage.connection.meta.client.put_object( Bucket=default_storage.bucket_name, Key=key, @@ -223,9 +285,17 @@ def test_api_documents_media_auth_related(via, mock_user_teams): ContentType="text/plain", ) - original_url = f"http://localhost/media/{key:s}" + document = factories.DocumentFactory( + id=document_id, link_reach="restricted", attachments=[key] + ) + if via == USER: + factories.UserDocumentAccessFactory(document=document, user=user) + elif via == TEAM: + mock_user_teams.return_value = ["lasuite", "unknown"] + factories.TeamDocumentAccessFactory(document=document, team="lasuite") + response = client.get( - "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=original_url + "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=media_url ) assert 
response.status_code == 200 diff --git a/src/backend/core/tests/documents/test_api_documents_retrieve.py b/src/backend/core/tests/documents/test_api_documents_retrieve.py index 9c556f0f..1148d7c1 100644 --- a/src/backend/core/tests/documents/test_api_documents_retrieve.py +++ b/src/backend/core/tests/documents/test_api_documents_retrieve.py @@ -37,6 +37,7 @@ def test_api_documents_retrieve_anonymous_public_standalone(): "cors_proxy": True, "descendants": True, "destroy": False, + "duplicate": True, # Anonymous user can't favorite a document even with read access "favorite": False, "invite_owner": False, @@ -103,6 +104,7 @@ def test_api_documents_retrieve_anonymous_public_parent(): "descendants": True, "cors_proxy": True, "destroy": False, + "duplicate": True, # Anonymous user can't favorite a document even with read access "favorite": False, "invite_owner": False, @@ -198,6 +200,7 @@ def test_api_documents_retrieve_authenticated_unrelated_public_or_authenticated( "descendants": True, "cors_proxy": True, "destroy": False, + "duplicate": True, "favorite": True, "invite_owner": False, "link_configuration": False, @@ -271,6 +274,7 @@ def test_api_documents_retrieve_authenticated_public_or_authenticated_parent(rea "descendants": True, "cors_proxy": True, "destroy": False, + "duplicate": True, "favorite": True, "invite_owner": False, "link_configuration": False, @@ -450,6 +454,7 @@ def test_api_documents_retrieve_authenticated_related_parent(): "descendants": True, "cors_proxy": True, "destroy": access.role == "owner", + "duplicate": True, "favorite": True, "invite_owner": access.role == "owner", "link_configuration": access.role in ["administrator", "owner"], diff --git a/src/backend/core/tests/documents/test_api_documents_trashbin.py b/src/backend/core/tests/documents/test_api_documents_trashbin.py index 86287caa..6db898ea 100644 --- a/src/backend/core/tests/documents/test_api_documents_trashbin.py +++ b/src/backend/core/tests/documents/test_api_documents_trashbin.py 
def get_ydoc_with_mages(image_keys):
    """
    Build a Yjs document holding one "image" element per key, for testing purposes.

    Each image element is pushed to the "document-store" XML element with a "src"
    attribute set to "http://localhost/media/<key>". The document state is returned
    as a base64-encoded string.
    """
    doc = y_py.YDoc()  # pylint: disable=no-member
    with doc.begin_transaction() as transaction:
        store = doc.get_xml_element("document-store")
        for image_key in image_keys:
            store.push_xml_element(transaction, "image").set_attribute(
                transaction, "src", f"http://localhost/media/{image_key:s}"
            )

    encoded_state = y_py.encode_state_as_update(doc)  # pylint: disable=no-member
    return base64.b64encode(encoded_state).decode("utf-8")
def test_api_documents_update_new_attachment_keys_authenticated(
    django_assert_num_queries,
):
    """
    When an authenticated user updates a document, the attachment keys extracted
    from the updated content should be added to the "attachments" of the document
    only if these attachments are already readable by the editing user.

    Five keys are placed in the content: the document's own key and the keys attached
    to four other documents (public, authenticated, restricted, and one created with
    `users=[user]`). Every key except the restricted one is expected to be kept.
    """
    user = factories.UserFactory()
    api_client = APIClient()
    api_client.force_login(user)

    keys = [f"{uuid4()!s}/attachments/{uuid4()!s}.png" for _ in range(5)]
    own_key, public_key, authenticated_key, restricted_key, related_key = keys

    document = factories.DocumentFactory(
        content=get_ydoc_with_mages([own_key]),
        attachments=[own_key],
        users=[(user, "editor")],
    )

    # Other documents already referencing the remaining keys
    for key, reach in (
        (public_key, "public"),
        (authenticated_key, "authenticated"),
        (restricted_key, "restricted"),
    ):
        factories.DocumentFactory(attachments=[key], link_reach=reach)
    factories.DocumentFactory(attachments=[related_key], users=[user])

    readable_keys = {own_key, public_key, authenticated_key, related_key}
    url = f"/api/v1.0/documents/{document.id!s}/"

    with django_assert_num_queries(10):
        response = api_client.put(
            url, {"content": get_ydoc_with_mages(keys)}, format="json"
        )
    assert response.status_code == 200

    document.refresh_from_db()
    assert set(document.attachments) == readable_keys

    # When the content changes without bringing any new key, fewer queries are
    # expected: the readability check on extracted keys should not be run
    with django_assert_num_queries(6):
        response = api_client.put(
            url, {"content": get_ydoc_with_mages(keys[:2])}, format="json"
        )
    assert response.status_code == 200

    document.refresh_from_db()
    assert len(document.attachments) == 4
    assert set(document.attachments) == readable_keys
@pytest.mark.django_db
def test_populate_attachments_on_all_documents(migrator):
    """
    Migration 0020 should populate "attachments" on existing documents.

    Two documents are created at migration 0019. One of them gets a stored content
    file embedding an image URL; after applying migration 0020 its "attachments"
    should hold that image key, while the other document's list stays empty.
    """
    state_before = migrator.apply_initial_migration(
        ("core", "0019_alter_user_language_default_to_null")
    )
    DocumentBefore = state_before.apps.get_model("core", "Document")

    doc_without_image = DocumentBefore.objects.create(
        title="Doc without attachments", depth=1, path="0000002"
    )
    doc_with_image = DocumentBefore.objects.create(
        title="Doc with attachments", depth=1, path="0000001"
    )

    # Save, under the document's file key, base64 Yjs content embedding one image
    image_key = f"{doc_with_image.id!s}/attachments/{uuid.uuid4()!s}.png"
    ydoc = y_py.YDoc()  # pylint: disable=no-member
    with ydoc.begin_transaction() as txn:
        image = ydoc.get_xml_element("document-store").push_xml_element(txn, "image")
        image.set_attribute(txn, "src", f"http://localhost/media/{image_key:s}")
    state = y_py.encode_state_as_update(ydoc)  # pylint: disable=no-member
    encoded_content = base64.b64encode(state).decode("utf-8")
    default_storage.save(
        f"{doc_with_image.id!s}/file", ContentFile(encoded_content.encode("utf-8"))
    )

    # Run the migration under test
    state_after = migrator.apply_tested_migration(
        ("core", "0020_remove_is_public_add_field_attachments_and_duplicated_from")
    )
    DocumentAfter = state_after.apps.get_model("core", "Document")

    migrated_with_image = DocumentAfter.objects.get(pk=doc_with_image.pk)
    migrated_without_image = DocumentAfter.objects.get(pk=doc_without_image.pk)

    assert migrated_without_image.attachments == []
    assert migrated_with_image.attachments == [image_key]
"cors_proxy": True, "destroy": False, + "duplicate": True, "favorite": is_authenticated, "invite_owner": False, "link_configuration": False, @@ -331,6 +334,7 @@ def test_models_documents_get_abilities_owner(django_assert_num_queries): "descendants": True, "cors_proxy": True, "destroy": True, + "duplicate": True, "favorite": True, "invite_owner": True, "link_configuration": True, @@ -378,6 +382,7 @@ def test_models_documents_get_abilities_administrator(django_assert_num_queries) "descendants": True, "cors_proxy": True, "destroy": False, + "duplicate": True, "favorite": True, "invite_owner": False, "link_configuration": True, @@ -428,6 +433,7 @@ def test_models_documents_get_abilities_editor_user(django_assert_num_queries): "descendants": True, "cors_proxy": True, "destroy": False, + "duplicate": True, "favorite": True, "invite_owner": False, "link_configuration": False, @@ -485,6 +491,7 @@ def test_models_documents_get_abilities_reader_user( "descendants": True, "cors_proxy": True, "destroy": False, + "duplicate": True, "favorite": True, "invite_owner": False, "link_configuration": False, @@ -540,6 +547,7 @@ def test_models_documents_get_abilities_preset_role(django_assert_num_queries): "descendants": True, "cors_proxy": True, "destroy": False, + "duplicate": True, "favorite": True, "invite_owner": False, "link_configuration": False, diff --git a/src/backend/core/tests/test_utils_base64_yjs_to_text.py b/src/backend/core/tests/test_utils_base64_yjs_to_text.py index 376bb85d..2ffa168d 100644 --- a/src/backend/core/tests/test_utils_base64_yjs_to_text.py +++ b/src/backend/core/tests/test_utils_base64_yjs_to_text.py @@ -1,9 +1,15 @@ """Test util base64_yjs_to_text.""" +import base64 +import uuid + +import y_py + +from core import utils from core.utils import base64_yjs_to_text -def test_base64_yjs_to_text(): +def test_utils_base64_yjs_to_text(): """ Test extract_text_from_saved_yjs_document This base64 string is an example of what is saved in the database. 
def test_utils_extract_attachments():
    """
    All attachment keys behind a media URL in the document content should be extracted.

    The content holds three URLs: an image "src" under "/media/", an image "src"
    without the "/media/" part, and a "/media/" URL written as paragraph text.
    Only the two "/media/" URLs are expected to yield a key, in document order.
    """
    media_image_key = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
    other_image_key = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
    text_key = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"

    image_sources = (
        f"http://localhost/media/{media_image_key:s}",
        # This URL is missing the "/media/" part: its key shouldn't get extracted
        f"http://localhost/{other_image_key:s}",
    )
    text_url = f"http://localhost/media/{text_key:s}"

    ydoc = y_py.YDoc()  # pylint: disable=no-member
    with ydoc.begin_transaction() as txn:
        fragment = ydoc.get_xml_element("document-store")

        for source in image_sources:
            fragment.push_xml_element(txn, "image").set_attribute(txn, "src", source)

        paragraph = fragment.push_xml_element(txn, "paragraph")
        paragraph.push_xml_text(txn).push(txn, text_url)

    state = y_py.encode_state_as_update(ydoc)  # pylint: disable=no-member
    encoded_content = base64.b64encode(state).decode("utf-8")

    assert utils.extract_attachments(encoded_content) == [media_image_key, text_key]
def test_utils_filter_descendants_sorting():
    """
    The `filter_descendants` function should handle unsorted input when sorting
    is enabled.

    Both lists are passed out of order with sorting left enabled, and the result
    must contain exactly the descendants of the root paths, in sorted order. The
    same list objects are then passed again with `skip_sorting=True`.
    """
    unsorted_paths = [
        "000300010001",
        "000100010002",
        "0001",
        "00010001",
        "000100010001",
        "000100020002",
        "000100020001",
        "0002",
        "00020001",
        "00020002",
        "00030001",
        "00030002",
        "0004000100030001",
        "0004",
        "000400010003",
        "000400010004",
    ]
    unsorted_roots = [
        "0002",
        "000400010003",
        "0001",
    ]
    expected = [
        "0001",
        "00010001",
        "000100010001",
        "000100010002",
        "000100020001",
        "000100020002",
        "0002",
        "00020001",
        "00020002",
        "000400010003",
        "0004000100030001",
    ]

    assert filter_descendants(unsorted_paths, unsorted_roots) == expected

    # The call above sorted both lists in place, which is why the very same
    # objects can now be passed with sorting skipped
    assert (
        filter_descendants(unsorted_paths, unsorted_roots, skip_sorting=True)
        == expected
    )
def _as_ordered_list(values, skip_sorting):
    """Return `values` as an indexable list, sorted unless `skip_sorting` is True."""
    if isinstance(values, list):
        # Lists keep being sorted in place: existing callers rely on this side effect
        if not skip_sorting:
            values.sort()
        return values
    # Any other iterable (tuple, set, generator...) is materialized into a new list
    return list(values) if skip_sorting else sorted(values)


def filter_descendants(paths, root_paths, skip_sorting=False):
    """
    Filters paths to keep only those that are descendants of any path in root_paths.

    A path is considered a descendant of a root path if it starts with the root path
    (a path equal to a root path is therefore kept as well).
    If `skip_sorting` is not set to True, the function will sort both inputs before
    processing because both `paths` and `root_paths` need to be in lexicographic order
    before going through the algorithm.

    Side effect: when sorting is enabled, arguments that are `list` instances are
    sorted in place. Other iterables are left untouched and copied into a list.

    Args:
        paths (iterable of str): Paths to be filtered.
        root_paths (iterable of str): Paths to check as potential prefixes.
        skip_sorting (bool): If True, assumes both `paths` and `root_paths` are
            already sorted. Unsorted input then gives unspecified results.

    Returns:
        list of str: A list of sorted paths that are descendants of any path in
        `root_paths`. Empty if either input is empty.
    """
    paths = _as_ordered_list(paths, skip_sorting)
    root_paths = _as_ordered_list(root_paths, skip_sorting)

    results = []
    i = 0
    n = len(root_paths)

    for path in paths:
        # Move past every root that sorts before `path` without being one of its
        # prefixes: with sorted input, such a root cannot prefix any later path
        while i < n and root_paths[i] < path and not path.startswith(root_paths[i]):
            i += 1
        # Here either the roots are exhausted, root_paths[i] is a prefix of `path`,
        # or root_paths[i] > path (no root can match this path)
        if i < n and path.startswith(root_paths[i]):
            results.append(path)
    return results