(backend) extract attachment keys from updated content for access

We can't prevent document editors from copy/pasting content from one
document to another. The problem is that copying content also copies the
URLs pointing to attachments, and if we don't do anything, the readers of
the document into which the content is pasted may not be allowed to
access the attachment files from the original document.

Using the work from the previous commit, we can grant access to the readers
of the target document by extracting the attachment keys from the content and
adding them to the target document's "attachments" field. Before doing this,
we check that the current user can indeed access the attachment files extracted
from the content and that they are allowed to edit the current document.
This commit is contained in:
Samuel Paccoud - DINUM
2025-01-21 23:56:50 +01:00
committed by Manuel Raynaud
parent 34a208a80d
commit c02f19a2cd
4 changed files with 92 additions and 2 deletions

View File

@@ -10,7 +10,7 @@ from django.utils.translation import gettext_lazy as _
import magic
from rest_framework import exceptions, serializers
from core import enums, models
from core import enums, models, utils
from core.services.ai_services import AI_ACTIONS
from core.services.converter_services import (
ConversionError,
@@ -268,6 +268,53 @@ class DocumentSerializer(ListDocumentSerializer):
return value
def save(self, **kwargs):
    """
    Register attachment keys found in the submitted content on the document.

    Keys extracted from the "content" field that are not yet listed in the
    instance's "attachments" are looked up on the documents that already
    reference them. Only the keys carried by a document retained by
    `utils.filter_descendants` against the paths the requesting user can read
    "per se" are merged into the validated "attachments" before delegating to
    the parent `save`.
    """
    content = self.validated_data.get("content", "")
    found_keys = set(utils.extract_attachments(content))

    # On creation there is no instance yet, hence nothing already registered
    if self.instance:
        known_keys = set(self.instance.attachments or [])
    else:
        known_keys = set()

    candidate_keys = found_keys - known_keys
    if not candidate_keys:
        # No unknown key in the content: leave "attachments" untouched
        return super().save(**kwargs)

    # Documents that already reference at least one of the candidate keys
    source_documents = (
        models.Document.objects.filter(attachments__overlap=list(candidate_keys))
        .only("path", "attachments")
        .order_by("path")
    )

    # Paths of the documents the requesting user is allowed to read
    user = self.context["request"].user
    readable_per_se_paths = (
        models.Document.objects.readable_per_se(user)
        .order_by("path")
        .values_list("path", flat=True)
    )

    # Both collections are ordered by path, so the helper is told not to sort
    readable_source_paths = utils.filter_descendants(
        [source.path for source in source_documents],
        readable_per_se_paths,
        skip_sorting=True,
    )

    granted_keys = set()
    for source in source_documents:
        if source.path in readable_source_paths:
            granted_keys |= set(source.attachments) & candidate_keys

    # Update attachments with readable keys
    self.validated_data["attachments"] = list(known_keys | granted_keys)

    return super().save(**kwargs)
class ServerCreateDocumentSerializer(serializers.Serializer):
"""

View File

@@ -1,5 +1,5 @@
"""
Test file uploads API endpoint for users in impress's core app.
Test media-auth authorization API endpoint in docs core app.
"""
from io import BytesIO

View File

@@ -1,5 +1,10 @@
"""Test util base64_yjs_to_text."""
import base64
import uuid
import y_py
from core import utils
# This base64 string is an example of what is saved in the database.
@@ -35,3 +40,38 @@ def test_utils_base64_yjs_to_xml():
or '<heading "textAlignment"="left" "level"="1">Hello</heading>' in content
)
assert '<bulletListItem "textAlignment"="left">world</bulletListItem>' in content
def test_utils_extract_attachments():
    """
    Attachment keys served under "/media/" should be extracted from the content,
    whether the url sits in an attribute or in a text node.
    """
    document_id = uuid.uuid4()

    image_key1 = f"{document_id!s}/attachments/{uuid.uuid4()!s}.png"
    image_url1 = f"http://localhost/media/{image_key1:s}"

    image_key2 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
    image_url2 = f"http://localhost/{image_key2:s}"

    image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png"
    image_url3 = f"http://localhost/media/{image_key3:s}"

    ydoc = y_py.YDoc()  # pylint: disable=no-member
    with ydoc.begin_transaction() as txn:
        root = ydoc.get_xml_element("document-store")

        # Two image nodes carrying their url in the "src" attribute
        for source_url in (image_url1, image_url2):
            root.push_xml_element(txn, "image").set_attribute(
                txn, "src", source_url
            )

        # One paragraph carrying the url as plain text
        paragraph = root.push_xml_element(txn, "paragraph")
        paragraph.push_xml_text(txn).push(txn, image_url3)

    state = y_py.encode_state_as_update(ydoc)  # pylint: disable=no-member
    encoded_content = base64.b64encode(state).decode("utf-8")

    # image_key2 is missing the "/media/" part and shouldn't get extracted
    assert utils.extract_attachments(encoded_content) == [image_key1, image_key3]

View File

@@ -69,5 +69,8 @@ def base64_yjs_to_text(base64_string):
def extract_attachments(content):
    """
    Return the media paths referenced in a document's content.

    The content is converted to XML with `base64_yjs_to_xml` and every match
    of `enums.MEDIA_STORAGE_URL_EXTRACT` is returned, in order of appearance.
    Empty content (None or "") yields an empty list.
    """
    if content:
        return re.findall(
            enums.MEDIA_STORAGE_URL_EXTRACT, base64_yjs_to_xml(content)
        )

    # Nothing to decode
    return []