From 67dc7feb98466090422add289531905f39c7665a Mon Sep 17 00:00:00 2001
From: Anthony LC
Date: Mon, 20 Jan 2025 17:20:51 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=91=EF=B8=8F(backend)=20command=20to?=
 =?UTF-8?q?=20update=20attachment=20content-type?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The uploaded files in the system are missing the content-type.
We add a command to update the content-type of the existing
uploaded files. This command will run one time when we will
deploy to the environments.
---
 src/backend/core/management/__init__.py       |   0
 .../core/management/commands/__init__.py      |   0
 .../update_files_content_type_metadata.py     | 107 ++++++++++++++++++
 ...test_update_files_content_type_metadata.py |  56 +++++++++
 4 files changed, 163 insertions(+)
 create mode 100644 src/backend/core/management/__init__.py
 create mode 100644 src/backend/core/management/commands/__init__.py
 create mode 100644 src/backend/core/management/commands/update_files_content_type_metadata.py
 create mode 100644 src/backend/core/tests/commands/test_update_files_content_type_metadata.py

diff --git a/src/backend/core/management/__init__.py b/src/backend/core/management/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/backend/core/management/commands/__init__.py b/src/backend/core/management/commands/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/backend/core/management/commands/update_files_content_type_metadata.py b/src/backend/core/management/commands/update_files_content_type_metadata.py
new file mode 100644
index 00000000..bb2e5253
--- /dev/null
+++ b/src/backend/core/management/commands/update_files_content_type_metadata.py
@@ -0,0 +1,107 @@
+"""Management command updating the metadata for all the files in the MinIO bucket."""
+
+from django.core.files.storage import default_storage
+from django.core.management.base import BaseCommand
+
+import magic
+
+from core.models import Document
+
+# pylint: disable=too-many-locals, broad-exception-caught
+
+
+class Command(BaseCommand):
+    """Update the metadata for all the files in the MinIO bucket."""
+
+    help = __doc__
+
+    # A "REPLACE" copy resets every header that is not sent again, so these
+    # system-defined ones are read from the object and passed back to the copy.
+    preserved_headers = (
+        "CacheControl",
+        "ContentDisposition",
+        "ContentEncoding",
+        "ContentLanguage",
+    )
+
+    def handle(self, *args, **options):
+        """
+        Detect the MIME type of each attachment and store it as its ContentType.
+
+        A failure on one object is reported on stderr and does not stop the run.
+        """
+        s3_client = default_storage.connection.meta.client
+        bucket_name = default_storage.bucket_name
+        paginator = s3_client.get_paginator("list_objects_v2")
+        mime_detector = magic.Magic(mime=True)
+
+        # Only the ids are needed to build the prefixes
+        documents = Document.objects.values_list("id", flat=True)
+        self.stdout.write(
+            f"[INFO] Found {documents.count()} documents. Starting ContentType fix..."
+        )
+
+        for doc_id in documents:
+            prefix = f"{doc_id}/attachments/"
+            self.stdout.write(
+                f"[INFO] Processing attachments under prefix '{prefix}' ..."
+            )
+
+            total_updated = 0
+            # The paginator follows the continuation tokens of truncated listings
+            for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
+                # "Contents" is absent when no object matches the prefix
+                for obj in page.get("Contents", []):
+                    key = obj["Key"]
+
+                    # Skip if it's a folder
+                    if key.endswith("/"):
+                        continue
+
+                    try:
+                        self._fix_content_type(
+                            s3_client, bucket_name, key, mime_detector
+                        )
+                        total_updated += 1
+                    except Exception as exc:  # noqa
+                        self.stderr.write(
+                            f"[ERROR] Could not update ContentType for {key}: {exc}"
+                        )
+
+            if total_updated > 0:
+                self.stdout.write(
+                    f"[INFO] -> Updated {total_updated} objects for Document {doc_id}."
+                )
+
+    def _fix_content_type(self, s3_client, bucket_name, key, mime_detector):
+        """
+        Rewrite `key` in place with the ContentType detected from its first bytes.
+
+        The user metadata and the headers listed in `preserved_headers` are kept.
+        Errors raised by the storage client or the MIME detector are left to the caller.
+        """
+        # Get existing metadata
+        head_resp = s3_client.head_object(Bucket=bucket_name, Key=key)
+
+        # Read first ~1KB for MIME detection
+        partial_obj = s3_client.get_object(
+            Bucket=bucket_name, Key=key, Range="bytes=0-1023"
+        )
+        partial_data = partial_obj["Body"].read()
+
+        preserved = {
+            name: head_resp[name]
+            for name in self.preserved_headers
+            if head_resp.get(name)
+        }
+
+        # Update ContentType
+        s3_client.copy_object(
+            Bucket=bucket_name,
+            CopySource={"Bucket": bucket_name, "Key": key},
+            Key=key,
+            ContentType=mime_detector.from_buffer(partial_data),
+            Metadata=head_resp.get("Metadata", {}),
+            MetadataDirective="REPLACE",
+            **preserved,
+        )
diff --git a/src/backend/core/tests/commands/test_update_files_content_type_metadata.py b/src/backend/core/tests/commands/test_update_files_content_type_metadata.py
new file mode 100644
index 00000000..4ece3614
--- /dev/null
+++ b/src/backend/core/tests/commands/test_update_files_content_type_metadata.py
@@ -0,0 +1,56 @@
+"""
+Unit test for `update_files_content_type_metadata` command.
+"""
+
+import uuid
+
+from django.core.files.storage import default_storage
+from django.core.management import call_command
+
+import pytest
+
+from core import factories
+
+
+@pytest.mark.django_db
+def test_update_files_content_type_metadata():
+    """
+    Test that the command `update_files_content_type_metadata`
+    fixes the ContentType of attachment in the storage
+    without losing the metadata and headers set at upload time.
+    """
+    s3_client = default_storage.connection.meta.client
+    bucket_name = default_storage.bucket_name
+    content_disposition = 'attachment; filename="testfile.png"'
+
+    # Create files with a wrong ContentType
+    keys = []
+    for _ in range(10):
+        doc_id = uuid.uuid4()
+        factories.DocumentFactory(id=doc_id)
+        key = f"{doc_id}/attachments/testfile.png"
+        keys.append(key)
+        fake_png = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR..."
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=key,
+            Body=fake_png,
+            ContentType="text/plain",
+            ContentDisposition=content_disposition,
+            Metadata={"owner": "None"},
+        )
+
+    # Call the command that fixes the ContentType
+    call_command("update_files_content_type_metadata")
+
+    for key in keys:
+        head_resp = s3_client.head_object(Bucket=bucket_name, Key=key)
+        assert (
+            head_resp["ContentType"] == "image/png"
+        ), f"ContentType not fixed, got {head_resp['ContentType']!r}"
+
+        # Check that original metadata was preserved
+        assert head_resp["Metadata"].get("owner") == "None"
+
+        # Check that headers set at upload time survive the rewrite
+        assert head_resp["ContentDisposition"] == content_disposition