🚑️(backend) command to update attachment content-type

The uploaded files in the system are missing
the content-type.
We add a command to update the content-type of
the existing uploaded files.
This command will run one time when we will deploy
to the environments.
This commit is contained in:
Anthony LC
2025-01-20 17:20:51 +01:00
committed by Anthony LC
parent 5b4b100e90
commit 67dc7feb98
4 changed files with 145 additions and 0 deletions

View File

View File

@@ -0,0 +1,95 @@
"""Management command updating the metadata for all the files in the MinIO bucket."""
from django.core.files.storage import default_storage
from django.core.management.base import BaseCommand
import magic
from core.models import Document
# pylint: disable=too-many-locals, broad-exception-caught
class Command(BaseCommand):
"""Update the metadata for all the files in the MinIO bucket."""
help = __doc__
def handle(self, *args, **options):
"""Execute management command."""
s3_client = default_storage.connection.meta.client
bucket_name = default_storage.bucket_name
mime_detector = magic.Magic(mime=True)
documents = Document.objects.all()
self.stdout.write(
f"[INFO] Found {documents.count()} documents. Starting ContentType fix..."
)
for doc in documents:
doc_id_str = str(doc.id)
prefix = f"{doc_id_str}/attachments/"
self.stdout.write(
f"[INFO] Processing attachments under prefix '{prefix}' ..."
)
continuation_token = None
total_updated = 0
while True:
list_kwargs = {"Bucket": bucket_name, "Prefix": prefix}
if continuation_token:
list_kwargs["ContinuationToken"] = continuation_token
response = s3_client.list_objects_v2(**list_kwargs)
# If no objects found under this prefix, break out of the loop
if "Contents" not in response:
break
for obj in response["Contents"]:
key = obj["Key"]
# Skip if it's a folder
if key.endswith("/"):
continue
try:
# Get existing metadata
head_resp = s3_client.head_object(Bucket=bucket_name, Key=key)
# Read first ~1KB for MIME detection
partial_obj = s3_client.get_object(
Bucket=bucket_name, Key=key, Range="bytes=0-1023"
)
partial_data = partial_obj["Body"].read()
# Detect MIME type
magic_mime_type = mime_detector.from_buffer(partial_data)
# Update ContentType
s3_client.copy_object(
Bucket=bucket_name,
CopySource={"Bucket": bucket_name, "Key": key},
Key=key,
ContentType=magic_mime_type,
Metadata=head_resp.get("Metadata", {}),
MetadataDirective="REPLACE",
)
total_updated += 1
except Exception as exc: # noqa
self.stderr.write(
f"[ERROR] Could not update ContentType for {key}: {exc}"
)
if response.get("IsTruncated"):
continuation_token = response.get("NextContinuationToken")
else:
break
if total_updated > 0:
self.stdout.write(
f"[INFO] -> Updated {total_updated} objects for Document {doc_id_str}."
)

View File

@@ -0,0 +1,50 @@
"""
Unit test for `update_files_content_type_metadata` command.
"""
import uuid
from django.core.files.storage import default_storage
from django.core.management import call_command
import pytest
from core import factories
@pytest.mark.django_db
def test_update_files_content_type_metadata():
"""
Test that the command `update_files_content_type_metadata`
fixes the ContentType of attachment in the storage.
"""
s3_client = default_storage.connection.meta.client
bucket_name = default_storage.bucket_name
# Create files with a wrong ContentType
keys = []
for _ in range(10):
doc_id = uuid.uuid4()
factories.DocumentFactory(id=doc_id)
key = f"{doc_id}/attachments/testfile.png"
keys.append(key)
fake_png = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR..."
s3_client.put_object(
Bucket=bucket_name,
Key=key,
Body=fake_png,
ContentType="text/plain",
Metadata={"owner": "None"},
)
# Call the command that fixes the ContentType
call_command("update_files_content_type_metadata")
for key in keys:
head_resp = s3_client.head_object(Bucket=bucket_name, Key=key)
assert (
head_resp["ContentType"] == "image/png"
), f"ContentType not fixed, got {head_resp['ContentType']!r}"
# Check that original metadata was preserved
assert head_resp["Metadata"].get("owner") == "None"