From 8e262da8f59cc1a2a2673e09bae6bb90ea67c060 Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Sat, 6 Apr 2024 09:09:46 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8(documents)=20add=20content=20field=20?= =?UTF-8?q?as=20an=20S3=20object?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The content field is a writable property on the model which is persisted in object storage. We take advantage of the versioning, robustness and scalability of S3. --- docker-compose.yml | 4 -- env.d/development/common.dist | 6 ++ src/backend/core/api/fields.py | 24 ++++++++ src/backend/core/api/serializers.py | 4 ++ src/backend/core/factories.py | 1 + src/backend/core/models.py | 49 +++++++++++++++ .../documents/test_api_documents_retrieve.py | 6 ++ .../documents/test_api_documents_update.py | 6 +- .../core/tests/test_models_documents.py | 46 ++++++++++++++ src/backend/impress/settings.py | 60 ++++++++----------- src/backend/pyproject.toml | 2 +- 11 files changed, 164 insertions(+), 44 deletions(-) create mode 100644 src/backend/core/api/fields.py diff --git a/docker-compose.yml b/docker-compose.yml index 7f40d6e5..4e6adbd0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -59,7 +59,6 @@ services: - "8071:8000" volumes: - ./src/backend:/app - - ./data/media:/data/media - ./data/static:/data/static depends_on: - postgresql @@ -78,7 +77,6 @@ services: - env.d/development/postgresql volumes: - ./src/backend:/app - - ./data/media:/data/media - ./data/static:/data/static depends_on: - app-dev @@ -96,8 +94,6 @@ services: env_file: - env.d/development/common - env.d/development/postgresql - volumes: - - ./data/media:/data/media depends_on: - postgresql - redis diff --git a/env.d/development/common.dist b/env.d/development/common.dist index 1f339ba3..712c4dbd 100644 --- a/env.d/development/common.dist +++ b/env.d/development/common.dist @@ -16,6 +16,12 @@ DJANGO_EMAIL_PORT=1025 # Backend url IMPRESS_BASE_URL="http://localhost:8072" +# Media +STORAGES_STATICFILES_BACKEND=django.contrib.staticfiles.storage.StaticFilesStorage +AWS_S3_ENDPOINT_URL=http://minio:9000 +AWS_S3_ACCESS_KEY_ID=impress +AWS_S3_SECRET_ACCESS_KEY=password + # OIDC OIDC_OP_JWKS_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/certs OIDC_OP_AUTHORIZATION_ENDPOINT=http://localhost:8083/realms/impress/protocol/openid-connect/auth diff --git a/src/backend/core/api/fields.py b/src/backend/core/api/fields.py new file mode 100644 index 00000000..9c2424b7 --- /dev/null +++ b/src/backend/core/api/fields.py @@ -0,0 +1,24 @@ +"""A JSONField for DRF to handle serialization/deserialization.""" +import json + +from rest_framework import serializers + + +class JSONField(serializers.Field): + """ + A custom field for handling JSON data. + """ + + def to_representation(self, value): + """ + Convert the JSON string to a Python dictionary for serialization. + """ + return value + + def to_internal_value(self, data): + """ + Convert the Python dictionary to a JSON string for deserialization. + """ + if data is None: + return None + return json.dumps(data) diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py index e695a295..4318b14f 100644 --- a/src/backend/core/api/serializers.py +++ b/src/backend/core/api/serializers.py @@ -7,6 +7,8 @@ from timezone_field.rest_framework import TimeZoneSerializerField from core import models +from .fields import JSONField + class UserSerializer(serializers.ModelSerializer): """Serialize users.""" @@ -134,6 +136,8 @@ class BaseResourceSerializer(serializers.ModelSerializer): class DocumentSerializer(BaseResourceSerializer): """Serialize documents.""" + content = JSONField(required=False) + class Meta: model = models.Document fields = ["id", "content", "title", "accesses", "abilities", "is_public"] diff --git a/src/backend/core/factories.py b/src/backend/core/factories.py index 790bce6b..3f6c0050 100644 --- a/src/backend/core/factories.py +++ b/src/backend/core/factories.py @@ -35,6 +35,7 @@ class DocumentFactory(factory.django.DjangoModelFactory): title = factory.Sequence(lambda n: f"document{n}") is_public = factory.Faker("boolean") + content = factory.LazyFunction(lambda: {"foo": fake.word()}) @factory.post_generation def users(self, create, extracted, **kwargs): diff --git a/src/backend/core/models.py b/src/backend/core/models.py index 671fc2c7..5c60e4d4 100644 --- a/src/backend/core/models.py +++ b/src/backend/core/models.py @@ -1,6 +1,8 @@ """ Declare and configure the models for the impress core application """ +import hashlib +import json import textwrap import uuid @@ -8,6 +10,8 @@ from django.conf import settings from django.contrib.auth import models as auth_models from django.contrib.auth.base_user import AbstractBaseUser from django.core import mail, validators +from django.core.files.base import ContentFile +from django.core.files.storage import default_storage from django.db import models from django.template.base import Template as DjangoTemplate from django.template.context import Context @@ -249,6 +253,8 @@ class Document(BaseModel): help_text=_("Whether this document is public for anyone to use."), ) + _content = None + class Meta: db_table = "impress_document" ordering = ("title",) @@ -258,6 +264,49 @@ class Document(BaseModel): def __str__(self): return self.title + @property + def content(self): + """Return the json content from object storage if available""" + if self._content is None and self.id: + try: + # Load content from object storage + with default_storage.open(str(self.id)) as f: + self._content = json.load(f) + except FileNotFoundError: + pass + return self._content + + @content.setter + def content(self, content): + """Cache the content, don't write to object storage yet""" + if isinstance(content, str): + content = json.loads(content) + if not isinstance(content, dict): + raise ValueError("content should be a json object.") + self._content = content + + def save(self, *args, **kwargs): + """Write content to object storage only if _content has changed.""" + super().save(*args, **kwargs) + + if self._content: + file_key = str(self.pk) + bytes_content = json.dumps(self._content).encode("utf-8") + + if default_storage.exists(file_key): + response = default_storage.connection.meta.client.head_object( + Bucket=default_storage.bucket_name, Key=file_key + ) + has_changed = ( + response["ETag"].strip('"') + != hashlib.md5(bytes_content).hexdigest() # noqa + ) + else: + has_changed = True + if has_changed: + content_file = ContentFile(bytes_content) + default_storage.save(file_key, content_file) + def get_abilities(self, user): """ Compute and return abilities for a given user on the document. diff --git a/src/backend/core/tests/documents/test_api_documents_retrieve.py b/src/backend/core/tests/documents/test_api_documents_retrieve.py index 58585bdf..4e8f28f5 100644 --- a/src/backend/core/tests/documents/test_api_documents_retrieve.py +++ b/src/backend/core/tests/documents/test_api_documents_retrieve.py @@ -28,6 +28,7 @@ def test_api_documents_retrieve_anonymous_public(): "accesses": [], "title": document.title, "is_public": True, + "content": {"foo": document.content["foo"]}, } @@ -69,6 +70,7 @@ def test_api_documents_retrieve_authenticated_unrelated_public(): "accesses": [], "title": document.title, "is_public": True, + "content": {"foo": document.content["foo"]}, } @@ -132,6 +134,7 @@ def test_api_documents_retrieve_authenticated_related_direct(): assert response.json() == { "id": str(document.id), "title": document.title, + "content": {"foo": document.content["foo"]}, "abilities": document.get_abilities(user), "is_public": document.is_public, } @@ -246,6 +249,7 @@ def test_api_documents_retrieve_authenticated_related_team_members( assert response.json() == { "id": str(document.id), "title": document.title, + "content": {"foo": document.content["foo"]}, "abilities": document.get_abilities(user), "is_public": False, } @@ -343,6 +347,7 @@ def test_api_documents_retrieve_authenticated_related_team_administrators( assert response.json() == { "id": str(document.id), "title": document.title, + "content": {"foo": document.content["foo"]}, "abilities": document.get_abilities(user), "is_public": False, } @@ -444,6 +449,7 @@ def test_api_documents_retrieve_authenticated_related_team_owners( assert response.json() == { "id": str(document.id), "title": document.title, + "content": {"foo": document.content["foo"]}, "abilities": document.get_abilities(user), "is_public": False, } diff --git a/src/backend/core/tests/documents/test_api_documents_update.py b/src/backend/core/tests/documents/test_api_documents_update.py index 8d1c98fa..fb2981c6 100644 --- a/src/backend/core/tests/documents/test_api_documents_update.py +++ b/src/backend/core/tests/documents/test_api_documents_update.py @@ -6,7 +6,7 @@ import random import pytest from rest_framework.test import APIClient -from core import factories +from core import factories, models from core.api import serializers from core.tests.conftest import TEAM, USER, VIA @@ -138,7 +138,7 @@ def test_api_documents_update_authenticated_administrator_or_owner( ) assert response.status_code == 200 - document.refresh_from_db() + document = models.Document.objects.get(pk=document.pk) document_values = serializers.DocumentSerializer(instance=document).data for key, value in document_values.items(): if key in ["id", "accesses"]: @@ -175,7 +175,7 @@ def test_api_documents_update_authenticated_owners(via, mock_user_get_teams): ) assert response.status_code == 200 - document.refresh_from_db() + document = models.Document.objects.get(pk=document.pk) document_values = serializers.DocumentSerializer(instance=document).data for key, value in document_values.items(): if key in ["id", "accesses"]: diff --git a/src/backend/core/tests/test_models_documents.py b/src/backend/core/tests/test_models_documents.py index b4c1f554..2fc04eb1 100644 --- a/src/backend/core/tests/test_models_documents.py +++ b/src/backend/core/tests/test_models_documents.py @@ -3,8 +3,10 @@ Unit tests for the Document model """ from django.contrib.auth.models import AnonymousUser from django.core.exceptions import ValidationError +from django.core.files.storage import default_storage import pytest +import requests from core import factories, models @@ -159,3 +161,47 @@ def test_models_documents_get_abilities_preset_role(django_assert_num_queries): "manage_accesses": False, "partial_update": False, } + + +def test_models_documents_file_upload_to_minio(): + """Validate read/write from/to minio""" + document = factories.DocumentFactory() + document.content = {"foé": "çar"} + document.save() + + # Check that the file exists in MinIO: + file_key = str(document.pk) + # - through the storage backend + assert default_storage.exists(file_key) is True + # - directly from minio + signed_url = default_storage.url(file_key) + response = requests.get(signed_url, timeout=1) + assert response.json() == {"foé": "çar"} + + +def test_models_documents_version_duplicate(): + """A new version should be created in object storage only if the content has changed.""" + document = factories.DocumentFactory() + + file_key = str(document.pk) + response = default_storage.connection.meta.client.list_object_versions( + Bucket=default_storage.bucket_name, Prefix=file_key + ) + assert len(response["Versions"]) == 1 + + # Save again with the same content + document.save() + + response = default_storage.connection.meta.client.list_object_versions( + Bucket=default_storage.bucket_name, Prefix=file_key + ) + assert len(response["Versions"]) == 1 + + # Save modified content + document.content = {"foo": "spam"} + document.save() + + response = default_storage.connection.meta.client.list_object_versions( + Bucket=default_storage.bucket_name, Prefix=file_key + ) + assert len(response["Versions"]) == 2 diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py index 0b98310d..462109d5 100755 --- a/src/backend/impress/settings.py +++ b/src/backend/impress/settings.py @@ -108,13 +108,35 @@ class Base(Configuration): STORAGES = { "default": { - "BACKEND": "django.core.files.storage.FileSystemStorage", + "BACKEND": "storages.backends.s3.S3Storage", }, "staticfiles": { - "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage", + "BACKEND": values.Value( + "whitenoise.storage.CompressedManifestStaticFilesStorage", + environ_name="STORAGES_STATICFILES_BACKEND", + ), }, } + # Media + AWS_S3_ENDPOINT_URL = values.Value( + environ_name="AWS_S3_ENDPOINT_URL", environ_prefix=None + ) + AWS_S3_ACCESS_KEY_ID = values.Value( + environ_name="AWS_S3_ACCESS_KEY_ID", environ_prefix=None + ) + AWS_S3_SECRET_ACCESS_KEY = values.Value( + environ_name="AWS_S3_SECRET_ACCESS_KEY", environ_prefix=None + ) + AWS_S3_REGION_NAME = values.Value( + environ_name="AWS_S3_REGION_NAME", environ_prefix=None + ) + AWS_STORAGE_BUCKET_NAME = values.Value( + "impress-media-storage", + environ_name="AWS_STORAGE_BUCKET_NAME", + environ_prefix=None, + ) + # Internationalization # https://docs.djangoproject.com/en/3.1/topics/i18n/ @@ -451,15 +473,6 @@ class Test(Base): ] USE_SWAGGER = True - STORAGES = { - "default": { - "BACKEND": "django.core.files.storage.FileSystemStorage", - }, - "staticfiles": { - "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage", - }, - } - CELERY_TASK_ALWAYS_EAGER = values.BooleanValue(True) def __init__(self): @@ -506,34 +519,9 @@ class Production(Base): CSRF_COOKIE_SECURE = True SESSION_COOKIE_SECURE = True - # For static files in production, we want to use a backend that includes a hash in - # the filename, that is calculated from the file content, so that browsers always - # get the updated version of each file. - STORAGES = { - "default": { - "BACKEND": "storages.backends.s3.S3Storage", - }, - "staticfiles": { - # For static files in production, we want to use a backend that includes a hash in - # the filename, that is calculated from the file content, so that browsers always - # get the updated version of each file. - "BACKEND": values.Value( - "whitenoise.storage.CompressedManifestStaticFilesStorage", - environ_name="STORAGES_STATICFILES_BACKEND", - ) - }, - } - # Privacy SECURE_REFERRER_POLICY = "same-origin" - # Media - AWS_S3_ENDPOINT_URL = values.Value() - AWS_S3_ACCESS_KEY_ID = values.Value() - AWS_S3_SECRET_ACCESS_KEY = values.Value() - AWS_STORAGE_BUCKET_NAME = values.Value("tf-default-impress-media-storage") - AWS_S3_REGION_NAME = values.Value() - CACHES = { "default": { "BACKEND": "django_redis.cache.RedisCache", diff --git a/src/backend/pyproject.toml b/src/backend/pyproject.toml index 7e979f4a..2233b421 100644 --- a/src/backend/pyproject.toml +++ b/src/backend/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "django-parler==2.3", "redis==5.0.3", "django-redis==5.4.0", - "django-storages==1.14.2", + "django-storages[s3]==1.14.2", "django-timezone-field>=5.1", "django==5.0.3", "djangorestframework==3.14.0",