(documents) add content field as an S3 object

The content field is a writable property on the model which is persisted
in object storage. We take advantage of the versioning, robustness and
scalability of S3.
This commit is contained in:
Samuel Paccoud - DINUM
2024-04-06 09:09:46 +02:00
committed by Anthony LC
parent 4956beb040
commit 8e262da8f5
11 changed files with 164 additions and 44 deletions

View File

@@ -59,7 +59,6 @@ services:
- "8071:8000"
volumes:
- ./src/backend:/app
- ./data/media:/data/media
- ./data/static:/data/static
depends_on:
- postgresql
@@ -78,7 +77,6 @@ services:
- env.d/development/postgresql
volumes:
- ./src/backend:/app
- ./data/media:/data/media
- ./data/static:/data/static
depends_on:
- app-dev
@@ -96,8 +94,6 @@ services:
env_file:
- env.d/development/common
- env.d/development/postgresql
volumes:
- ./data/media:/data/media
depends_on:
- postgresql
- redis

View File

@@ -16,6 +16,12 @@ DJANGO_EMAIL_PORT=1025
# Backend url
IMPRESS_BASE_URL="http://localhost:8072"
# Media
STORAGES_STATICFILES_BACKEND=django.contrib.staticfiles.storage.StaticFilesStorage
AWS_S3_ENDPOINT_URL=http://minio:9000
AWS_S3_ACCESS_KEY_ID=impress
AWS_S3_SECRET_ACCESS_KEY=password
# OIDC
OIDC_OP_JWKS_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/certs
OIDC_OP_AUTHORIZATION_ENDPOINT=http://localhost:8083/realms/impress/protocol/openid-connect/auth

View File

@@ -0,0 +1,24 @@
"""A JSONField for DRF to handle serialization/deserialization."""
import json
from rest_framework import serializers
class JSONField(serializers.Field):
"""
A custom field for handling JSON data.
"""
def to_representation(self, value):
"""
Convert the JSON string to a Python dictionary for serialization.
"""
return value
def to_internal_value(self, data):
"""
Convert the Python dictionary to a JSON string for deserialization.
"""
if data is None:
return None
return json.dumps(data)

View File

@@ -7,6 +7,8 @@ from timezone_field.rest_framework import TimeZoneSerializerField
from core import models
from .fields import JSONField
class UserSerializer(serializers.ModelSerializer):
"""Serialize users."""
@@ -134,6 +136,8 @@ class BaseResourceSerializer(serializers.ModelSerializer):
class DocumentSerializer(BaseResourceSerializer):
"""Serialize documents."""
content = JSONField(required=False)
class Meta:
model = models.Document
fields = ["id", "content", "title", "accesses", "abilities", "is_public"]

View File

@@ -35,6 +35,7 @@ class DocumentFactory(factory.django.DjangoModelFactory):
title = factory.Sequence(lambda n: f"document{n}")
is_public = factory.Faker("boolean")
content = factory.LazyFunction(lambda: {"foo": fake.word()})
@factory.post_generation
def users(self, create, extracted, **kwargs):

View File

@@ -1,6 +1,8 @@
"""
Declare and configure the models for the impress core application
"""
import hashlib
import json
import textwrap
import uuid
@@ -8,6 +10,8 @@ from django.conf import settings
from django.contrib.auth import models as auth_models
from django.contrib.auth.base_user import AbstractBaseUser
from django.core import mail, validators
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from django.db import models
from django.template.base import Template as DjangoTemplate
from django.template.context import Context
@@ -249,6 +253,8 @@ class Document(BaseModel):
help_text=_("Whether this document is public for anyone to use."),
)
_content = None
class Meta:
db_table = "impress_document"
ordering = ("title",)
@@ -258,6 +264,49 @@ class Document(BaseModel):
def __str__(self):
return self.title
@property
def content(self):
"""Return the json content from object storage if available"""
if self._content is None and self.id:
try:
# Load content from object storage
with default_storage.open(str(self.id)) as f:
self._content = json.load(f)
except FileNotFoundError:
pass
return self._content
@content.setter
def content(self, content):
"""Cache the content, don't write to object storage yet"""
if isinstance(content, str):
content = json.loads(content)
if not isinstance(content, dict):
raise ValueError("content should be a json object.")
self._content = content
def save(self, *args, **kwargs):
"""Write content to object storage only if _content has changed."""
super().save(*args, **kwargs)
if self._content:
file_key = str(self.pk)
bytes_content = json.dumps(self._content).encode("utf-8")
if default_storage.exists(file_key):
response = default_storage.connection.meta.client.head_object(
Bucket=default_storage.bucket_name, Key=file_key
)
has_changed = (
response["ETag"].strip('"')
!= hashlib.md5(bytes_content).hexdigest() # noqa
)
else:
has_changed = True
if has_changed:
content_file = ContentFile(bytes_content)
default_storage.save(file_key, content_file)
def get_abilities(self, user):
"""
Compute and return abilities for a given user on the document.

View File

@@ -28,6 +28,7 @@ def test_api_documents_retrieve_anonymous_public():
"accesses": [],
"title": document.title,
"is_public": True,
"content": {"foo": document.content["foo"]},
}
@@ -69,6 +70,7 @@ def test_api_documents_retrieve_authenticated_unrelated_public():
"accesses": [],
"title": document.title,
"is_public": True,
"content": {"foo": document.content["foo"]},
}
@@ -132,6 +134,7 @@ def test_api_documents_retrieve_authenticated_related_direct():
assert response.json() == {
"id": str(document.id),
"title": document.title,
"content": {"foo": document.content["foo"]},
"abilities": document.get_abilities(user),
"is_public": document.is_public,
}
@@ -246,6 +249,7 @@ def test_api_documents_retrieve_authenticated_related_team_members(
assert response.json() == {
"id": str(document.id),
"title": document.title,
"content": {"foo": document.content["foo"]},
"abilities": document.get_abilities(user),
"is_public": False,
}
@@ -343,6 +347,7 @@ def test_api_documents_retrieve_authenticated_related_team_administrators(
assert response.json() == {
"id": str(document.id),
"title": document.title,
"content": {"foo": document.content["foo"]},
"abilities": document.get_abilities(user),
"is_public": False,
}
@@ -444,6 +449,7 @@ def test_api_documents_retrieve_authenticated_related_team_owners(
assert response.json() == {
"id": str(document.id),
"title": document.title,
"content": {"foo": document.content["foo"]},
"abilities": document.get_abilities(user),
"is_public": False,
}

View File

@@ -6,7 +6,7 @@ import random
import pytest
from rest_framework.test import APIClient
from core import factories
from core import factories, models
from core.api import serializers
from core.tests.conftest import TEAM, USER, VIA
@@ -138,7 +138,7 @@ def test_api_documents_update_authenticated_administrator_or_owner(
)
assert response.status_code == 200
document.refresh_from_db()
document = models.Document.objects.get(pk=document.pk)
document_values = serializers.DocumentSerializer(instance=document).data
for key, value in document_values.items():
if key in ["id", "accesses"]:
@@ -175,7 +175,7 @@ def test_api_documents_update_authenticated_owners(via, mock_user_get_teams):
)
assert response.status_code == 200
document.refresh_from_db()
document = models.Document.objects.get(pk=document.pk)
document_values = serializers.DocumentSerializer(instance=document).data
for key, value in document_values.items():
if key in ["id", "accesses"]:

View File

@@ -3,8 +3,10 @@ Unit tests for the Document model
"""
from django.contrib.auth.models import AnonymousUser
from django.core.exceptions import ValidationError
from django.core.files.storage import default_storage
import pytest
import requests
from core import factories, models
@@ -159,3 +161,47 @@ def test_models_documents_get_abilities_preset_role(django_assert_num_queries):
"manage_accesses": False,
"partial_update": False,
}
def test_models_documents_file_upload_to_minio():
"""Validate read/write from/to minio"""
document = factories.DocumentFactory()
document.content = {"foé": "çar"}
document.save()
# Check that the file exists in MinIO:
file_key = str(document.pk)
# - through the storage backend
assert default_storage.exists(file_key) is True
# - directly from minio
signed_url = default_storage.url(file_key)
response = requests.get(signed_url, timeout=1)
assert response.json() == {"foé": "çar"}
def test_models_documents_version_duplicate():
"""A new version should be created in object storage only if the content has changed."""
document = factories.DocumentFactory()
file_key = str(document.pk)
response = default_storage.connection.meta.client.list_object_versions(
Bucket=default_storage.bucket_name, Prefix=file_key
)
assert len(response["Versions"]) == 1
# Save again with the same content
document.save()
response = default_storage.connection.meta.client.list_object_versions(
Bucket=default_storage.bucket_name, Prefix=file_key
)
assert len(response["Versions"]) == 1
# Save modified content
document.content = {"foo": "spam"}
document.save()
response = default_storage.connection.meta.client.list_object_versions(
Bucket=default_storage.bucket_name, Prefix=file_key
)
assert len(response["Versions"]) == 2

View File

@@ -108,13 +108,35 @@ class Base(Configuration):
STORAGES = {
"default": {
"BACKEND": "django.core.files.storage.FileSystemStorage",
"BACKEND": "storages.backends.s3.S3Storage",
},
"staticfiles": {
"BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage",
"BACKEND": values.Value(
"whitenoise.storage.CompressedManifestStaticFilesStorage",
environ_name="STORAGES_STATICFILES_BACKEND",
),
},
}
# Media
AWS_S3_ENDPOINT_URL = values.Value(
environ_name="AWS_S3_ENDPOINT_URL", environ_prefix=None
)
AWS_S3_ACCESS_KEY_ID = values.Value(
environ_name="AWS_S3_ACCESS_KEY_ID", environ_prefix=None
)
AWS_S3_SECRET_ACCESS_KEY = values.Value(
environ_name="AWS_S3_SECRET_ACCESS_KEY", environ_prefix=None
)
AWS_S3_REGION_NAME = values.Value(
environ_name="AWS_S3_REGION_NAME", environ_prefix=None
)
AWS_STORAGE_BUCKET_NAME = values.Value(
"impress-media-storage",
environ_name="AWS_STORAGE_BUCKET_NAME",
environ_prefix=None,
)
# Internationalization
# https://docs.djangoproject.com/en/3.1/topics/i18n/
@@ -451,15 +473,6 @@ class Test(Base):
]
USE_SWAGGER = True
STORAGES = {
"default": {
"BACKEND": "django.core.files.storage.FileSystemStorage",
},
"staticfiles": {
"BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage",
},
}
CELERY_TASK_ALWAYS_EAGER = values.BooleanValue(True)
def __init__(self):
@@ -506,34 +519,9 @@ class Production(Base):
CSRF_COOKIE_SECURE = True
SESSION_COOKIE_SECURE = True
# For static files in production, we want to use a backend that includes a hash in
# the filename, that is calculated from the file content, so that browsers always
# get the updated version of each file.
STORAGES = {
"default": {
"BACKEND": "storages.backends.s3.S3Storage",
},
"staticfiles": {
# For static files in production, we want to use a backend that includes a hash in
# the filename, that is calculated from the file content, so that browsers always
# get the updated version of each file.
"BACKEND": values.Value(
"whitenoise.storage.CompressedManifestStaticFilesStorage",
environ_name="STORAGES_STATICFILES_BACKEND",
)
},
}
# Privacy
SECURE_REFERRER_POLICY = "same-origin"
# Media
AWS_S3_ENDPOINT_URL = values.Value()
AWS_S3_ACCESS_KEY_ID = values.Value()
AWS_S3_SECRET_ACCESS_KEY = values.Value()
AWS_STORAGE_BUCKET_NAME = values.Value("tf-default-impress-media-storage")
AWS_S3_REGION_NAME = values.Value()
CACHES = {
"default": {
"BACKEND": "django_redis.cache.RedisCache",

View File

@@ -34,7 +34,7 @@ dependencies = [
"django-parler==2.3",
"redis==5.0.3",
"django-redis==5.4.0",
"django-storages==1.14.2",
"django-storages[s3]==1.14.2",
"django-timezone-field>=5.1",
"django==5.0.3",
"djangorestframework==3.14.0",