(backend) add soft delete to documents and refactor db queryset

Now that we have introduced a document tree structure, it is not
possible to allow deleting documents anymore as it impacts the whole
subtree below the deleted document and the consequences are too big.

We introduce soft delete in order to give a second thought to the
document's owner (who is the only one to be allowed to delete a
document). After a document is soft deleted, the owner can still
see it in the trashbin (/api/v1.0/documents/trashbin).
After a grace period (30 days be default) the document disappears
from the trashbin and can't be restored anymore. Note that even
then it is still kept in database. Cleaning the database to erase
deleted documents after the grace period can be done as a maintenance
script.
This commit is contained in:
Samuel Paccoud - DINUM
2025-01-02 17:20:09 +01:00
committed by Anthony LC
parent 4de03d292a
commit 8ccfdb3c6a
21 changed files with 1373 additions and 252 deletions

View File

@@ -24,7 +24,7 @@ class DocumentFilter(django_filters.FilterSet):
class Meta:
model = models.Document
fields = ["is_creator_me", "is_favorite", "link_reach", "title"]
fields = ["is_creator_me", "is_favorite", "title"]
# pylint: disable=unused-argument
def filter_is_creator_me(self, queryset, name, value):
@@ -63,7 +63,4 @@ class DocumentFilter(django_filters.FilterSet):
if not user.is_authenticated:
return queryset
if value:
return queryset.filter(favorited_by_users__user=user)
return queryset.exclude(favorited_by_users__user=user)
return queryset.filter(is_favorite=bool(value))

View File

@@ -2,10 +2,11 @@
from django.core import exceptions
from django.db.models import Q
from django.http import Http404
from rest_framework import permissions
from core.models import DocumentAccess, RoleChoices
from core.models import DocumentAccess, RoleChoices, get_trashbin_cutoff
ACTION_FOR_METHOD_TO_PERMISSION = {
"versions_detail": {"DELETE": "versions_destroy", "GET": "versions_retrieve"},
@@ -110,3 +111,26 @@ class AccessPermission(permissions.BasePermission):
except KeyError:
pass
return abilities.get(action, False)
class DocumentAccessPermission(AccessPermission):
"""Subclass to handle soft deletion specificities."""
def has_object_permission(self, request, view, obj):
"""
Return a 404 on deleted documents
- for which the trashbin cutoff is past
- for which the current user is not owner of the document or one of its ancestors
"""
if (
deleted_at := obj.ancestors_deleted_at
) and deleted_at < get_trashbin_cutoff():
raise Http404
# Compute permission first to ensure the "user_roles" attribute is set
has_permission = super().has_object_permission(request, view, obj)
if obj.ancestors_deleted_at and not RoleChoices.OWNER in obj.user_roles:
raise Http404
return has_permission

View File

@@ -11,6 +11,29 @@ import botocore
from rest_framework.throttling import BaseThrottle
def filter_root_paths(paths, skip_sorting=False):
"""
Filters root paths from a list of paths representing a tree structure.
A root path is defined as a path that is not a prefix of any other path.
Args:
paths (list of str): The list of paths.
Returns:
list of str: The filtered list of root paths.
"""
if not skip_sorting:
paths.sort()
root_paths = []
for path in paths:
# If the current path is not a prefix of the last added root path, add it
if not root_paths or not path.startswith(root_paths[-1]):
root_paths.append(path)
return root_paths
def generate_s3_authorization_headers(key):
"""
Generate authorization headers for an s3 object.

View File

@@ -8,6 +8,7 @@ from urllib.parse import urlparse
from django.conf import settings
from django.contrib.postgres.aggregates import ArrayAgg
from django.contrib.postgres.fields import ArrayField
from django.contrib.postgres.search import TrigramSimilarity
from django.core.exceptions import ValidationError
from django.core.files.storage import default_storage
@@ -29,7 +30,7 @@ from django.http import Http404
import rest_framework as drf
from botocore.exceptions import ClientError
from django_filters import rest_framework as drf_filters
from rest_framework import filters, status
from rest_framework import filters, status, viewsets
from rest_framework import response as drf_response
from rest_framework.permissions import AllowAny
@@ -56,7 +57,7 @@ COLLABORATION_WS_URL_PATTERN = re.compile(rf"(?:^|&)room=(?P<pk>{UUID_REGEX})(?:
# pylint: disable=too-many-ancestors
class NestedGenericViewSet(drf.viewsets.GenericViewSet):
class NestedGenericViewSet(viewsets.GenericViewSet):
"""
A generic Viewset aims to be used in a nested route context.
e.g: `/api/v1.0/resource_1/<resource_1_pk>/resource_2/<resource_2_pk>/`
@@ -137,7 +138,7 @@ class Pagination(drf.pagination.PageNumberPagination):
class UserViewSet(
drf.mixins.UpdateModelMixin, drf.viewsets.GenericViewSet, drf.mixins.ListModelMixin
drf.mixins.UpdateModelMixin, viewsets.GenericViewSet, drf.mixins.ListModelMixin
):
"""User ViewSet"""
@@ -315,118 +316,242 @@ class DocumentMetadata(drf.metadata.SimpleMetadata):
class DocumentViewSet(
drf.mixins.CreateModelMixin,
drf.mixins.DestroyModelMixin,
drf.mixins.ListModelMixin,
drf.mixins.UpdateModelMixin,
drf.viewsets.GenericViewSet,
viewsets.GenericViewSet,
):
"""
Document ViewSet for managing documents.
DocumentViewSet API.
Provides endpoints for creating, updating, and deleting documents,
along with filtering options.
This view set provides CRUD operations and additional actions for managing documents.
Supports filtering, ordering, and annotations for enhanced querying capabilities.
Filtering:
### API Endpoints:
1. **List**: Retrieve a paginated list of documents.
Example: GET /documents/?page=2
2. **Retrieve**: Get a specific document by its ID.
Example: GET /documents/{id}/
3. **Create**: Create a new document.
Example: POST /documents/
4. **Update**: Update a document by its ID.
Example: PUT /documents/{id}/
5. **Delete**: Soft delete a document by its ID.
Example: DELETE /documents/{id}/
### Additional Actions:
1. **Trashbin**: List soft deleted documents for a document owner
Example: GET /documents/{id}/trashbin/
2. **Children**: List or create child documents.
Example: GET, POST /documents/{id}/children/
3. **Versions List**: Retrieve version history of a document.
Example: GET /documents/{id}/versions/
4. **Version Detail**: Get or delete a specific document version.
Example: GET, DELETE /documents/{id}/versions/{version_id}/
5. **Favorite**: Get list of favorite documents for a user. Mark or unmark
a document as favorite.
Examples:
- GET /documents/favorite/
- POST, DELETE /documents/{id}/favorite/
6. **Create for Owner**: Create a document via server-to-server on behalf of a user.
Example: POST /documents/create-for-owner/
7. **Link Configuration**: Update document link configuration.
Example: PUT /documents/{id}/link-configuration/
8. **Attachment Upload**: Upload a file attachment for the document.
Example: POST /documents/{id}/attachment-upload/
9. **Media Auth**: Authorize access to document media.
Example: GET /documents/media-auth/
10. **Collaboration Auth**: Authorize access to the collaboration server for a document.
Example: GET /documents/collaboration-auth/
11. **AI Transform**: Apply a transformation action on a piece of text with AI.
Example: POST /documents/{id}/ai-transform/
Expected data:
- text (str): The input text.
- action (str): The transformation type, one of [prompt, correct, rephrase, summarize].
Returns: JSON response with the processed text.
Throttled by: AIDocumentRateThrottle, AIUserRateThrottle.
12. **AI Translate**: Translate a piece of text with AI.
Example: POST /documents/{id}/ai-translate/
Expected data:
- text (str): The input text.
- language (str): The target language, chosen from settings.LANGUAGES.
Returns: JSON response with the translated text.
Throttled by: AIDocumentRateThrottle, AIUserRateThrottle.
### Ordering: created_at, updated_at, is_favorite, title
Example:
- Ascending: GET /api/v1.0/documents/?ordering=created_at
- Desceding: GET /api/v1.0/documents/?ordering=-title
### Filtering:
- `is_creator_me=true`: Returns documents created by the current user.
- `is_creator_me=false`: Returns documents created by other users.
- `is_favorite=true`: Returns documents marked as favorite by the current user
- `is_favorite=false`: Returns documents not marked as favorite by the current user
- `title=hello`: Returns documents which title contains the "hello" string
Example Usage:
Example:
- GET /api/v1.0/documents/?is_creator_me=true&is_favorite=true
- GET /api/v1.0/documents/?is_creator_me=false&title=hello
### Annotations:
1. **is_favorite**: Indicates whether the document is marked as favorite by the current user.
2. **user_roles**: Roles the current user has on the document or its ancestors.
### Notes:
- Only the highest ancestor in a document hierarchy is shown in list views.
- Implements soft delete logic to retain document tree structures.
"""
filter_backends = [drf_filters.DjangoFilterBackend, filters.OrderingFilter]
filter_backends = [drf_filters.DjangoFilterBackend]
filterset_class = DocumentFilter
metadata_class = DocumentMetadata
ordering = ["-updated_at"]
ordering_fields = ["created_at", "is_favorite", "updated_at", "title"]
ordering_fields = ["created_at", "updated_at", "title"]
permission_classes = [
permissions.AccessPermission,
permissions.DocumentAccessPermission,
]
queryset = models.Document.objects.all()
serializer_class = serializers.DocumentSerializer
def get_serializer_class(self):
"""
Use ListDocumentSerializer for list actions, otherwise use DocumentSerializer.
Use ListDocumentSerializer for list actions; otherwise, use DocumentSerializer.
"""
if self.action == "list":
return serializers.ListDocumentSerializer
return self.serializer_class
def annotate_queryset(self, queryset):
"""Annotate document queryset with favorite and number of accesses."""
user = self.request.user
# Annotate the number of accesses taking into account ancestors
ancestor_accesses_query = (
models.DocumentAccess.objects.filter(
document__path=Left(OuterRef("path"), Length("document__path")),
)
.order_by()
.annotate(total_accesses=Func(Value("id"), function="COUNT"))
.values("total_accesses")
return (
serializers.ListDocumentSerializer
if self.action == "list"
else self.serializer_class
)
# Annotate with the number of accesses, default to 0 if no accesses exist
queryset = queryset.annotate(nb_accesses=Subquery(ancestor_accesses_query))
if not user.is_authenticated:
# If the user is not authenticated, annotate `is_favorite` as False
return queryset.annotate(is_favorite=Value(False))
# Annotate the queryset to indicate if the document is favorited by the current user
favorite_exists = models.DocumentFavorite.objects.filter(
document_id=OuterRef("pk"), user=user
)
return queryset.annotate(is_favorite=Exists(favorite_exists))
def get_queryset(self):
"""Optimize queryset to include favorite status for the current user."""
queryset = super().get_queryset()
queryset = self.annotate_queryset(queryset)
return queryset.distinct()
def list(self, request, *args, **kwargs):
"""Restrict resources returned by the list endpoint"""
queryset = self.filter_queryset(self.get_queryset())
def annotate_is_favorite(self, queryset):
"""
Annotate document queryset with the favorite status for the current user.
"""
user = self.request.user
if user.is_authenticated:
queryset = queryset.filter(
db.Q(accesses__user=user)
| db.Q(accesses__team__in=user.teams)
| (
db.Q(link_traces__user=user)
& ~db.Q(link_reach=models.LinkReachChoices.RESTRICTED)
favorite_exists_subquery = models.DocumentFavorite.objects.filter(
document_id=db.OuterRef("pk"), user=user
)
return queryset.annotate(is_favorite=db.Exists(favorite_exists_subquery))
return queryset.annotate(is_favorite=db.Value(False))
def annotate_user_roles(self, queryset):
"""
Annotate document queryset with the roles of the current user
on the document or its ancestors.
"""
user = self.request.user
output_field = ArrayField(base_field=db.CharField())
if user.is_authenticated:
user_roles_subquery = models.DocumentAccess.objects.filter(
db.Q(user=user) | db.Q(team__in=user.teams),
document__path=Left(db.OuterRef("path"), Length("document__path")),
).values_list("role", flat=True)
return queryset.annotate(
user_roles=db.Func(
user_roles_subquery, function="ARRAY", output_field=output_field
)
)
# Among the results, we may have documents that are ancestors/children of each other
# In this case we want to keep only the highest ancestor. Let's annotate, each document
# with the path of its highest ancestor within results so we can use it to filter
shortest_path = Subquery(
queryset.filter(path=Left(OuterRef("path"), Length("path")))
.order_by("path") # Get the shortest (root) path
.values("path")[:1]
)
queryset = queryset.annotate(root_path=shortest_path)
return queryset.annotate(
user_roles=db.Value([], output_field=output_field),
)
# Filter documents based on their shortest path (root path)
queryset = queryset.filter(
root_path=F(
"path"
) # Keep only documents who are the annotated highest ancestor
def get_queryset(self):
"""Get queryset performing all annotation and filtering on the document tree structure."""
user = self.request.user
queryset = super().get_queryset()
# Only list views need filtering and annotation
if self.detail:
return queryset
if not user.is_authenticated:
return queryset.none()
queryset = queryset.filter(ancestors_deleted_at__isnull=True)
# Filter documents to which the current user has access...
access_documents_ids = models.DocumentAccess.objects.filter(
db.Q(user=user) | db.Q(team__in=user.teams)
).values_list("document_id", flat=True)
# ...or that were previously accessed and are not restricted
traced_documents_ids = models.LinkTrace.objects.filter(user=user).values_list(
"document_id", flat=True
)
return queryset.filter(
db.Q(id__in=access_documents_ids)
| (
db.Q(id__in=traced_documents_ids)
& ~db.Q(link_reach=models.LinkReachChoices.RESTRICTED)
)
)
def filter_queryset(self, queryset):
"""Apply annotations and filters sequentially."""
filterset = DocumentFilter(
self.request.GET, queryset=queryset, request=self.request
)
filterset.is_valid()
filter_data = filterset.form.cleaned_data
# Filter as early as possible on fields that are available on the model
for field in ["is_creator_me", "title"]:
queryset = filterset.filters[field].filter(queryset, filter_data[field])
queryset = self.annotate_user_roles(queryset)
if self.action == "list":
# Among the results, we may have documents that are ancestors/descendants
# of each other. In this case we want to keep only the highest ancestors.
root_paths = utils.filter_root_paths(
queryset.order_by("path").values_list("path", flat=True),
skip_sorting=True,
)
queryset = queryset.filter(path__in=root_paths)
# Annotate the queryset with an attribute marking instances as highest ancestor
# in order to save some time while computing abilities in the instance
queryset = queryset.annotate(
is_highest_ancestor_for_user=db.Value(
True, output_field=db.BooleanField()
)
)
else:
queryset = queryset.none()
# Annotate favorite status and filter if applicable as late as possible
queryset = self.annotate_is_favorite(queryset)
queryset = filterset.filters["is_favorite"].filter(
queryset, filter_data["is_favorite"]
)
# Apply ordering only now that everyting is filtered and annotated
return filters.OrderingFilter().filter_queryset(self.request, queryset, self)
def get_response_for_queryset(self, queryset):
"""Return paginated response for the queryset if requested."""
page = self.paginate_queryset(queryset)
if page is not None:
serializer = self.get_serializer(page, many=True)
return self.get_paginated_response(serializer.data)
result = self.get_paginated_response(serializer.data)
return result
serializer = self.get_serializer(queryset, many=True)
return drf.response.Response(serializer.data)
@@ -437,20 +562,18 @@ class DocumentViewSet(
on a user's list view even though the user has no specific role in the document (link
access when the link reach configuration of the document allows it).
"""
user = self.request.user
instance = self.get_object()
serializer = self.get_serializer(instance)
if self.request.user.is_authenticated:
try:
# Add a trace that the user visited the document (this is needed to include
# the document in the user's list view)
models.LinkTrace.objects.create(
document=instance,
user=self.request.user,
)
except ValidationError:
# The trace already exists, so we just pass without doing anything
pass
# The `create` query generates 5 db queries which are much less efficient than an
# `exists` query. The user will visit the document many times after the first visit
# so that's what we should optimize for.
if (
user.is_authenticated
and not instance.link_traces.filter(user=user).exists()
):
models.LinkTrace.objects.create(document=instance, user=request.user)
return drf.response.Response(serializer.data)
@@ -467,6 +590,47 @@ class DocumentViewSet(
role=models.RoleChoices.OWNER,
)
def perform_destroy(self, instance):
"""Override to implement a soft delete instead of dumping the record in database."""
instance.soft_delete()
@drf.decorators.action(
detail=False,
methods=["get"],
)
def favorite_list(self, request, *args, **kwargs):
"""Get list of favorite documents for the current user."""
user = request.user
favorite_documents_ids = models.DocumentFavorite.objects.filter(
user=user
).values_list("document_id", flat=True)
queryset = self.get_queryset()
queryset = queryset.filter(id__in=favorite_documents_ids)
return self.get_response_for_queryset(queryset)
@drf.decorators.action(
detail=False,
methods=["get"],
serializer_class=serializers.ListDocumentSerializer,
)
def trashbin(self, request, *args, **kwargs):
"""
Retrieve soft-deleted documents for which the current user has the owner role.
The selected documents are those deleted within the cutoff period defined in the
settings (see TRASHBIN_CUTOFF_DAYS), before they are considered permanently deleted.
"""
queryset = self.queryset.filter(
deleted_at__isnull=False,
deleted_at__gte=models.get_trashbin_cutoff(),
)
queryset = self.annotate_user_roles(queryset)
queryset = queryset.filter(user_roles__contains=[models.RoleChoices.OWNER])
return self.get_response_for_queryset(queryset)
@drf.decorators.action(
authentication_classes=[authentication.ServerToServerAuthentication],
detail=False,
@@ -553,6 +717,7 @@ class DocumentViewSet(
@drf.decorators.action(
detail=True,
methods=["get", "post"],
ordering=["path"],
serializer_class=serializers.ListDocumentSerializer,
url_path="children",
)
@@ -586,16 +751,11 @@ class DocumentViewSet(
)
# GET: List children
queryset = document.get_children()
queryset = self.annotate_queryset(queryset)
page = self.paginate_queryset(queryset)
if page is not None:
serializer = self.get_serializer(page, many=True)
return self.get_paginated_response(serializer.data)
serializer = self.get_serializer(queryset, many=True)
return drf.response.Response(serializer.data)
queryset = document.get_children().filter(deleted_at__isnull=True)
queryset = self.filter_queryset(queryset)
queryset = self.annotate_is_favorite(queryset)
queryset = self.annotate_user_roles(queryset)
return self.get_response_for_queryset(queryset)
@drf.decorators.action(detail=True, methods=["get"], url_path="versions")
def versions_list(self, request, *args, **kwargs):
@@ -617,7 +777,7 @@ class DocumentViewSet(
# document. Filter to get the minimum access date for the logged-in user
access_queryset = models.DocumentAccess.objects.filter(
db.Q(user=user) | db.Q(team__in=user.teams),
document__path=Left(Value(document.path), Length("document__path")),
document__path=Left(db.Value(document.path), Length("document__path")),
).aggregate(min_date=db.Min("created_at"))
# Handle the case where the user has no accesses
@@ -657,7 +817,7 @@ class DocumentViewSet(
access.created_at
for access in models.DocumentAccess.objects.filter(
db.Q(user=user) | db.Q(team__in=user.teams),
document__path=Left(Value(document.path), Length("document__path")),
document__path=Left(db.Value(document.path), Length("document__path")),
)
)
@@ -948,7 +1108,7 @@ class DocumentAccessViewSet(
drf.mixins.ListModelMixin,
drf.mixins.RetrieveModelMixin,
drf.mixins.UpdateModelMixin,
drf.viewsets.GenericViewSet,
viewsets.GenericViewSet,
):
"""
API ViewSet for all interactions with document accesses.
@@ -1021,7 +1181,7 @@ class TemplateViewSet(
drf.mixins.DestroyModelMixin,
drf.mixins.RetrieveModelMixin,
drf.mixins.UpdateModelMixin,
drf.viewsets.GenericViewSet,
viewsets.GenericViewSet,
):
"""Template ViewSet"""
@@ -1045,14 +1205,14 @@ class TemplateViewSet(
user_roles_query = (
models.TemplateAccess.objects.filter(
Q(user=user) | Q(team__in=user.teams),
template_id=OuterRef("pk"),
db.Q(user=user) | db.Q(team__in=user.teams),
template_id=db.OuterRef("pk"),
)
.values("template")
.annotate(roles_array=ArrayAgg("role"))
.values("roles_array")
)
return queryset.annotate(user_roles=Subquery(user_roles_query)).distinct()
return queryset.annotate(user_roles=db.Subquery(user_roles_query)).distinct()
def list(self, request, *args, **kwargs):
"""Restrict templates returned by the list endpoint"""
@@ -1092,7 +1252,7 @@ class TemplateAccessViewSet(
drf.mixins.ListModelMixin,
drf.mixins.RetrieveModelMixin,
drf.mixins.UpdateModelMixin,
drf.viewsets.GenericViewSet,
viewsets.GenericViewSet,
):
"""
API ViewSet for all interactions with template accesses.
@@ -1132,7 +1292,7 @@ class InvitationViewset(
drf.mixins.RetrieveModelMixin,
drf.mixins.DestroyModelMixin,
drf.mixins.UpdateModelMixin,
drf.viewsets.GenericViewSet,
viewsets.GenericViewSet,
):
"""API ViewSet for user invitations to document.