2024-12-27 10:19:16 +01:00
|
|
|
"""Utils for the core app."""
|
|
|
|
|
|
|
|
|
|
import base64
|
2026-02-11 18:51:45 +01:00
|
|
|
import logging
|
2025-01-20 10:23:18 +01:00
|
|
|
import re
|
2026-02-11 18:51:45 +01:00
|
|
|
import time
|
2025-08-06 17:35:38 +02:00
|
|
|
from collections import defaultdict
|
2024-12-27 10:19:16 +01:00
|
|
|
|
2026-02-11 18:51:45 +01:00
|
|
|
from django.core.cache import cache
|
|
|
|
|
from django.db import models as db
|
|
|
|
|
from django.db.models import Subquery
|
|
|
|
|
|
2025-03-26 23:23:59 +01:00
|
|
|
import pycrdt
|
2024-12-27 10:19:16 +01:00
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
2026-02-11 18:51:45 +01:00
|
|
|
from core import enums, models
|
|
|
|
|
|
|
|
|
|
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)
|
2025-01-20 10:23:18 +01:00
|
|
|
|
|
|
|
|
|
2025-07-24 12:31:20 +02:00
|
|
|
def get_ancestor_to_descendants_map(paths, steplen):
    """
    Build a mapping from each ancestor path to the set of given paths below it.

    Paths use the materialized-path format with fixed-length segments: every
    prefix whose length is a multiple of ``steplen`` identifies an ancestor,
    and a path counts as a descendant of itself.

    Args:
        paths (list of str): Full document paths.
        steplen (int): Length of one path segment.

    Returns:
        dict[str, set[str]]: Mapping from ancestor path to the paths from
        ``paths`` located under it (each path included under itself).
    """
    mapping = defaultdict(set)
    for full_path in paths:
        # Number of complete segments in this path.
        depth = len(full_path) // steplen
        for level in range(1, depth + 1):
            mapping[full_path[: level * steplen]].add(full_path)
    return mapping
|
|
|
|
|
|
|
|
|
|
|
2025-01-20 10:23:18 +01:00
|
|
|
def filter_descendants(paths, root_paths, skip_sorting=False):
    """
    Filters paths to keep only those that are descendants of any path in root_paths.

    A path is considered a descendant of a root path if it starts with the root
    path (so a root path is a descendant of itself). The single-pass merge below
    requires both sequences in lexicographic order; unless `skip_sorting` is
    True, sorted copies are made first. The caller's inputs are never mutated,
    and any iterable of strings is accepted.

    Args:
        paths (iterable of str): Paths to be filtered.
        root_paths (iterable of str): Paths to check as potential prefixes.
        skip_sorting (bool): If True, assumes both `paths` and `root_paths` are already sorted.

    Returns:
        list of str: A list of sorted paths that are descendants of any path in `root_paths`.
    """
    # Work on copies: the previous implementation called .sort() on the
    # caller's lists, mutating them in place and crashing on non-list
    # iterables despite the documented "iterable of str" contract.
    if skip_sorting:
        root_paths = list(root_paths)  # needs len() and indexing below
    else:
        paths = sorted(paths)
        root_paths = sorted(root_paths)

    results = []
    i = 0
    n = len(root_paths)

    for path in paths:
        # Advance through roots until one prefixes `path` or we pass it.
        while i < n:
            if path.startswith(root_paths[i]):
                results.append(path)
                break
            if root_paths[i] < path:
                i += 1
            else:
                # root_paths[i] > path: no remaining root can prefix this path.
                break

    return results
|
|
|
|
|
|
2024-12-27 10:19:16 +01:00
|
|
|
|
|
|
|
|
def base64_yjs_to_xml(base64_string):
    """
    Extract the XML representation from a base64-encoded Yjs document.

    Args:
        base64_string (str): Base64-encoded Yjs document update.

    Returns:
        str: XML serialization of the document's "document-store" fragment.
    """
    decoded_bytes = base64.b64decode(base64_string)

    # Rebuild the CRDT document from the raw update, then read its root
    # XML fragment.
    doc = pycrdt.Doc()
    doc.apply_update(decoded_bytes)
    return str(doc.get("document-store", type=pycrdt.XmlFragment))
|
2024-12-27 10:19:16 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def base64_yjs_to_text(base64_string):
    """Extract plain text from a base64-encoded Yjs document."""
    xml_fragment = base64_yjs_to_xml(base64_string)
    parsed = BeautifulSoup(xml_fragment, "lxml-xml")
    # Join text nodes with spaces and trim surrounding whitespace.
    return parsed.get_text(separator=" ", strip=True)
|
2025-01-20 10:23:18 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_attachments(content):
    """Helper method to extract media paths from a document's content."""
    # Empty or missing content holds no attachments.
    if not content:
        return []

    return re.findall(enums.MEDIA_STORAGE_URL_EXTRACT, base64_yjs_to_xml(content))
|
2026-02-11 18:51:45 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_users_sharing_documents_with_cache_key(user):
    """Return the per-user cache key for the document-sharing map."""
    return "users_sharing_documents_with_" + str(user.id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def users_sharing_documents_with(user):
    """
    Return a map of user id -> most recent sharing date for every user who
    shares at least one document with the given user, cached for one day.
    """
    started_at = time.time()
    cache_key = get_users_sharing_documents_with_cache_key(user)

    cached = cache.get(cache_key)
    if cached is not None:
        logger.info(
            "users_sharing_documents_with cache hit for user %s (took %.3fs)",
            user.id,
            time.time() - started_at,
        )
        return cached

    # Documents the given user has access to.
    own_document_ids = models.DocumentAccess.objects.filter(user=user).values_list(
        "document_id", flat=True
    )
    # Other users holding an access on those documents, with the most
    # recent access creation date per user.
    accesses = (
        models.DocumentAccess.objects.filter(document_id__in=Subquery(own_document_ids))
        .exclude(user=user)
        .values("user")
        .annotate(last_shared=db.Max("created_at"))
    )
    sharing_map = {row["user"]: row["last_shared"] for row in accesses}

    cache.set(cache_key, sharing_map, 86400)  # Cache for 1 day
    logger.info(
        "users_sharing_documents_with cache miss for user %s (took %.3fs)",
        user.id,
        time.time() - started_at,
    )
    return sharing_map
|