2024-12-27 10:19:16 +01:00
|
|
|
"""Utils for the core app."""
|
|
|
|
|
|
|
|
|
|
import base64
|
2026-02-11 18:51:45 +01:00
|
|
|
import logging
|
2025-01-20 10:23:18 +01:00
|
|
|
import re
|
2026-02-11 18:51:45 +01:00
|
|
|
import time
|
2025-08-06 17:35:38 +02:00
|
|
|
from collections import defaultdict
|
2024-12-27 10:19:16 +01:00
|
|
|
|
2026-02-11 18:51:45 +01:00
|
|
|
from django.core.cache import cache
|
|
|
|
|
from django.db import models as db
|
|
|
|
|
from django.db.models import Subquery
|
|
|
|
|
|
2025-03-26 23:23:59 +01:00
|
|
|
import pycrdt
|
2024-12-27 10:19:16 +01:00
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
2026-02-11 18:51:45 +01:00
|
|
|
from core import enums, models
|
|
|
|
|
|
|
|
|
|
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)
|
2025-01-20 10:23:18 +01:00
|
|
|
|
|
|
|
|
|
2025-07-24 12:31:20 +02:00
|
|
|
def get_ancestor_to_descendants_map(paths, steplen):
    """
    Build a mapping from each ancestor path to the set of given paths below it.

    Paths use the materialized-path format with fixed-length segments: every
    prefix whose length is a multiple of ``steplen`` identifies an ancestor,
    and a path counts as a descendant of itself.

    Args:
        paths (list of str): Full document paths.
        steplen (int): Length of one path segment.

    Returns:
        dict[str, set[str]]: Mapping from ancestor path to the paths from
        ``paths`` located under it (each path included under itself).
    """
    mapping = defaultdict(set)
    for full_path in paths:
        # Number of complete segments in this path.
        depth = len(full_path) // steplen
        for level in range(1, depth + 1):
            mapping[full_path[: level * steplen]].add(full_path)
    return mapping
|
|
|
|
|
|
|
|
|
|
|
2025-01-20 10:23:18 +01:00
|
|
|
def filter_descendants(paths, root_paths, skip_sorting=False):
    """
    Filters paths to keep only those that are descendants of any path in root_paths.

    A path is considered a descendant of a root path if it starts with the root
    path (so a root path is a descendant of itself). The single-pass merge below
    requires both sequences in lexicographic order; unless `skip_sorting` is
    True, sorted copies are made first. The caller's inputs are never mutated,
    and any iterable of strings is accepted.

    Args:
        paths (iterable of str): Paths to be filtered.
        root_paths (iterable of str): Paths to check as potential prefixes.
        skip_sorting (bool): If True, assumes both `paths` and `root_paths` are already sorted.

    Returns:
        list of str: A list of sorted paths that are descendants of any path in `root_paths`.
    """
    # Work on copies: the previous implementation called .sort() on the
    # caller's lists, mutating them in place and crashing on non-list
    # iterables despite the documented "iterable of str" contract.
    if skip_sorting:
        root_paths = list(root_paths)  # needs len() and indexing below
    else:
        paths = sorted(paths)
        root_paths = sorted(root_paths)

    results = []
    i = 0
    n = len(root_paths)

    for path in paths:
        # Advance through roots until one prefixes `path` or we pass it.
        while i < n:
            if path.startswith(root_paths[i]):
                results.append(path)
                break
            if root_paths[i] < path:
                i += 1
            else:
                # root_paths[i] > path: no remaining root can prefix this path.
                break

    return results
|
|
|
|
|
|
2024-12-27 10:19:16 +01:00
|
|
|
|
|
|
|
|
def base64_yjs_to_xml(base64_string):
    """
    Extract the XML representation from a base64-encoded Yjs document.

    Args:
        base64_string (str): Base64-encoded Yjs document update.

    Returns:
        str: XML serialization of the document's "document-store" fragment.
    """
    decoded_bytes = base64.b64decode(base64_string)

    # Rebuild the CRDT document from the raw update, then read its root
    # XML fragment.
    doc = pycrdt.Doc()
    doc.apply_update(decoded_bytes)
    return str(doc.get("document-store", type=pycrdt.XmlFragment))
|
2024-12-27 10:19:16 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def base64_yjs_to_text(base64_string):
    """Extract plain text from a base64-encoded Yjs document."""
    xml_fragment = base64_yjs_to_xml(base64_string)
    parsed = BeautifulSoup(xml_fragment, "lxml-xml")
    # Join text nodes with spaces and trim surrounding whitespace.
    return parsed.get_text(separator=" ", strip=True)
|
2025-01-20 10:23:18 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_attachments(content):
    """Helper method to extract media paths from a document's content."""
    # Empty or missing content holds no attachments.
    if not content:
        return []

    return re.findall(enums.MEDIA_STORAGE_URL_EXTRACT, base64_yjs_to_xml(content))
|
2026-02-11 18:51:45 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_users_sharing_documents_with_cache_key(user):
    """Return the per-user cache key for the document-sharing map."""
    return "users_sharing_documents_with_" + str(user.id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def users_sharing_documents_with(user):
    """
    Return a map of user id -> most recent sharing date for every user who
    shares at least one document with the given user, cached for one day.
    """
    started_at = time.time()
    cache_key = get_users_sharing_documents_with_cache_key(user)

    cached = cache.get(cache_key)
    if cached is not None:
        logger.info(
            "users_sharing_documents_with cache hit for user %s (took %.3fs)",
            user.id,
            time.time() - started_at,
        )
        return cached

    # Documents the given user has access to.
    own_document_ids = models.DocumentAccess.objects.filter(user=user).values_list(
        "document_id", flat=True
    )
    # Other users holding an access on those documents, with the most
    # recent access creation date per user.
    accesses = (
        models.DocumentAccess.objects.filter(document_id__in=Subquery(own_document_ids))
        .exclude(user=user)
        .values("user")
        .annotate(last_shared=db.Max("created_at"))
    )
    sharing_map = {row["user"]: row["last_shared"] for row in accesses}

    cache.set(cache_key, sharing_map, 86400)  # Cache for 1 day
    logger.info(
        "users_sharing_documents_with cache miss for user %s (took %.3fs)",
        user.id,
        time.time() - started_at,
    )
    return sharing_map
|