✨(backend) add async triggers to enable document indexation with find

On document content or permission changes, start a Celery job that calls the indexation API of the "Find" app.

Signed-off-by: Fabre Florian <ffabre@hybird.org>
2025-08-06 17:35:38 +02:00
parent 1d9c2a8118
commit 72238c1ab6
9 changed files with 381 additions and 20 deletions
--- a/src/backend/core/services/search_indexers.py
+++ b/src/backend/core/services/search_indexers.py
@@ -5,6 +5,7 @@ from abc import ABC, abstractmethod
 from collections import defaultdict

 from django.conf import settings
+from django.contrib.auth.models import AnonymousUser

 import requests

@@ -18,11 +19,13 @@ def get_batch_accesses_by_users_and_teams(paths):
    Get accesses related to a list of document paths,
    grouped by users and teams, including all ancestor paths.
    """
-    print("paths: ", paths)
-    ancestor_map = utils.get_ancestor_to_descendants_map(paths, steplen=models.Document.steplen)
+    # print("paths: ", paths)
+    ancestor_map = utils.get_ancestor_to_descendants_map(
+        paths, steplen=models.Document.steplen
+    )
    ancestor_paths = list(ancestor_map.keys())
-    print("ancestor map: ", ancestor_map)
-    print("ancestor paths: ", ancestor_paths)
+    # print("ancestor map: ", ancestor_map)
+    # print("ancestor paths: ", ancestor_paths)

    access_qs = models.DocumentAccess.objects.filter(
        document__path__in=ancestor_paths
@@ -44,6 +47,22 @@ def get_batch_accesses_by_users_and_teams(paths):
    return dict(access_by_document_path)


+def get_visited_document_ids_of(user):
+    if isinstance(user, AnonymousUser):
+        return []
+
+    # TODO: exclude links when the user already has specific access to the doc
+    qs = models.LinkTrace.objects.filter(
+        user=user
+    ).exclude(
+        document__accesses__user=user,
+    )
+
+    return list({
+        str(id) for id in qs.values_list("document_id", flat=True)
+    })
+
+
 class BaseDocumentIndexer(ABC):
    """
    Base class for document indexers.
@@ -84,6 +103,7 @@ class BaseDocumentIndexer(ABC):
            serialized_batch = [
                self.serialize_document(document, accesses_by_document_path)
                for document in documents_batch
+                if document.content
            ]
            self.push(serialized_batch)

@@ -103,6 +123,38 @@ class BaseDocumentIndexer(ABC):
        Must be implemented by subclasses.
        """

+    def search(self, text, user, token):
+        """
+        Search for documents in the Find app.
+        """
+        visited_ids = get_visited_document_ids_of(user)
+
+        response = self.search_query(data={
+            "q": text,
+            "visited": visited_ids,
+            "services": ["docs"],
+        }, token=token)
+
+        print(response)
+
+        return self.format_response(response)
+
+    @abstractmethod
+    def search_query(self, data, token) -> dict:
+        """
+        Retrieve documents from the Find app API.
+
+        Must be implemented by subclasses.
+        """
+
+    @abstractmethod
+    def format_response(self, data: dict):
+        """
+        Convert the JSON response from the Find app into a document queryset.
+
+        Must be implemented by subclasses.
+        """
+

 class FindDocumentIndexer(BaseDocumentIndexer):
    """
@@ -121,10 +173,12 @@ class FindDocumentIndexer(BaseDocumentIndexer):
            dict: A JSON-serializable dictionary.
        """
        doc_path = document.path
-        text_content = utils.base64_yjs_to_text(document.content)
+        doc_content = document.content
+        text_content = utils.base64_yjs_to_text(doc_content) if doc_content else ""
+
        return {
            "id": str(document.id),
-            "title": document.title,
+            "title": document.title or "",
            "content": text_content,
            "depth": document.depth,
            "path": document.path,
@@ -138,6 +192,46 @@ class FindDocumentIndexer(BaseDocumentIndexer):
            "is_active": not bool(document.ancestors_deleted_at),
        }

+    def search_query(self, data, token) -> requests.Response:
+        """
+        Retrieve documents from the Find app API.
+
+        Args:
+            data (dict): search data
+            token (str): OIDC token
+
+        Returns:
+            dict: A JSON-serializable dictionary.
+        """
+        url = getattr(settings, "SEARCH_INDEXER_QUERY_URL", None)
+
+        if not url:
+            raise RuntimeError(
+                "SEARCH_INDEXER_QUERY_URL must be set in Django settings before indexing."
+            )
+
+        try:
+            response = requests.post(
+                url,
+                json=data,
+                headers={"Authorization": f"Bearer {token}"},
+                timeout=10,
+            )
+            response.raise_for_status()
+            return response.json()
+        except requests.exceptions.HTTPError as e:
+            logger.error("HTTPError: %s", e)
+            logger.error("Response content: %s", response.text)  # type: ignore
+            raise
+
+    def format_response(self, data: dict):
+        """
+        Retrieve document ids from the Find app response and return a queryset.
+        """
+        return models.Document.objects.filter(pk__in=[
+            d['_id'] for d in data
+        ])
+
    def push(self, data):
        """
        Push a batch of documents to the Find backend.
@@ -156,6 +250,7 @@ class FindDocumentIndexer(BaseDocumentIndexer):
            raise RuntimeError(
                "SEARCH_INDEXER_SECRET must be set in Django settings before indexing."
            )
+
        try:
            response = requests.post(
                url,