✨(backend) add document search view

New API view that calls the indexed documents search view (resource server) of app "Find". Signed-off-by: Fabre Florian <ffabre@hybird.org>
2025-08-13 06:50:58 +02:00
parent 3228f65092
commit d721b97f68
10 changed files with 312 additions and 34 deletions
--- a/env.d/development/common
+++ b/env.d/development/common
@@ -49,6 +49,11 @@ LOGOUT_REDIRECT_URL=http://localhost:3000
 OIDC_REDIRECT_ALLOWED_HOSTS=["http://localhost:8083", "http://localhost:3000"]
 OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"}

+# Store OIDC tokens in the session
+OIDC_STORE_ACCESS_TOKEN = True  # Store the access token in the session
+OIDC_STORE_REFRESH_TOKEN = True  # Store the encrypted refresh token in the session
+OIDC_STORE_REFRESH_TOKEN_KEY = AnExampleKeyForDevPurposeOnly
+
 # AI
 AI_FEATURE_ENABLED=true
 AI_BASE_URL=https://openaiendpoint.com
--- a/src/backend/core/api/serializers.py
+++ b/src/backend/core/api/serializers.py
@@ -1013,3 +1013,17 @@ class ThreadSerializer(serializers.ModelSerializer):
        if request:
            return thread.get_abilities(request.user)
        return {}
+
+
+class FindDocumentSerializer(serializers.Serializer):
+    """Serializer for Find search requests"""
+
+    q = serializers.CharField(required=True)
+
+    def validate_q(self, value):
+        """Ensure the text field is not empty."""
+
+        if len(value.strip()) == 0:
+            raise serializers.ValidationError("Text field cannot be empty.")
+
+        return value
--- a/src/backend/core/api/viewsets.py
+++ b/src/backend/core/api/viewsets.py
@@ -23,6 +23,7 @@ from django.db.models.functions import Greatest, Left, Length
 from django.http import Http404, StreamingHttpResponse
 from django.urls import reverse
 from django.utils import timezone
+from django.utils.decorators import method_decorator
 from django.utils.functional import cached_property
 from django.utils.text import capfirst, slugify
 from django.utils.translation import gettext_lazy as _
@@ -33,6 +34,7 @@ from botocore.exceptions import ClientError
 from csp.constants import NONE
 from csp.decorators import csp_update
 from lasuite.malware_detection import malware_detection
+from lasuite.oidc_login.decorators import refresh_oidc_access_token
 from rest_framework import filters, status, viewsets
 from rest_framework import response as drf_response
 from rest_framework.permissions import AllowAny
@@ -50,6 +52,7 @@ from core.services.converter_services import (
 from core.services.converter_services import (
    YdocConverter,
 )
+from core.services.search_indexers import FindDocumentIndexer
 from core.tasks.mail import send_ask_for_access_mail
 from core.utils import extract_attachments, filter_descendants

@@ -387,6 +390,7 @@ class DocumentViewSet(
    list_serializer_class = serializers.ListDocumentSerializer
    trashbin_serializer_class = serializers.ListDocumentSerializer
    tree_serializer_class = serializers.ListDocumentSerializer
+    search_serializer_class = serializers.ListDocumentSerializer

    def get_queryset(self):
        """Get queryset performing all annotation and filtering on the document tree structure."""
@@ -1078,10 +1082,37 @@ class DocumentViewSet(
            {"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED
        )

-    # TODO
-    # @drf.decorators.action(detail=False, methods=["get"])
-    # def search(self, request, *args, **kwargs):
-    #    index.search()
+    @drf.decorators.action(detail=False, methods=["get"], url_path="search")
+    @method_decorator(refresh_oidc_access_token)
+    def search(self, request, *args, **kwargs):
+        """
+        Returns a DRF response containing the filtered, annotated and ordered document list.
+        The filtering allows full text search through the opensearch indexation app "find".
+        """
+        access_token = request.session.get("oidc_access_token")
+
+        serializer = serializers.FindDocumentSerializer(data=request.query_params)
+        serializer.is_valid(raise_exception=True)
+
+        try:
+            indexer = FindDocumentIndexer()
+            queryset = indexer.search(
+                text=serializer.validated_data.get("q", ""),
+                user=request.user,
+                token=access_token,
+            )
+        except RuntimeError:
+            return drf.response.Response(
+                {"detail": "The service is not configured properly."},
+                status=status.HTTP_401_UNAUTHORIZED,
+            )
+
+        return self.get_response_for_queryset(
+            queryset,
+            context={
+                "request": request,
+            },
+        )

    @drf.decorators.action(detail=True, methods=["get"], url_path="versions")
    def versions_list(self, request, *args, **kwargs):
--- a/src/backend/core/models.py
+++ b/src/backend/core/models.py
@@ -41,8 +41,8 @@ from .choices import (
    RoleChoices,
    get_equivalent_link_definition,
 )
-from .validators import sub_validator
 from .tasks.find import trigger_document_indexer
+from .validators import sub_validator

 logger = getLogger(__name__)

--- a/src/backend/core/services/search_indexers.py
+++ b/src/backend/core/services/search_indexers.py
@@ -48,19 +48,19 @@ def get_batch_accesses_by_users_and_teams(paths):


 def get_visited_document_ids_of(user):
+    """
+    Returns the ids of the documents that have a linktrace to the user and NOT owned.
+    It will be use to limit the opensearch responses to the public documents already
+    "visited" by the user.
+    """
    if isinstance(user, AnonymousUser):
        return []

-    # TODO : exclude links when user already have a specific access to the doc
-    qs = models.LinkTrace.objects.filter(
-        user=user
-    ).exclude(
+    qs = models.LinkTrace.objects.filter(user=user).exclude(
        document__accesses__user=user,
    )

-    return list({
-        str(id) for id in qs.values_list("document_id", flat=True)
-    })
+    return list({str(id) for id in qs.values_list("document_id", flat=True)})


 class BaseDocumentIndexer(ABC):
@@ -129,13 +129,14 @@ class BaseDocumentIndexer(ABC):
        """
        visited_ids = get_visited_document_ids_of(user)

-        response = self.search_query(data={
-            "q": text,
-            "visited": visited_ids,
-            "services": ["docs"],
-        }, token=token)
-
-        print(response)
+        response = self.search_query(
+            data={
+                "q": text,
+                "visited": visited_ids,
+                "services": ["docs"],
+            },
+            token=token,
+        )

        return self.format_response(response)

@@ -207,7 +208,7 @@ class FindDocumentIndexer(BaseDocumentIndexer):

        if not url:
            raise RuntimeError(
-                "SEARCH_INDEXER_QUERY_URL must be set in Django settings before indexing."
+                "SEARCH_INDEXER_QUERY_URL must be set in Django settings before search."
            )

        try:
@@ -228,9 +229,7 @@ class FindDocumentIndexer(BaseDocumentIndexer):
        """
        Retrieve documents ids from Find app response and return a queryset.
        """
-        return models.Document.objects.filter(pk__in=[
-            d['_id'] for d in data
-        ])
+        return models.Document.objects.filter(pk__in=[d["_id"] for d in data])

    def push(self, data):
        """
--- a/src/backend/core/tasks/find.py
+++ b/src/backend/core/tasks/find.py
@@ -86,7 +86,8 @@ def trigger_document_indexer(document, on_commit=False):

        logger.info(
            "Add task for document %s indexation in %.2f seconds",
-            document.pk, countdown
+            document.pk,
+            countdown,
        )

        # Each time this method is called during the countdown, we increment the
--- a/src/backend/core/tests/commands/test_index.py
+++ b/src/backend/core/tests/commands/test_index.py
@@ -21,7 +21,7 @@ def test_index():

    with transaction.atomic():
        doc = factories.DocumentFactory()
-        empty_doc = factories.DocumentFactory(title=None, content='')
+        empty_doc = factories.DocumentFactory(title=None, content="")
        no_title_doc = factories.DocumentFactory(title=None)

        factories.UserDocumentAccessFactory(document=doc, user=user)
@@ -43,7 +43,10 @@ def test_index():
        push_call_args = [call.args[0] for call in mock_push.call_args_list]

        assert len(push_call_args) == 1  # called once but with a batch of docs
-        assert sorted(push_call_args[0], key=sortkey) == sorted([
-            indexer.serialize_document(doc, accesses),
-            indexer.serialize_document(no_title_doc, accesses),
-        ], key=sortkey)
+        assert sorted(push_call_args[0], key=sortkey) == sorted(
+            [
+                indexer.serialize_document(doc, accesses),
+                indexer.serialize_document(no_title_doc, accesses),
+            ],
+            key=sortkey,
+        )
--- a/src/backend/core/tests/documents/test_api_documents_search.py
+++ b/src/backend/core/tests/documents/test_api_documents_search.py
@@ -0,0 +1,137 @@
+"""
+Tests for Documents API endpoint in impress's core app: list
+"""
+
+import pytest
+import responses
+from faker import Faker
+from rest_framework.test import APIClient
+
+from core import factories, models
+
+fake = Faker()
+pytestmark = pytest.mark.django_db
+
+
+@pytest.mark.parametrize("role", models.LinkRoleChoices.values)
+@pytest.mark.parametrize("reach", models.LinkReachChoices.values)
+@responses.activate
+def test_api_documents_search_anonymous(reach, role, settings):
+    """
+    Anonymous users should not be allowed to search documents whatever the
+    link reach and link role
+    """
+    factories.DocumentFactory(link_reach=reach, link_role=role)
+    settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"
+
+    factories.DocumentFactory(link_reach=reach, link_role=role)
+
+    # Find response
+    responses.add(
+        responses.POST,
+        "http://find/api/v1.0/search",
+        json=[],
+        status=200,
+    )
+
+    response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"})
+
+    assert response.status_code == 200
+    assert response.json() == {
+        "count": 0,
+        "next": None,
+        "previous": None,
+        "results": [],
+    }
+
+
+def test_api_documents_search_endpoint_is_none(settings):
+    """Missing SEARCH_INDEXER_QUERY_URL should throw an error"""
+    settings.SEARCH_INDEXER_QUERY_URL = None
+
+    user = factories.UserFactory()
+
+    client = APIClient()
+    client.force_login(user)
+
+    response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"})
+
+    assert response.status_code == 401
+    assert response.json() == {"detail": "The service is not configured properly."}
+
+
+@responses.activate
+def test_api_documents_search_invalid_params(settings):
+    """Validate the format of documents as returned by the search view."""
+    settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"
+
+    user = factories.UserFactory()
+
+    client = APIClient()
+    client.force_login(user)
+
+    response = APIClient().get("/api/v1.0/documents/search/")
+
+    assert response.status_code == 400
+    assert response.json() == {"q": ["This field is required."]}
+
+
+@responses.activate
+def test_api_documents_search_format(settings):
+    """Validate the format of documents as returned by the search view."""
+    settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"
+
+    user = factories.UserFactory()
+
+    client = APIClient()
+    client.force_login(user)
+
+    user_a, user_b, user_c = factories.UserFactory.create_batch(3)
+    document = factories.DocumentFactory(
+        title="alpha",
+        users=(user_a, user_c),
+        link_traces=(user, user_b),
+    )
+    access = factories.UserDocumentAccessFactory(document=document, user=user)
+
+    # Find response
+    responses.add(
+        responses.POST,
+        "http://find/api/v1.0/search",
+        json=[
+            {"_id": str(document.pk)},
+        ],
+        status=200,
+    )
+    response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"})
+
+    assert response.status_code == 200
+    content = response.json()
+    results = content.pop("results")
+    assert content == {
+        "count": 1,
+        "next": None,
+        "previous": None,
+    }
+    assert len(results) == 1
+    assert results[0] == {
+        "id": str(document.id),
+        "abilities": document.get_abilities(user),
+        "ancestors_link_reach": None,
+        "ancestors_link_role": None,
+        "computed_link_reach": document.computed_link_reach,
+        "computed_link_role": document.computed_link_role,
+        "created_at": document.created_at.isoformat().replace("+00:00", "Z"),
+        "creator": str(document.creator.id),
+        "depth": 1,
+        "excerpt": document.excerpt,
+        "link_reach": document.link_reach,
+        "link_role": document.link_role,
+        "nb_accesses_ancestors": 3,
+        "nb_accesses_direct": 3,
+        "numchild": 0,
+        "path": document.path,
+        "title": document.title,
+        "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"),
+        "user_role": access.role,
+    }
--- a/src/backend/core/tests/test_services_search_indexers.py
+++ b/src/backend/core/tests/test_services_search_indexers.py
@@ -1,11 +1,17 @@
 """Tests for Documents search indexers"""

+from functools import partial
 from unittest.mock import patch

+from django.contrib.auth.models import AnonymousUser
+
 import pytest

-from core import factories, utils
-from core.services.search_indexers import FindDocumentIndexer
+from core import factories, models, utils
+from core.services.search_indexers import (
+    FindDocumentIndexer,
+    get_visited_document_ids_of,
+)

 pytestmark = pytest.mark.django_db

@@ -187,7 +193,6 @@ def test_services_search_indexers_ancestors_link_reach(mock_push):

    FindDocumentIndexer().index()

-    seen_doc_ids = set()
    results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
    assert len(results) == 4
    assert results[str(great_grand_parent.id)]["reach"] == "restricted"
@@ -207,7 +212,6 @@ def test_services_search_indexers_ancestors_users(mock_push):

    FindDocumentIndexer().index()

-    seen_doc_ids = set()
    results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
    assert len(results) == 3
    assert results[str(grand_parent.id)]["users"] == [str(user_gp.sub)]
@@ -228,7 +232,6 @@ def test_services_search_indexers_ancestors_teams(mock_push):

    FindDocumentIndexer().index()

-    seen_doc_ids = set()
    results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
    assert len(results) == 3
    assert results[str(grand_parent.id)]["groups"] == ["team_gp"]
@@ -258,3 +261,85 @@ def test_push_uses_correct_url_and_data(mock_post, settings):
    assert args[0] == settings.SEARCH_INDEXER_URL
    assert kwargs.get("json") == sample_data
    assert kwargs.get("timeout") == 10
+
+
+def test_get_visited_document_ids_of():
+    """
+    get_visited_document_ids_of() returns the ids of the documents viewed
+    by the user BUT without specific access configuration (like public ones)
+    """
+    user = factories.UserFactory()
+    other = factories.UserFactory()
+    anonymous = AnonymousUser()
+
+    assert not get_visited_document_ids_of(anonymous)
+    assert not get_visited_document_ids_of(user)
+
+    doc1, doc2, _ = factories.DocumentFactory.create_batch(3)
+
+    create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False)
+
+    create_link(document=doc1)
+    create_link(document=doc2)
+
+    # The third document is not visited
+    assert sorted(get_visited_document_ids_of(user)) == sorted(
+        [str(doc1.pk), str(doc2.pk)]
+    )
+
+    factories.UserDocumentAccessFactory(user=other, document=doc1)
+    factories.UserDocumentAccessFactory(user=user, document=doc2)
+
+    # The second document have an access for the user
+    assert get_visited_document_ids_of(user) == [str(doc1.pk)]
+
+
+@patch("requests.post")
+def test_services_search_indexers_search(mock_post, settings):
+    """
+    search() should call requests.post to SEARCH_INDEXER_QUERY_URL with the
+    document ids from linktraces.
+    """
+    user = factories.UserFactory()
+    indexer = FindDocumentIndexer()
+
+    mock_response = mock_post.return_value
+    mock_response.raise_for_status.return_value = None  # No error
+
+    doc1, doc2, _ = factories.DocumentFactory.create_batch(3)
+
+    create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False)
+
+    create_link(document=doc1)
+    create_link(document=doc2)
+
+    indexer.search("alpha", user=user, token="mytoken")
+
+    args, kwargs = mock_post.call_args
+
+    assert args[0] == settings.SEARCH_INDEXER_QUERY_URL
+
+    query_data = kwargs.get("json")
+    assert query_data["q"] == "alpha"
+    assert sorted(query_data["visited"]) == sorted([str(doc1.pk), str(doc2.pk)])
+    assert query_data["services"] == ["docs"]
+
+    assert kwargs.get("headers") == {"Authorization": "Bearer mytoken"}
+    assert kwargs.get("timeout") == 10
+
+
+def test_search_query_raises_error_if_search_endpoint_is_none(settings):
+    """
+    Indexer should raise RuntimeError if SEARCH_INDEXER_QUERY_URL is None or empty.
+    """
+    settings.SEARCH_INDEXER_QUERY_URL = None
+    indexer = FindDocumentIndexer()
+    user = factories.UserFactory()
+
+    with pytest.raises(RuntimeError) as exc_info:
+        indexer.search("alpha", user=user, token="mytoken")
+
+    assert (
+        "SEARCH_INDEXER_QUERY_URL must be set in Django settings before search."
+        in str(exc_info.value)
+    )
--- a/src/backend/impress/settings.py
+++ b/src/backend/impress/settings.py
@@ -109,6 +109,9 @@ class Base(Configuration):
    SEARCH_INDEXER_SECRET = values.Value(
        default=None, environ_name="SEARCH_INDEXER_SECRET", environ_prefix=None
    )
+    SEARCH_INDEXER_QUERY_URL = values.Value(
+        default=None, environ_name="SEARCH_INDEXER_QUERY_URL", environ_prefix=None
+    )

    # Static files (CSS, JavaScript, Images)
    STATIC_URL = "/static/"