From d721b97f68e24f87fe8a03a2c831ba193119df39 Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Wed, 13 Aug 2025 06:50:58 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8(backend)=20add=20document=20search=20?= =?UTF-8?q?view?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New API view that calls the indexed documents search view (resource server) of app "Find". Signed-off-by: Fabre Florian --- env.d/development/common | 5 + src/backend/core/api/serializers.py | 14 ++ src/backend/core/api/viewsets.py | 39 ++++- src/backend/core/models.py | 2 +- src/backend/core/services/search_indexers.py | 35 +++-- src/backend/core/tasks/find.py | 3 +- src/backend/core/tests/commands/test_index.py | 13 +- .../documents/test_api_documents_search.py | 137 ++++++++++++++++++ .../tests/test_services_search_indexers.py | 95 +++++++++++- src/backend/impress/settings.py | 3 + 10 files changed, 312 insertions(+), 34 deletions(-) create mode 100644 src/backend/core/tests/documents/test_api_documents_search.py diff --git a/env.d/development/common b/env.d/development/common index de857d5b..6df4a722 100644 --- a/env.d/development/common +++ b/env.d/development/common @@ -49,6 +49,11 @@ LOGOUT_REDIRECT_URL=http://localhost:3000 OIDC_REDIRECT_ALLOWED_HOSTS=["http://localhost:8083", "http://localhost:3000"] OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"} +# Store OIDC tokens in the session +OIDC_STORE_ACCESS_TOKEN = True # Store the access token in the session +OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session +OIDC_STORE_REFRESH_TOKEN_KEY = AnExampleKeyForDevPurposeOnly + # AI AI_FEATURE_ENABLED=true AI_BASE_URL=https://openaiendpoint.com diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py index 47754efe..4d6bf39c 100644 --- a/src/backend/core/api/serializers.py +++ b/src/backend/core/api/serializers.py @@ -1013,3 +1013,17 @@ class ThreadSerializer(serializers.ModelSerializer): if request: return thread.get_abilities(request.user) return {} + + +class FindDocumentSerializer(serializers.Serializer): + """Serializer for Find search requests""" + + q = serializers.CharField(required=True) + + def validate_q(self, value): + """Ensure the text field is not empty.""" + + if len(value.strip()) == 0: + raise serializers.ValidationError("Text field cannot be empty.") + + return value diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index b028b06d..01d4f96a 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -23,6 +23,7 @@ from django.db.models.functions import Greatest, Left, Length from django.http import Http404, StreamingHttpResponse from django.urls import reverse from django.utils import timezone +from django.utils.decorators import method_decorator from django.utils.functional import cached_property from django.utils.text import capfirst, slugify from django.utils.translation import gettext_lazy as _ @@ -33,6 +34,7 @@ from botocore.exceptions import ClientError from csp.constants import NONE from csp.decorators import csp_update from lasuite.malware_detection import malware_detection +from lasuite.oidc_login.decorators import refresh_oidc_access_token from rest_framework import filters, status, viewsets from rest_framework import response as drf_response from rest_framework.permissions import AllowAny @@ -50,6 +52,7 @@ from core.services.converter_services import ( from core.services.converter_services import ( YdocConverter, ) +from core.services.search_indexers import FindDocumentIndexer from core.tasks.mail import send_ask_for_access_mail from core.utils import extract_attachments, filter_descendants @@ -387,6 +390,7 @@ class DocumentViewSet( list_serializer_class = serializers.ListDocumentSerializer trashbin_serializer_class = serializers.ListDocumentSerializer tree_serializer_class = serializers.ListDocumentSerializer + search_serializer_class = serializers.ListDocumentSerializer def get_queryset(self): """Get queryset performing all annotation and filtering on the document tree structure.""" @@ -1078,10 +1082,37 @@ class DocumentViewSet( {"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED ) - # TODO - # @drf.decorators.action(detail=False, methods=["get"]) - # def search(self, request, *args, **kwargs): - # index.search() + @drf.decorators.action(detail=False, methods=["get"], url_path="search") + @method_decorator(refresh_oidc_access_token) + def search(self, request, *args, **kwargs): + """ + Returns a DRF response containing the filtered, annotated and ordered document list. + The filtering allows full text search through the opensearch indexation app "find". + """ + access_token = request.session.get("oidc_access_token") + + serializer = serializers.FindDocumentSerializer(data=request.query_params) + serializer.is_valid(raise_exception=True) + + try: + indexer = FindDocumentIndexer() + queryset = indexer.search( + text=serializer.validated_data.get("q", ""), + user=request.user, + token=access_token, + ) + except RuntimeError: + return drf.response.Response( + {"detail": "The service is not configured properly."}, + status=status.HTTP_401_UNAUTHORIZED, + ) + + return self.get_response_for_queryset( + queryset, + context={ + "request": request, + }, + ) @drf.decorators.action(detail=True, methods=["get"], url_path="versions") def versions_list(self, request, *args, **kwargs): diff --git a/src/backend/core/models.py b/src/backend/core/models.py index cf6e7c06..7fbbd8b7 100644 --- a/src/backend/core/models.py +++ b/src/backend/core/models.py @@ -41,8 +41,8 @@ from .choices import ( RoleChoices, get_equivalent_link_definition, ) -from .validators import sub_validator from .tasks.find import trigger_document_indexer +from .validators import sub_validator logger = getLogger(__name__) diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py index dda2b585..e24ac390 100644 --- a/src/backend/core/services/search_indexers.py +++ b/src/backend/core/services/search_indexers.py @@ -48,19 +48,19 @@ def get_batch_accesses_by_users_and_teams(paths): def get_visited_document_ids_of(user): + """ + Returns the ids of the documents that have a linktrace to the user and NOT owned. + It will be use to limit the opensearch responses to the public documents already + "visited" by the user. + """ if isinstance(user, AnonymousUser): return [] - # TODO : exclude links when user already have a specific access to the doc - qs = models.LinkTrace.objects.filter( - user=user - ).exclude( + qs = models.LinkTrace.objects.filter(user=user).exclude( document__accesses__user=user, ) - return list({ - str(id) for id in qs.values_list("document_id", flat=True) - }) + return list({str(id) for id in qs.values_list("document_id", flat=True)}) class BaseDocumentIndexer(ABC): @@ -129,13 +129,14 @@ class BaseDocumentIndexer(ABC): """ visited_ids = get_visited_document_ids_of(user) - response = self.search_query(data={ - "q": text, - "visited": visited_ids, - "services": ["docs"], - }, token=token) - - print(response) + response = self.search_query( + data={ + "q": text, + "visited": visited_ids, + "services": ["docs"], + }, + token=token, + ) return self.format_response(response) @@ -207,7 +208,7 @@ class FindDocumentIndexer(BaseDocumentIndexer): if not url: raise RuntimeError( - "SEARCH_INDEXER_QUERY_URL must be set in Django settings before indexing." + "SEARCH_INDEXER_QUERY_URL must be set in Django settings before search." ) try: @@ -228,9 +229,7 @@ class FindDocumentIndexer(BaseDocumentIndexer): """ Retrieve documents ids from Find app response and return a queryset. """ - return models.Document.objects.filter(pk__in=[ - d['_id'] for d in data - ]) + return models.Document.objects.filter(pk__in=[d["_id"] for d in data]) def push(self, data): """ diff --git a/src/backend/core/tasks/find.py b/src/backend/core/tasks/find.py index 858c83ee..f65d3d05 100644 --- a/src/backend/core/tasks/find.py +++ b/src/backend/core/tasks/find.py @@ -86,7 +86,8 @@ def trigger_document_indexer(document, on_commit=False): logger.info( "Add task for document %s indexation in %.2f seconds", - document.pk, countdown + document.pk, + countdown, ) # Each time this method is called during the countdown, we increment the diff --git a/src/backend/core/tests/commands/test_index.py b/src/backend/core/tests/commands/test_index.py index 1da70344..a726e712 100644 --- a/src/backend/core/tests/commands/test_index.py +++ b/src/backend/core/tests/commands/test_index.py @@ -21,7 +21,7 @@ def test_index(): with transaction.atomic(): doc = factories.DocumentFactory() - empty_doc = factories.DocumentFactory(title=None, content='') + empty_doc = factories.DocumentFactory(title=None, content="") no_title_doc = factories.DocumentFactory(title=None) factories.UserDocumentAccessFactory(document=doc, user=user) @@ -43,7 +43,10 @@ def test_index(): push_call_args = [call.args[0] for call in mock_push.call_args_list] assert len(push_call_args) == 1 # called once but with a batch of docs - assert sorted(push_call_args[0], key=sortkey) == sorted([ - indexer.serialize_document(doc, accesses), - indexer.serialize_document(no_title_doc, accesses), - ], key=sortkey) + assert sorted(push_call_args[0], key=sortkey) == sorted( + [ + indexer.serialize_document(doc, accesses), + indexer.serialize_document(no_title_doc, accesses), + ], + key=sortkey, + ) diff --git a/src/backend/core/tests/documents/test_api_documents_search.py b/src/backend/core/tests/documents/test_api_documents_search.py new file mode 100644 index 00000000..79f1e6af --- /dev/null +++ b/src/backend/core/tests/documents/test_api_documents_search.py @@ -0,0 +1,137 @@ +""" +Tests for Documents API endpoint in impress's core app: list +""" + +import pytest +import responses +from faker import Faker +from rest_framework.test import APIClient + +from core import factories, models + +fake = Faker() +pytestmark = pytest.mark.django_db + + +@pytest.mark.parametrize("role", models.LinkRoleChoices.values) +@pytest.mark.parametrize("reach", models.LinkReachChoices.values) +@responses.activate +def test_api_documents_search_anonymous(reach, role, settings): + """ + Anonymous users should not be allowed to search documents whatever the + link reach and link role + """ + factories.DocumentFactory(link_reach=reach, link_role=role) + settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + + factories.DocumentFactory(link_reach=reach, link_role=role) + + # Find response + responses.add( + responses.POST, + "http://find/api/v1.0/search", + json=[], + status=200, + ) + + response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"}) + + assert response.status_code == 200 + assert response.json() == { + "count": 0, + "next": None, + "previous": None, + "results": [], + } + + +def test_api_documents_search_endpoint_is_none(settings): + """Missing SEARCH_INDEXER_QUERY_URL should throw an error""" + settings.SEARCH_INDEXER_QUERY_URL = None + + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"}) + + assert response.status_code == 401 + assert response.json() == {"detail": "The service is not configured properly."} + + +@responses.activate +def test_api_documents_search_invalid_params(settings): + """Validate the format of documents as returned by the search view.""" + settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + response = APIClient().get("/api/v1.0/documents/search/") + + assert response.status_code == 400 + assert response.json() == {"q": ["This field is required."]} + + +@responses.activate +def test_api_documents_search_format(settings): + """Validate the format of documents as returned by the search view.""" + settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + user_a, user_b, user_c = factories.UserFactory.create_batch(3) + document = factories.DocumentFactory( + title="alpha", + users=(user_a, user_c), + link_traces=(user, user_b), + ) + access = factories.UserDocumentAccessFactory(document=document, user=user) + + # Find response + responses.add( + responses.POST, + "http://find/api/v1.0/search", + json=[ + {"_id": str(document.pk)}, + ], + status=200, + ) + response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"}) + + assert response.status_code == 200 + content = response.json() + results = content.pop("results") + assert content == { + "count": 1, + "next": None, + "previous": None, + } + assert len(results) == 1 + assert results[0] == { + "id": str(document.id), + "abilities": document.get_abilities(user), + "ancestors_link_reach": None, + "ancestors_link_role": None, + "computed_link_reach": document.computed_link_reach, + "computed_link_role": document.computed_link_role, + "created_at": document.created_at.isoformat().replace("+00:00", "Z"), + "creator": str(document.creator.id), + "depth": 1, + "excerpt": document.excerpt, + "link_reach": document.link_reach, + "link_role": document.link_role, + "nb_accesses_ancestors": 3, + "nb_accesses_direct": 3, + "numchild": 0, + "path": document.path, + "title": document.title, + "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"), + "user_role": access.role, + } diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py index 0d74d845..5f04f3ae 100644 --- a/src/backend/core/tests/test_services_search_indexers.py +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -1,11 +1,17 @@ """Tests for Documents search indexers""" +from functools import partial from unittest.mock import patch +from django.contrib.auth.models import AnonymousUser + import pytest -from core import factories, utils -from core.services.search_indexers import FindDocumentIndexer +from core import factories, models, utils +from core.services.search_indexers import ( + FindDocumentIndexer, + get_visited_document_ids_of, +) pytestmark = pytest.mark.django_db @@ -187,7 +193,6 @@ def test_services_search_indexers_ancestors_link_reach(mock_push): FindDocumentIndexer().index() - seen_doc_ids = set() results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 4 assert results[str(great_grand_parent.id)]["reach"] == "restricted" @@ -207,7 +212,6 @@ def test_services_search_indexers_ancestors_users(mock_push): FindDocumentIndexer().index() - seen_doc_ids = set() results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 3 assert results[str(grand_parent.id)]["users"] == [str(user_gp.sub)] @@ -228,7 +232,6 @@ def test_services_search_indexers_ancestors_teams(mock_push): FindDocumentIndexer().index() - seen_doc_ids = set() results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 3 assert results[str(grand_parent.id)]["groups"] == ["team_gp"] @@ -258,3 +261,85 @@ def test_push_uses_correct_url_and_data(mock_post, settings): assert args[0] == settings.SEARCH_INDEXER_URL assert kwargs.get("json") == sample_data assert kwargs.get("timeout") == 10 + + +def test_get_visited_document_ids_of(): + """ + get_visited_document_ids_of() returns the ids of the documents viewed + by the user BUT without specific access configuration (like public ones) + """ + user = factories.UserFactory() + other = factories.UserFactory() + anonymous = AnonymousUser() + + assert not get_visited_document_ids_of(anonymous) + assert not get_visited_document_ids_of(user) + + doc1, doc2, _ = factories.DocumentFactory.create_batch(3) + + create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False) + + create_link(document=doc1) + create_link(document=doc2) + + # The third document is not visited + assert sorted(get_visited_document_ids_of(user)) == sorted( + [str(doc1.pk), str(doc2.pk)] + ) + + factories.UserDocumentAccessFactory(user=other, document=doc1) + factories.UserDocumentAccessFactory(user=user, document=doc2) + + # The second document have an access for the user + assert get_visited_document_ids_of(user) == [str(doc1.pk)] + + +@patch("requests.post") +def test_services_search_indexers_search(mock_post, settings): + """ + search() should call requests.post to SEARCH_INDEXER_QUERY_URL with the + document ids from linktraces. + """ + user = factories.UserFactory() + indexer = FindDocumentIndexer() + + mock_response = mock_post.return_value + mock_response.raise_for_status.return_value = None # No error + + doc1, doc2, _ = factories.DocumentFactory.create_batch(3) + + create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False) + + create_link(document=doc1) + create_link(document=doc2) + + indexer.search("alpha", user=user, token="mytoken") + + args, kwargs = mock_post.call_args + + assert args[0] == settings.SEARCH_INDEXER_QUERY_URL + + query_data = kwargs.get("json") + assert query_data["q"] == "alpha" + assert sorted(query_data["visited"]) == sorted([str(doc1.pk), str(doc2.pk)]) + assert query_data["services"] == ["docs"] + + assert kwargs.get("headers") == {"Authorization": "Bearer mytoken"} + assert kwargs.get("timeout") == 10 + + +def test_search_query_raises_error_if_search_endpoint_is_none(settings): + """ + Indexer should raise RuntimeError if SEARCH_INDEXER_QUERY_URL is None or empty. + """ + settings.SEARCH_INDEXER_QUERY_URL = None + indexer = FindDocumentIndexer() + user = factories.UserFactory() + + with pytest.raises(RuntimeError) as exc_info: + indexer.search("alpha", user=user, token="mytoken") + + assert ( + "SEARCH_INDEXER_QUERY_URL must be set in Django settings before search." + in str(exc_info.value) + ) diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py index d0b63774..3232c246 100755 --- a/src/backend/impress/settings.py +++ b/src/backend/impress/settings.py @@ -109,6 +109,9 @@ class Base(Configuration): SEARCH_INDEXER_SECRET = values.Value( default=None, environ_name="SEARCH_INDEXER_SECRET", environ_prefix=None ) + SEARCH_INDEXER_QUERY_URL = values.Value( + default=None, environ_name="SEARCH_INDEXER_QUERY_URL", environ_prefix=None + ) # Static files (CSS, JavaScript, Images) STATIC_URL = "/static/"