diff --git a/env.d/development/common b/env.d/development/common index 6df4a722..eb5b0c54 100644 --- a/env.d/development/common +++ b/env.d/development/common @@ -52,7 +52,7 @@ OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"} # Store OIDC tokens in the session OIDC_STORE_ACCESS_TOKEN = True # Store the access token in the session OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session -OIDC_STORE_REFRESH_TOKEN_KEY = AnExampleKeyForDevPurposeOnly +OIDC_STORE_REFRESH_TOKEN_KEY = ThisIsAnExampleKeyForDevPurposeOnly # AI AI_FEATURE_ENABLED=true diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py index 4d6bf39c..932b5a81 100644 --- a/src/backend/core/api/serializers.py +++ b/src/backend/core/api/serializers.py @@ -1018,12 +1018,4 @@ class ThreadSerializer(serializers.ModelSerializer): class FindDocumentSerializer(serializers.Serializer): """Serializer for Find search requests""" - q = serializers.CharField(required=True) - - def validate_q(self, value): - """Ensure the text field is not empty.""" - - if len(value.strip()) == 0: - raise serializers.ValidationError("Text field cannot be empty.") - - return value + q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True) diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index 01d4f96a..2a5fc50c 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -13,7 +13,7 @@ from django.conf import settings from django.contrib.postgres.aggregates import ArrayAgg from django.contrib.postgres.search import TrigramSimilarity from django.core.cache import cache -from django.core.exceptions import ValidationError +from django.core.exceptions import ImproperlyConfigured, ValidationError from django.core.files.storage import default_storage from django.core.validators import URLValidator from django.db import connection, transaction @@ -52,7 +52,7 @@ from core.services.converter_services import ( from core.services.converter_services import ( YdocConverter, ) -from core.services.search_indexers import FindDocumentIndexer +from core.services.search_indexers import get_document_indexer_class from core.tasks.mail import send_ask_for_access_mail from core.utils import extract_attachments, filter_descendants @@ -1095,15 +1095,15 @@ class DocumentViewSet( serializer.is_valid(raise_exception=True) try: - indexer = FindDocumentIndexer() + indexer = get_document_indexer_class()() queryset = indexer.search( text=serializer.validated_data.get("q", ""), user=request.user, token=access_token, ) - except RuntimeError: + except ImproperlyConfigured: return drf.response.Response( - {"detail": "The service is not configured properly."}, + {"detail": "The service is not properly configured."}, status=status.HTTP_401_UNAUTHORIZED, ) diff --git a/src/backend/core/models.py b/src/backend/core/models.py index 7fbbd8b7..c05c1a83 100644 --- a/src/backend/core/models.py +++ b/src/backend/core/models.py @@ -956,7 +956,7 @@ class Document(MP_Node, BaseModel): @receiver(signals.post_save, sender=Document) -def document_post_save(sender, instance, **kwargs): +def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-argument """ Asynchronous call to the document indexer at the end of the transaction. Note : Within the transaction we can have an empty content and a serialization @@ -1196,7 +1196,7 @@ class DocumentAccess(BaseAccess): @receiver(signals.post_save, sender=DocumentAccess) -def document_access_post_save(sender, instance, created, **kwargs): +def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument """ Asynchronous call to the document indexer at the end of the transaction. """ diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py index e24ac390..48a437be 100644 --- a/src/backend/core/services/search_indexers.py +++ b/src/backend/core/services/search_indexers.py @@ -3,9 +3,12 @@ import logging from abc import ABC, abstractmethod from collections import defaultdict +from functools import cache from django.conf import settings from django.contrib.auth.models import AnonymousUser +from django.core.exceptions import ImproperlyConfigured +from django.utils.module_loading import import_string import requests @@ -14,18 +17,33 @@ from core import models, utils logger = logging.getLogger(__name__) +@cache +def get_document_indexer_class() -> "BaseDocumentIndexer": + """Return the indexer backend class based on the settings.""" + classpath = settings.SEARCH_INDEXER_CLASS + + if not classpath: + raise ImproperlyConfigured( + "SEARCH_INDEXER_CLASS must be set in Django settings." + ) + + try: + return import_string(settings.SEARCH_INDEXER_CLASS) + except ImportError as err: + raise ImproperlyConfigured( + f"SEARCH_INDEXER_CLASS setting is not valid : {err}" + ) from err + + def get_batch_accesses_by_users_and_teams(paths): """ Get accesses related to a list of document paths, grouped by users and teams, including all ancestor paths. """ - # print("paths: ", paths) ancestor_map = utils.get_ancestor_to_descendants_map( paths, steplen=models.Document.steplen ) ancestor_paths = list(ancestor_map.keys()) - # print("ancestor map: ", ancestor_map) - # print("ancestor paths: ", ancestor_paths) access_qs = models.DocumentAccess.objects.filter( document__path__in=ancestor_paths @@ -80,6 +98,24 @@ class BaseDocumentIndexer(ABC): Defaults to settings.SEARCH_INDEXER_BATCH_SIZE. """ self.batch_size = batch_size or settings.SEARCH_INDEXER_BATCH_SIZE + self.indexer_url = settings.SEARCH_INDEXER_URL + self.indexer_secret = settings.SEARCH_INDEXER_SECRET + self.search_url = settings.SEARCH_INDEXER_QUERY_URL + + if not self.indexer_url: + raise ImproperlyConfigured( + "SEARCH_INDEXER_URL must be set in Django settings." + ) + + if not self.indexer_secret: + raise ImproperlyConfigured( + "SEARCH_INDEXER_SECRET must be set in Django settings." + ) + + if not self.search_url: + raise ImproperlyConfigured( + "SEARCH_INDEXER_QUERY_URL must be set in Django settings." + ) def index(self): """ @@ -143,7 +179,7 @@ class BaseDocumentIndexer(ABC): @abstractmethod def search_query(self, data, token) -> dict: """ - Retreive documents from the Find app API. + Retrieve documents from the Find app API. Must be implemented by subclasses. """ @@ -204,16 +240,9 @@ class FindDocumentIndexer(BaseDocumentIndexer): Returns: dict: A JSON-serializable dictionary. """ - url = getattr(settings, "SEARCH_INDEXER_QUERY_URL", None) - - if not url: - raise RuntimeError( - "SEARCH_INDEXER_QUERY_URL must be set in Django settings before search." - ) - try: response = requests.post( - url, + self.search_url, json=data, headers={"Authorization": f"Bearer {token}"}, timeout=10, @@ -222,7 +251,6 @@ class FindDocumentIndexer(BaseDocumentIndexer): return response.json() except requests.exceptions.HTTPError as e: logger.error("HTTPError: %s", e) - logger.error("Response content: %s", response.text) # type: ignore raise def format_response(self, data: dict): @@ -238,27 +266,15 @@ class FindDocumentIndexer(BaseDocumentIndexer): Args: data (list): List of document dictionaries. """ - url = getattr(settings, "SEARCH_INDEXER_URL", None) - if not url: - raise RuntimeError( - "SEARCH_INDEXER_URL must be set in Django settings before indexing." - ) - - secret = getattr(settings, "SEARCH_INDEXER_SECRET", None) - if not secret: - raise RuntimeError( - "SEARCH_INDEXER_SECRET must be set in Django settings before indexing." - ) try: response = requests.post( - url, + self.indexer_url, json=data, - headers={"Authorization": f"Bearer {secret}"}, + headers={"Authorization": f"Bearer {self.indexer_secret}"}, timeout=10, ) response.raise_for_status() except requests.exceptions.HTTPError as e: logger.error("HTTPError: %s", e) - logger.error("Response content: %s", response.text) raise diff --git a/src/backend/core/tasks/find.py b/src/backend/core/tasks/find.py index f65d3d05..4a58b830 100644 --- a/src/backend/core/tasks/find.py +++ b/src/backend/core/tasks/find.py @@ -1,5 +1,6 @@ """Trigger document indexation using celery task.""" +from functools import partial from logging import getLogger from django.conf import settings @@ -8,8 +9,8 @@ from django.db import transaction from core import models from core.services.search_indexers import ( - FindDocumentIndexer, get_batch_accesses_by_users_and_teams, + get_document_indexer_class, ) from impress.celery_app import app @@ -52,7 +53,7 @@ def document_indexer_task(document_id): return doc = models.Document.objects.get(pk=document_id) - indexer = FindDocumentIndexer() + indexer = get_document_indexer_class()() accesses = get_batch_accesses_by_users_and_teams((doc.path,)) data = indexer.serialize_document(document=doc, accesses=accesses) @@ -75,11 +76,9 @@ def trigger_document_indexer(document, on_commit=False): pass if on_commit: - - def _aux(): - trigger_document_indexer(document, on_commit=False) - - transaction.on_commit(_aux) + transaction.on_commit( + partial(trigger_document_indexer, document, on_commit=False) + ) else: key = document_indexer_debounce_key(document.pk) countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1) diff --git a/src/backend/core/tests/commands/test_index.py b/src/backend/core/tests/commands/test_index.py index a726e712..d30e0118 100644 --- a/src/backend/core/tests/commands/test_index.py +++ b/src/backend/core/tests/commands/test_index.py @@ -2,6 +2,7 @@ Unit test for `index` command. """ +from operator import itemgetter from unittest import mock from django.core.management import call_command @@ -34,19 +35,18 @@ def test_index(): str(no_title_doc.path): {"users": [user.sub]}, } - def sortkey(d): - return d["id"] - with mock.patch.object(FindDocumentIndexer, "push") as mock_push: call_command("index") push_call_args = [call.args[0] for call in mock_push.call_args_list] - assert len(push_call_args) == 1 # called once but with a batch of docs - assert sorted(push_call_args[0], key=sortkey) == sorted( + # called once but with a batch of docs + mock_push.assert_called_once() + + assert sorted(push_call_args[0], key=itemgetter("id")) == sorted( [ indexer.serialize_document(doc, accesses), indexer.serialize_document(no_title_doc, accesses), ], - key=sortkey, + key=itemgetter("id"), ) diff --git a/src/backend/core/tests/documents/test_api_documents_search.py b/src/backend/core/tests/documents/test_api_documents_search.py index 79f1e6af..f5dcfa24 100644 --- a/src/backend/core/tests/documents/test_api_documents_search.py +++ b/src/backend/core/tests/documents/test_api_documents_search.py @@ -57,7 +57,7 @@ def test_api_documents_search_endpoint_is_none(settings): response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"}) assert response.status_code == 401 - assert response.json() == {"detail": "The service is not configured properly."} + assert response.json() == {"detail": "The service is not properly configured."} @responses.activate @@ -75,6 +75,11 @@ def test_api_documents_search_invalid_params(settings): assert response.status_code == 400 assert response.json() == {"q": ["This field is required."]} + response = APIClient().get("/api/v1.0/documents/search/", data={"q": " "}) + + assert response.status_code == 400 + assert response.json() == {"q": ["This field may not be blank."]} + @responses.activate def test_api_documents_search_format(settings): diff --git a/src/backend/core/tests/test_models_documents.py b/src/backend/core/tests/test_models_documents.py index cbe01f83..7931eacd 100644 --- a/src/backend/core/tests/test_models_documents.py +++ b/src/backend/core/tests/test_models_documents.py @@ -3,6 +3,7 @@ Unit tests for the Document model """ # pylint: disable=too-many-lines +from operator import itemgetter import random import smtplib import time @@ -1638,7 +1639,7 @@ def test_models_documents_post_save_indexer(mock_push, settings): factories.UserDocumentAccessFactory(document=doc2, user=user) factories.UserDocumentAccessFactory(document=doc3, user=user) - time.sleep(0.1) # waits for the end of the tasks + time.sleep(0.2) # waits for the end of the tasks accesses = { str(doc1.path): {"users": [user.sub]}, @@ -1650,16 +1651,13 @@ def test_models_documents_post_save_indexer(mock_push, settings): indexer = FindDocumentIndexer() - def sortkey(d): - return d["id"] - - assert sorted(data, key=sortkey) == sorted( + assert sorted(data, key=itemgetter('id')) == sorted( [ indexer.serialize_document(doc1, accesses), indexer.serialize_document(doc2, accesses), indexer.serialize_document(doc3, accesses), ], - key=sortkey, + key=itemgetter('id'), ) # The debounce counters should be reset diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py index 5f04f3ae..9b4309ac 100644 --- a/src/backend/core/tests/test_services_search_indexers.py +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -4,77 +4,154 @@ from functools import partial from unittest.mock import patch from django.contrib.auth.models import AnonymousUser +from django.core.exceptions import ImproperlyConfigured +from django.utils.module_loading import import_string import pytest from core import factories, models, utils from core.services.search_indexers import ( + BaseDocumentIndexer, FindDocumentIndexer, + get_document_indexer_class, get_visited_document_ids_of, ) pytestmark = pytest.mark.django_db -def test_push_raises_error_if_search_indexer_url_is_none(settings): +class FakeDocumentIndexer(BaseDocumentIndexer): + """Fake indexer for test purpose""" + + def serialize_document(self, document, accesses): + return {} + + def push(self, data): + pass + + def search_query(self, data, token): + return {} + + def format_response(self, data: dict): + return {} + + + +@pytest.fixture(name="fake_indexer_settings") +def fake_indexer_settings_fixture(settings): + """Fixture to switch the indexer to the FakeDocumentIndexer.""" + _original_backend = str(settings.SEARCH_INDEXER_CLASS) + + settings.SEARCH_INDEXER_CLASS = ( + "core.tests.test_services_search_indexers.FakeDocumentIndexer" + ) + get_document_indexer_class.cache_clear() + + yield settings + + settings.SEARCH_INDEXER_CLASS = _original_backend + # clear cache to prevent issues with other tests + get_document_indexer_class.cache_clear() + + +def test_services_search_indexer_class_is_empty(fake_indexer_settings): + """ + Should raise ImproperlyConfigured if SEARCH_INDEXER_CLASS is None or empty. + """ + fake_indexer_settings.SEARCH_INDEXER_CLASS = None + + with pytest.raises(ImproperlyConfigured) as exc_info: + get_document_indexer_class() + + assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value) + + fake_indexer_settings.SEARCH_INDEXER_CLASS = "" + + # clear cache again + get_document_indexer_class.cache_clear() + + with pytest.raises(ImproperlyConfigured) as exc_info: + get_document_indexer_class() + + assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value) + + +def test_services_search_indexer_class_invalid(fake_indexer_settings): + """ + Should raise RuntimeError if SEARCH_INDEXER_CLASS cannot be imported. + """ + fake_indexer_settings.SEARCH_INDEXER_CLASS = "unknown.Unknown" + + with pytest.raises(ImproperlyConfigured) as exc_info: + get_document_indexer_class() + + assert ( + "SEARCH_INDEXER_CLASS setting is not valid : No module named 'unknown'" + in str(exc_info.value) + ) + + +def test_services_search_indexer_class(fake_indexer_settings): + """ + Import indexer class defined in setting SEARCH_INDEXER_CLASS. + """ + fake_indexer_settings.SEARCH_INDEXER_CLASS = ( + "core.tests.test_services_search_indexers.FakeDocumentIndexer" + ) + + assert get_document_indexer_class() == import_string( + "core.tests.test_services_search_indexers.FakeDocumentIndexer" + ) + +def test_services_search_indexer_url_is_none(settings): """ Indexer should raise RuntimeError if SEARCH_INDEXER_URL is None or empty. """ settings.SEARCH_INDEXER_URL = None - indexer = FindDocumentIndexer() - with pytest.raises(RuntimeError) as exc_info: - indexer.push([]) + with pytest.raises(ImproperlyConfigured) as exc_info: + FindDocumentIndexer() - assert "SEARCH_INDEXER_URL must be set in Django settings before indexing." in str( - exc_info.value - ) + assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value) -def test_push_raises_error_if_search_indexer_url_is_empty(settings): +def test_services_search_indexer_url_is_empty(settings): """ Indexer should raise RuntimeError if SEARCH_INDEXER_URL is empty string. """ settings.SEARCH_INDEXER_URL = "" - indexer = FindDocumentIndexer() - with pytest.raises(RuntimeError) as exc_info: - indexer.push([]) + with pytest.raises(ImproperlyConfigured) as exc_info: + FindDocumentIndexer() - assert "SEARCH_INDEXER_URL must be set in Django settings before indexing." in str( - exc_info.value - ) + assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value) -def test_push_raises_error_if_search_indexer_secret_is_none(settings): +def test_services_search_indexer_secret_is_none(settings): """ Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is None or empty. """ settings.SEARCH_INDEXER_SECRET = None - indexer = FindDocumentIndexer() - with pytest.raises(RuntimeError) as exc_info: - indexer.push([]) + with pytest.raises(ImproperlyConfigured) as exc_info: + FindDocumentIndexer() - assert ( - "SEARCH_INDEXER_SECRET must be set in Django settings before indexing." - in str(exc_info.value) + assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str( + exc_info.value ) -def test_push_raises_error_if_search_indexer_secret_is_empty(settings): +def test_services_search_indexer_secret_is_empty(settings): """ Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is empty string. """ settings.SEARCH_INDEXER_SECRET = "" - indexer = FindDocumentIndexer() - with pytest.raises(RuntimeError) as exc_info: - indexer.push([]) + with pytest.raises(ImproperlyConfigured) as exc_info: + FindDocumentIndexer() - assert ( - "SEARCH_INDEXER_SECRET must be set in Django settings before indexing." - in str(exc_info.value) + assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str( + exc_info.value ) @@ -333,13 +410,10 @@ def test_search_query_raises_error_if_search_endpoint_is_none(settings): Indexer should raise RuntimeError if SEARCH_INDEXER_QUERY_URL is None or empty. """ settings.SEARCH_INDEXER_QUERY_URL = None - indexer = FindDocumentIndexer() - user = factories.UserFactory() - with pytest.raises(RuntimeError) as exc_info: - indexer.search("alpha", user=user, token="mytoken") + with pytest.raises(ImproperlyConfigured) as exc_info: + FindDocumentIndexer() - assert ( - "SEARCH_INDEXER_QUERY_URL must be set in Django settings before search." - in str(exc_info.value) + assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str( + exc_info.value ) diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py index 3232c246..d7b41cd5 100755 --- a/src/backend/impress/settings.py +++ b/src/backend/impress/settings.py @@ -100,6 +100,11 @@ class Base(Configuration): DEFAULT_AUTO_FIELD = "django.db.models.AutoField" # Search + SEARCH_INDEXER_CLASS = values.Value( + default="core.services.search_indexers.FindDocumentIndexer", + environ_name="SEARCH_INDEXER_CLASS", + environ_prefix=None, + ) SEARCH_INDEXER_BATCH_SIZE = values.IntegerValue( default=100_000, environ_name="SEARCH_INDEXER_BATCH_SIZE", environ_prefix=None ) @@ -977,6 +982,11 @@ class Test(Base): # Tests are raising warnings because the /data/static directory does not exist STATIC_ROOT = None + # Setup indexer configuration to make test working on the CI. + SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest" # noqa + SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/" + SEARCH_INDEXER_QUERY_URL = "http://localhost:8081/api/v1.0/documents/search/" + CELERY_TASK_ALWAYS_EAGER = values.BooleanValue(True) def __init__(self):