(backend) improve search indexer service configuration

New SEARCH_INDEXER_CLASS setting to define the indexer service class.
Raise ImpoperlyConfigured errors instead of RuntimeError in index service.

Signed-off-by: Fabre Florian <ffabre@hybird.org>
This commit is contained in:
Fabre Florian
2025-09-11 14:05:56 +02:00
committed by Quentin BEY
parent d721b97f68
commit 24460ffc3a
11 changed files with 193 additions and 99 deletions

View File

@@ -52,7 +52,7 @@ OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"}
# Store OIDC tokens in the session # Store OIDC tokens in the session
OIDC_STORE_ACCESS_TOKEN = True # Store the access token in the session OIDC_STORE_ACCESS_TOKEN = True # Store the access token in the session
OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session
OIDC_STORE_REFRESH_TOKEN_KEY = AnExampleKeyForDevPurposeOnly OIDC_STORE_REFRESH_TOKEN_KEY = ThisIsAnExampleKeyForDevPurposeOnly
# AI # AI
AI_FEATURE_ENABLED=true AI_FEATURE_ENABLED=true

View File

@@ -1018,12 +1018,4 @@ class ThreadSerializer(serializers.ModelSerializer):
class FindDocumentSerializer(serializers.Serializer): class FindDocumentSerializer(serializers.Serializer):
"""Serializer for Find search requests""" """Serializer for Find search requests"""
q = serializers.CharField(required=True) q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True)
def validate_q(self, value):
"""Ensure the text field is not empty."""
if len(value.strip()) == 0:
raise serializers.ValidationError("Text field cannot be empty.")
return value

View File

@@ -13,7 +13,7 @@ from django.conf import settings
from django.contrib.postgres.aggregates import ArrayAgg from django.contrib.postgres.aggregates import ArrayAgg
from django.contrib.postgres.search import TrigramSimilarity from django.contrib.postgres.search import TrigramSimilarity
from django.core.cache import cache from django.core.cache import cache
from django.core.exceptions import ValidationError from django.core.exceptions import ImproperlyConfigured, ValidationError
from django.core.files.storage import default_storage from django.core.files.storage import default_storage
from django.core.validators import URLValidator from django.core.validators import URLValidator
from django.db import connection, transaction from django.db import connection, transaction
@@ -52,7 +52,7 @@ from core.services.converter_services import (
from core.services.converter_services import ( from core.services.converter_services import (
YdocConverter, YdocConverter,
) )
from core.services.search_indexers import FindDocumentIndexer from core.services.search_indexers import get_document_indexer_class
from core.tasks.mail import send_ask_for_access_mail from core.tasks.mail import send_ask_for_access_mail
from core.utils import extract_attachments, filter_descendants from core.utils import extract_attachments, filter_descendants
@@ -1095,15 +1095,15 @@ class DocumentViewSet(
serializer.is_valid(raise_exception=True) serializer.is_valid(raise_exception=True)
try: try:
indexer = FindDocumentIndexer() indexer = get_document_indexer_class()()
queryset = indexer.search( queryset = indexer.search(
text=serializer.validated_data.get("q", ""), text=serializer.validated_data.get("q", ""),
user=request.user, user=request.user,
token=access_token, token=access_token,
) )
except RuntimeError: except ImproperlyConfigured:
return drf.response.Response( return drf.response.Response(
{"detail": "The service is not configured properly."}, {"detail": "The service is not properly configured."},
status=status.HTTP_401_UNAUTHORIZED, status=status.HTTP_401_UNAUTHORIZED,
) )

View File

@@ -956,7 +956,7 @@ class Document(MP_Node, BaseModel):
@receiver(signals.post_save, sender=Document) @receiver(signals.post_save, sender=Document)
def document_post_save(sender, instance, **kwargs): def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-argument
""" """
Asynchronous call to the document indexer at the end of the transaction. Asynchronous call to the document indexer at the end of the transaction.
Note : Within the transaction we can have an empty content and a serialization Note : Within the transaction we can have an empty content and a serialization
@@ -1196,7 +1196,7 @@ class DocumentAccess(BaseAccess):
@receiver(signals.post_save, sender=DocumentAccess) @receiver(signals.post_save, sender=DocumentAccess)
def document_access_post_save(sender, instance, created, **kwargs): def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument
""" """
Asynchronous call to the document indexer at the end of the transaction. Asynchronous call to the document indexer at the end of the transaction.
""" """

View File

@@ -3,9 +3,12 @@
import logging import logging
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections import defaultdict from collections import defaultdict
from functools import cache
from django.conf import settings from django.conf import settings
from django.contrib.auth.models import AnonymousUser from django.contrib.auth.models import AnonymousUser
from django.core.exceptions import ImproperlyConfigured
from django.utils.module_loading import import_string
import requests import requests
@@ -14,18 +17,33 @@ from core import models, utils
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@cache
def get_document_indexer_class() -> "BaseDocumentIndexer":
"""Return the indexer backend class based on the settings."""
classpath = settings.SEARCH_INDEXER_CLASS
if not classpath:
raise ImproperlyConfigured(
"SEARCH_INDEXER_CLASS must be set in Django settings."
)
try:
return import_string(settings.SEARCH_INDEXER_CLASS)
except ImportError as err:
raise ImproperlyConfigured(
f"SEARCH_INDEXER_CLASS setting is not valid : {err}"
) from err
def get_batch_accesses_by_users_and_teams(paths): def get_batch_accesses_by_users_and_teams(paths):
""" """
Get accesses related to a list of document paths, Get accesses related to a list of document paths,
grouped by users and teams, including all ancestor paths. grouped by users and teams, including all ancestor paths.
""" """
# print("paths: ", paths)
ancestor_map = utils.get_ancestor_to_descendants_map( ancestor_map = utils.get_ancestor_to_descendants_map(
paths, steplen=models.Document.steplen paths, steplen=models.Document.steplen
) )
ancestor_paths = list(ancestor_map.keys()) ancestor_paths = list(ancestor_map.keys())
# print("ancestor map: ", ancestor_map)
# print("ancestor paths: ", ancestor_paths)
access_qs = models.DocumentAccess.objects.filter( access_qs = models.DocumentAccess.objects.filter(
document__path__in=ancestor_paths document__path__in=ancestor_paths
@@ -80,6 +98,24 @@ class BaseDocumentIndexer(ABC):
Defaults to settings.SEARCH_INDEXER_BATCH_SIZE. Defaults to settings.SEARCH_INDEXER_BATCH_SIZE.
""" """
self.batch_size = batch_size or settings.SEARCH_INDEXER_BATCH_SIZE self.batch_size = batch_size or settings.SEARCH_INDEXER_BATCH_SIZE
self.indexer_url = settings.SEARCH_INDEXER_URL
self.indexer_secret = settings.SEARCH_INDEXER_SECRET
self.search_url = settings.SEARCH_INDEXER_QUERY_URL
if not self.indexer_url:
raise ImproperlyConfigured(
"SEARCH_INDEXER_URL must be set in Django settings."
)
if not self.indexer_secret:
raise ImproperlyConfigured(
"SEARCH_INDEXER_SECRET must be set in Django settings."
)
if not self.search_url:
raise ImproperlyConfigured(
"SEARCH_INDEXER_QUERY_URL must be set in Django settings."
)
def index(self): def index(self):
""" """
@@ -143,7 +179,7 @@ class BaseDocumentIndexer(ABC):
@abstractmethod @abstractmethod
def search_query(self, data, token) -> dict: def search_query(self, data, token) -> dict:
""" """
Retreive documents from the Find app API. Retrieve documents from the Find app API.
Must be implemented by subclasses. Must be implemented by subclasses.
""" """
@@ -204,16 +240,9 @@ class FindDocumentIndexer(BaseDocumentIndexer):
Returns: Returns:
dict: A JSON-serializable dictionary. dict: A JSON-serializable dictionary.
""" """
url = getattr(settings, "SEARCH_INDEXER_QUERY_URL", None)
if not url:
raise RuntimeError(
"SEARCH_INDEXER_QUERY_URL must be set in Django settings before search."
)
try: try:
response = requests.post( response = requests.post(
url, self.search_url,
json=data, json=data,
headers={"Authorization": f"Bearer {token}"}, headers={"Authorization": f"Bearer {token}"},
timeout=10, timeout=10,
@@ -222,7 +251,6 @@ class FindDocumentIndexer(BaseDocumentIndexer):
return response.json() return response.json()
except requests.exceptions.HTTPError as e: except requests.exceptions.HTTPError as e:
logger.error("HTTPError: %s", e) logger.error("HTTPError: %s", e)
logger.error("Response content: %s", response.text) # type: ignore
raise raise
def format_response(self, data: dict): def format_response(self, data: dict):
@@ -238,27 +266,15 @@ class FindDocumentIndexer(BaseDocumentIndexer):
Args: Args:
data (list): List of document dictionaries. data (list): List of document dictionaries.
""" """
url = getattr(settings, "SEARCH_INDEXER_URL", None)
if not url:
raise RuntimeError(
"SEARCH_INDEXER_URL must be set in Django settings before indexing."
)
secret = getattr(settings, "SEARCH_INDEXER_SECRET", None)
if not secret:
raise RuntimeError(
"SEARCH_INDEXER_SECRET must be set in Django settings before indexing."
)
try: try:
response = requests.post( response = requests.post(
url, self.indexer_url,
json=data, json=data,
headers={"Authorization": f"Bearer {secret}"}, headers={"Authorization": f"Bearer {self.indexer_secret}"},
timeout=10, timeout=10,
) )
response.raise_for_status() response.raise_for_status()
except requests.exceptions.HTTPError as e: except requests.exceptions.HTTPError as e:
logger.error("HTTPError: %s", e) logger.error("HTTPError: %s", e)
logger.error("Response content: %s", response.text)
raise raise

View File

@@ -1,5 +1,6 @@
"""Trigger document indexation using celery task.""" """Trigger document indexation using celery task."""
from functools import partial
from logging import getLogger from logging import getLogger
from django.conf import settings from django.conf import settings
@@ -8,8 +9,8 @@ from django.db import transaction
from core import models from core import models
from core.services.search_indexers import ( from core.services.search_indexers import (
FindDocumentIndexer,
get_batch_accesses_by_users_and_teams, get_batch_accesses_by_users_and_teams,
get_document_indexer_class,
) )
from impress.celery_app import app from impress.celery_app import app
@@ -52,7 +53,7 @@ def document_indexer_task(document_id):
return return
doc = models.Document.objects.get(pk=document_id) doc = models.Document.objects.get(pk=document_id)
indexer = FindDocumentIndexer() indexer = get_document_indexer_class()()
accesses = get_batch_accesses_by_users_and_teams((doc.path,)) accesses = get_batch_accesses_by_users_and_teams((doc.path,))
data = indexer.serialize_document(document=doc, accesses=accesses) data = indexer.serialize_document(document=doc, accesses=accesses)
@@ -75,11 +76,9 @@ def trigger_document_indexer(document, on_commit=False):
pass pass
if on_commit: if on_commit:
transaction.on_commit(
def _aux(): partial(trigger_document_indexer, document, on_commit=False)
trigger_document_indexer(document, on_commit=False) )
transaction.on_commit(_aux)
else: else:
key = document_indexer_debounce_key(document.pk) key = document_indexer_debounce_key(document.pk)
countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1) countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1)

View File

@@ -2,6 +2,7 @@
Unit test for `index` command. Unit test for `index` command.
""" """
from operator import itemgetter
from unittest import mock from unittest import mock
from django.core.management import call_command from django.core.management import call_command
@@ -34,19 +35,18 @@ def test_index():
str(no_title_doc.path): {"users": [user.sub]}, str(no_title_doc.path): {"users": [user.sub]},
} }
def sortkey(d):
return d["id"]
with mock.patch.object(FindDocumentIndexer, "push") as mock_push: with mock.patch.object(FindDocumentIndexer, "push") as mock_push:
call_command("index") call_command("index")
push_call_args = [call.args[0] for call in mock_push.call_args_list] push_call_args = [call.args[0] for call in mock_push.call_args_list]
assert len(push_call_args) == 1 # called once but with a batch of docs # called once but with a batch of docs
assert sorted(push_call_args[0], key=sortkey) == sorted( mock_push.assert_called_once()
assert sorted(push_call_args[0], key=itemgetter("id")) == sorted(
[ [
indexer.serialize_document(doc, accesses), indexer.serialize_document(doc, accesses),
indexer.serialize_document(no_title_doc, accesses), indexer.serialize_document(no_title_doc, accesses),
], ],
key=sortkey, key=itemgetter("id"),
) )

View File

@@ -57,7 +57,7 @@ def test_api_documents_search_endpoint_is_none(settings):
response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"}) response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"})
assert response.status_code == 401 assert response.status_code == 401
assert response.json() == {"detail": "The service is not configured properly."} assert response.json() == {"detail": "The service is not properly configured."}
@responses.activate @responses.activate
@@ -75,6 +75,11 @@ def test_api_documents_search_invalid_params(settings):
assert response.status_code == 400 assert response.status_code == 400
assert response.json() == {"q": ["This field is required."]} assert response.json() == {"q": ["This field is required."]}
response = APIClient().get("/api/v1.0/documents/search/", data={"q": " "})
assert response.status_code == 400
assert response.json() == {"q": ["This field may not be blank."]}
@responses.activate @responses.activate
def test_api_documents_search_format(settings): def test_api_documents_search_format(settings):

View File

@@ -3,6 +3,7 @@ Unit tests for the Document model
""" """
# pylint: disable=too-many-lines # pylint: disable=too-many-lines
from operator import itemgetter
import random import random
import smtplib import smtplib
import time import time
@@ -1638,7 +1639,7 @@ def test_models_documents_post_save_indexer(mock_push, settings):
factories.UserDocumentAccessFactory(document=doc2, user=user) factories.UserDocumentAccessFactory(document=doc2, user=user)
factories.UserDocumentAccessFactory(document=doc3, user=user) factories.UserDocumentAccessFactory(document=doc3, user=user)
time.sleep(0.1) # waits for the end of the tasks time.sleep(0.2) # waits for the end of the tasks
accesses = { accesses = {
str(doc1.path): {"users": [user.sub]}, str(doc1.path): {"users": [user.sub]},
@@ -1650,16 +1651,13 @@ def test_models_documents_post_save_indexer(mock_push, settings):
indexer = FindDocumentIndexer() indexer = FindDocumentIndexer()
def sortkey(d): assert sorted(data, key=itemgetter('id')) == sorted(
return d["id"]
assert sorted(data, key=sortkey) == sorted(
[ [
indexer.serialize_document(doc1, accesses), indexer.serialize_document(doc1, accesses),
indexer.serialize_document(doc2, accesses), indexer.serialize_document(doc2, accesses),
indexer.serialize_document(doc3, accesses), indexer.serialize_document(doc3, accesses),
], ],
key=sortkey, key=itemgetter('id'),
) )
# The debounce counters should be reset # The debounce counters should be reset

View File

@@ -4,77 +4,154 @@ from functools import partial
from unittest.mock import patch from unittest.mock import patch
from django.contrib.auth.models import AnonymousUser from django.contrib.auth.models import AnonymousUser
from django.core.exceptions import ImproperlyConfigured
from django.utils.module_loading import import_string
import pytest import pytest
from core import factories, models, utils from core import factories, models, utils
from core.services.search_indexers import ( from core.services.search_indexers import (
BaseDocumentIndexer,
FindDocumentIndexer, FindDocumentIndexer,
get_document_indexer_class,
get_visited_document_ids_of, get_visited_document_ids_of,
) )
pytestmark = pytest.mark.django_db pytestmark = pytest.mark.django_db
def test_push_raises_error_if_search_indexer_url_is_none(settings): class FakeDocumentIndexer(BaseDocumentIndexer):
"""Fake indexer for test purpose"""
def serialize_document(self, document, accesses):
return {}
def push(self, data):
pass
def search_query(self, data, token):
return {}
def format_response(self, data: dict):
return {}
@pytest.fixture(name="fake_indexer_settings")
def fake_indexer_settings_fixture(settings):
"""Fixture to switch the indexer to the FakeDocumentIndexer."""
_original_backend = str(settings.SEARCH_INDEXER_CLASS)
settings.SEARCH_INDEXER_CLASS = (
"core.tests.test_services_search_indexers.FakeDocumentIndexer"
)
get_document_indexer_class.cache_clear()
yield settings
settings.SEARCH_INDEXER_CLASS = _original_backend
# clear cache to prevent issues with other tests
get_document_indexer_class.cache_clear()
def test_services_search_indexer_class_is_empty(fake_indexer_settings):
"""
Should raise ImproperlyConfigured if SEARCH_INDEXER_CLASS is None or empty.
"""
fake_indexer_settings.SEARCH_INDEXER_CLASS = None
with pytest.raises(ImproperlyConfigured) as exc_info:
get_document_indexer_class()
assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value)
fake_indexer_settings.SEARCH_INDEXER_CLASS = ""
# clear cache again
get_document_indexer_class.cache_clear()
with pytest.raises(ImproperlyConfigured) as exc_info:
get_document_indexer_class()
assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value)
def test_services_search_indexer_class_invalid(fake_indexer_settings):
"""
Should raise RuntimeError if SEARCH_INDEXER_CLASS cannot be imported.
"""
fake_indexer_settings.SEARCH_INDEXER_CLASS = "unknown.Unknown"
with pytest.raises(ImproperlyConfigured) as exc_info:
get_document_indexer_class()
assert (
"SEARCH_INDEXER_CLASS setting is not valid : No module named 'unknown'"
in str(exc_info.value)
)
def test_services_search_indexer_class(fake_indexer_settings):
"""
Import indexer class defined in setting SEARCH_INDEXER_CLASS.
"""
fake_indexer_settings.SEARCH_INDEXER_CLASS = (
"core.tests.test_services_search_indexers.FakeDocumentIndexer"
)
assert get_document_indexer_class() == import_string(
"core.tests.test_services_search_indexers.FakeDocumentIndexer"
)
def test_services_search_indexer_url_is_none(settings):
""" """
Indexer should raise RuntimeError if SEARCH_INDEXER_URL is None or empty. Indexer should raise RuntimeError if SEARCH_INDEXER_URL is None or empty.
""" """
settings.SEARCH_INDEXER_URL = None settings.SEARCH_INDEXER_URL = None
indexer = FindDocumentIndexer()
with pytest.raises(RuntimeError) as exc_info: with pytest.raises(ImproperlyConfigured) as exc_info:
indexer.push([]) FindDocumentIndexer()
assert "SEARCH_INDEXER_URL must be set in Django settings before indexing." in str( assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)
exc_info.value
)
def test_push_raises_error_if_search_indexer_url_is_empty(settings): def test_services_search_indexer_url_is_empty(settings):
""" """
Indexer should raise RuntimeError if SEARCH_INDEXER_URL is empty string. Indexer should raise RuntimeError if SEARCH_INDEXER_URL is empty string.
""" """
settings.SEARCH_INDEXER_URL = "" settings.SEARCH_INDEXER_URL = ""
indexer = FindDocumentIndexer()
with pytest.raises(RuntimeError) as exc_info: with pytest.raises(ImproperlyConfigured) as exc_info:
indexer.push([]) FindDocumentIndexer()
assert "SEARCH_INDEXER_URL must be set in Django settings before indexing." in str( assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)
exc_info.value
)
def test_push_raises_error_if_search_indexer_secret_is_none(settings): def test_services_search_indexer_secret_is_none(settings):
""" """
Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is None or empty. Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is None or empty.
""" """
settings.SEARCH_INDEXER_SECRET = None settings.SEARCH_INDEXER_SECRET = None
indexer = FindDocumentIndexer()
with pytest.raises(RuntimeError) as exc_info: with pytest.raises(ImproperlyConfigured) as exc_info:
indexer.push([]) FindDocumentIndexer()
assert ( assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str(
"SEARCH_INDEXER_SECRET must be set in Django settings before indexing." exc_info.value
in str(exc_info.value)
) )
def test_push_raises_error_if_search_indexer_secret_is_empty(settings): def test_services_search_indexer_secret_is_empty(settings):
""" """
Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is empty string. Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is empty string.
""" """
settings.SEARCH_INDEXER_SECRET = "" settings.SEARCH_INDEXER_SECRET = ""
indexer = FindDocumentIndexer()
with pytest.raises(RuntimeError) as exc_info: with pytest.raises(ImproperlyConfigured) as exc_info:
indexer.push([]) FindDocumentIndexer()
assert ( assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str(
"SEARCH_INDEXER_SECRET must be set in Django settings before indexing." exc_info.value
in str(exc_info.value)
) )
@@ -333,13 +410,10 @@ def test_search_query_raises_error_if_search_endpoint_is_none(settings):
Indexer should raise RuntimeError if SEARCH_INDEXER_QUERY_URL is None or empty. Indexer should raise RuntimeError if SEARCH_INDEXER_QUERY_URL is None or empty.
""" """
settings.SEARCH_INDEXER_QUERY_URL = None settings.SEARCH_INDEXER_QUERY_URL = None
indexer = FindDocumentIndexer()
user = factories.UserFactory()
with pytest.raises(RuntimeError) as exc_info: with pytest.raises(ImproperlyConfigured) as exc_info:
indexer.search("alpha", user=user, token="mytoken") FindDocumentIndexer()
assert ( assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str(
"SEARCH_INDEXER_QUERY_URL must be set in Django settings before search." exc_info.value
in str(exc_info.value)
) )

View File

@@ -100,6 +100,11 @@ class Base(Configuration):
DEFAULT_AUTO_FIELD = "django.db.models.AutoField" DEFAULT_AUTO_FIELD = "django.db.models.AutoField"
# Search # Search
SEARCH_INDEXER_CLASS = values.Value(
default="core.services.search_indexers.FindDocumentIndexer",
environ_name="SEARCH_INDEXER_CLASS",
environ_prefix=None,
)
SEARCH_INDEXER_BATCH_SIZE = values.IntegerValue( SEARCH_INDEXER_BATCH_SIZE = values.IntegerValue(
default=100_000, environ_name="SEARCH_INDEXER_BATCH_SIZE", environ_prefix=None default=100_000, environ_name="SEARCH_INDEXER_BATCH_SIZE", environ_prefix=None
) )
@@ -977,6 +982,11 @@ class Test(Base):
# Tests are raising warnings because the /data/static directory does not exist # Tests are raising warnings because the /data/static directory does not exist
STATIC_ROOT = None STATIC_ROOT = None
# Setup indexer configuration to make test working on the CI.
SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest" # noqa
SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/"
SEARCH_INDEXER_QUERY_URL = "http://localhost:8081/api/v1.0/documents/search/"
CELERY_TASK_ALWAYS_EAGER = values.BooleanValue(True) CELERY_TASK_ALWAYS_EAGER = values.BooleanValue(True)
def __init__(self): def __init__(self):