(backend) improve search indexer service configuration

New SEARCH_INDEXER_CLASS setting to define the indexer service class.
Raise ImpoperlyConfigured errors instead of RuntimeError in index service.

Signed-off-by: Fabre Florian <ffabre@hybird.org>
This commit is contained in:
Fabre Florian
2025-09-11 14:05:56 +02:00
committed by Quentin BEY
parent d721b97f68
commit 24460ffc3a
11 changed files with 193 additions and 99 deletions

View File

@@ -52,7 +52,7 @@ OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"}
# Store OIDC tokens in the session
OIDC_STORE_ACCESS_TOKEN = True # Store the access token in the session
OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session
OIDC_STORE_REFRESH_TOKEN_KEY = AnExampleKeyForDevPurposeOnly
OIDC_STORE_REFRESH_TOKEN_KEY = ThisIsAnExampleKeyForDevPurposeOnly
# AI
AI_FEATURE_ENABLED=true

View File

@@ -1018,12 +1018,4 @@ class ThreadSerializer(serializers.ModelSerializer):
class FindDocumentSerializer(serializers.Serializer):
"""Serializer for Find search requests"""
q = serializers.CharField(required=True)
def validate_q(self, value):
"""Ensure the text field is not empty."""
if len(value.strip()) == 0:
raise serializers.ValidationError("Text field cannot be empty.")
return value
q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True)

View File

@@ -13,7 +13,7 @@ from django.conf import settings
from django.contrib.postgres.aggregates import ArrayAgg
from django.contrib.postgres.search import TrigramSimilarity
from django.core.cache import cache
from django.core.exceptions import ValidationError
from django.core.exceptions import ImproperlyConfigured, ValidationError
from django.core.files.storage import default_storage
from django.core.validators import URLValidator
from django.db import connection, transaction
@@ -52,7 +52,7 @@ from core.services.converter_services import (
from core.services.converter_services import (
YdocConverter,
)
from core.services.search_indexers import FindDocumentIndexer
from core.services.search_indexers import get_document_indexer_class
from core.tasks.mail import send_ask_for_access_mail
from core.utils import extract_attachments, filter_descendants
@@ -1095,15 +1095,15 @@ class DocumentViewSet(
serializer.is_valid(raise_exception=True)
try:
indexer = FindDocumentIndexer()
indexer = get_document_indexer_class()()
queryset = indexer.search(
text=serializer.validated_data.get("q", ""),
user=request.user,
token=access_token,
)
except RuntimeError:
except ImproperlyConfigured:
return drf.response.Response(
{"detail": "The service is not configured properly."},
{"detail": "The service is not properly configured."},
status=status.HTTP_401_UNAUTHORIZED,
)

View File

@@ -956,7 +956,7 @@ class Document(MP_Node, BaseModel):
@receiver(signals.post_save, sender=Document)
def document_post_save(sender, instance, **kwargs):
def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-argument
"""
Asynchronous call to the document indexer at the end of the transaction.
Note : Within the transaction we can have an empty content and a serialization
@@ -1196,7 +1196,7 @@ class DocumentAccess(BaseAccess):
@receiver(signals.post_save, sender=DocumentAccess)
def document_access_post_save(sender, instance, created, **kwargs):
def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument
"""
Asynchronous call to the document indexer at the end of the transaction.
"""

View File

@@ -3,9 +3,12 @@
import logging
from abc import ABC, abstractmethod
from collections import defaultdict
from functools import cache
from django.conf import settings
from django.contrib.auth.models import AnonymousUser
from django.core.exceptions import ImproperlyConfigured
from django.utils.module_loading import import_string
import requests
@@ -14,18 +17,33 @@ from core import models, utils
logger = logging.getLogger(__name__)
@cache
def get_document_indexer_class() -> "BaseDocumentIndexer":
"""Return the indexer backend class based on the settings."""
classpath = settings.SEARCH_INDEXER_CLASS
if not classpath:
raise ImproperlyConfigured(
"SEARCH_INDEXER_CLASS must be set in Django settings."
)
try:
return import_string(settings.SEARCH_INDEXER_CLASS)
except ImportError as err:
raise ImproperlyConfigured(
f"SEARCH_INDEXER_CLASS setting is not valid : {err}"
) from err
def get_batch_accesses_by_users_and_teams(paths):
"""
Get accesses related to a list of document paths,
grouped by users and teams, including all ancestor paths.
"""
# print("paths: ", paths)
ancestor_map = utils.get_ancestor_to_descendants_map(
paths, steplen=models.Document.steplen
)
ancestor_paths = list(ancestor_map.keys())
# print("ancestor map: ", ancestor_map)
# print("ancestor paths: ", ancestor_paths)
access_qs = models.DocumentAccess.objects.filter(
document__path__in=ancestor_paths
@@ -80,6 +98,24 @@ class BaseDocumentIndexer(ABC):
Defaults to settings.SEARCH_INDEXER_BATCH_SIZE.
"""
self.batch_size = batch_size or settings.SEARCH_INDEXER_BATCH_SIZE
self.indexer_url = settings.SEARCH_INDEXER_URL
self.indexer_secret = settings.SEARCH_INDEXER_SECRET
self.search_url = settings.SEARCH_INDEXER_QUERY_URL
if not self.indexer_url:
raise ImproperlyConfigured(
"SEARCH_INDEXER_URL must be set in Django settings."
)
if not self.indexer_secret:
raise ImproperlyConfigured(
"SEARCH_INDEXER_SECRET must be set in Django settings."
)
if not self.search_url:
raise ImproperlyConfigured(
"SEARCH_INDEXER_QUERY_URL must be set in Django settings."
)
def index(self):
"""
@@ -143,7 +179,7 @@ class BaseDocumentIndexer(ABC):
@abstractmethod
def search_query(self, data, token) -> dict:
"""
Retreive documents from the Find app API.
Retrieve documents from the Find app API.
Must be implemented by subclasses.
"""
@@ -204,16 +240,9 @@ class FindDocumentIndexer(BaseDocumentIndexer):
Returns:
dict: A JSON-serializable dictionary.
"""
url = getattr(settings, "SEARCH_INDEXER_QUERY_URL", None)
if not url:
raise RuntimeError(
"SEARCH_INDEXER_QUERY_URL must be set in Django settings before search."
)
try:
response = requests.post(
url,
self.search_url,
json=data,
headers={"Authorization": f"Bearer {token}"},
timeout=10,
@@ -222,7 +251,6 @@ class FindDocumentIndexer(BaseDocumentIndexer):
return response.json()
except requests.exceptions.HTTPError as e:
logger.error("HTTPError: %s", e)
logger.error("Response content: %s", response.text) # type: ignore
raise
def format_response(self, data: dict):
@@ -238,27 +266,15 @@ class FindDocumentIndexer(BaseDocumentIndexer):
Args:
data (list): List of document dictionaries.
"""
url = getattr(settings, "SEARCH_INDEXER_URL", None)
if not url:
raise RuntimeError(
"SEARCH_INDEXER_URL must be set in Django settings before indexing."
)
secret = getattr(settings, "SEARCH_INDEXER_SECRET", None)
if not secret:
raise RuntimeError(
"SEARCH_INDEXER_SECRET must be set in Django settings before indexing."
)
try:
response = requests.post(
url,
self.indexer_url,
json=data,
headers={"Authorization": f"Bearer {secret}"},
headers={"Authorization": f"Bearer {self.indexer_secret}"},
timeout=10,
)
response.raise_for_status()
except requests.exceptions.HTTPError as e:
logger.error("HTTPError: %s", e)
logger.error("Response content: %s", response.text)
raise

View File

@@ -1,5 +1,6 @@
"""Trigger document indexation using celery task."""
from functools import partial
from logging import getLogger
from django.conf import settings
@@ -8,8 +9,8 @@ from django.db import transaction
from core import models
from core.services.search_indexers import (
FindDocumentIndexer,
get_batch_accesses_by_users_and_teams,
get_document_indexer_class,
)
from impress.celery_app import app
@@ -52,7 +53,7 @@ def document_indexer_task(document_id):
return
doc = models.Document.objects.get(pk=document_id)
indexer = FindDocumentIndexer()
indexer = get_document_indexer_class()()
accesses = get_batch_accesses_by_users_and_teams((doc.path,))
data = indexer.serialize_document(document=doc, accesses=accesses)
@@ -75,11 +76,9 @@ def trigger_document_indexer(document, on_commit=False):
pass
if on_commit:
def _aux():
trigger_document_indexer(document, on_commit=False)
transaction.on_commit(_aux)
transaction.on_commit(
partial(trigger_document_indexer, document, on_commit=False)
)
else:
key = document_indexer_debounce_key(document.pk)
countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1)

View File

@@ -2,6 +2,7 @@
Unit test for `index` command.
"""
from operator import itemgetter
from unittest import mock
from django.core.management import call_command
@@ -34,19 +35,18 @@ def test_index():
str(no_title_doc.path): {"users": [user.sub]},
}
def sortkey(d):
return d["id"]
with mock.patch.object(FindDocumentIndexer, "push") as mock_push:
call_command("index")
push_call_args = [call.args[0] for call in mock_push.call_args_list]
assert len(push_call_args) == 1 # called once but with a batch of docs
assert sorted(push_call_args[0], key=sortkey) == sorted(
# called once but with a batch of docs
mock_push.assert_called_once()
assert sorted(push_call_args[0], key=itemgetter("id")) == sorted(
[
indexer.serialize_document(doc, accesses),
indexer.serialize_document(no_title_doc, accesses),
],
key=sortkey,
key=itemgetter("id"),
)

View File

@@ -57,7 +57,7 @@ def test_api_documents_search_endpoint_is_none(settings):
response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"})
assert response.status_code == 401
assert response.json() == {"detail": "The service is not configured properly."}
assert response.json() == {"detail": "The service is not properly configured."}
@responses.activate
@@ -75,6 +75,11 @@ def test_api_documents_search_invalid_params(settings):
assert response.status_code == 400
assert response.json() == {"q": ["This field is required."]}
response = APIClient().get("/api/v1.0/documents/search/", data={"q": " "})
assert response.status_code == 400
assert response.json() == {"q": ["This field may not be blank."]}
@responses.activate
def test_api_documents_search_format(settings):

View File

@@ -3,6 +3,7 @@ Unit tests for the Document model
"""
# pylint: disable=too-many-lines
from operator import itemgetter
import random
import smtplib
import time
@@ -1638,7 +1639,7 @@ def test_models_documents_post_save_indexer(mock_push, settings):
factories.UserDocumentAccessFactory(document=doc2, user=user)
factories.UserDocumentAccessFactory(document=doc3, user=user)
time.sleep(0.1) # waits for the end of the tasks
time.sleep(0.2) # waits for the end of the tasks
accesses = {
str(doc1.path): {"users": [user.sub]},
@@ -1650,16 +1651,13 @@ def test_models_documents_post_save_indexer(mock_push, settings):
indexer = FindDocumentIndexer()
def sortkey(d):
return d["id"]
assert sorted(data, key=sortkey) == sorted(
assert sorted(data, key=itemgetter('id')) == sorted(
[
indexer.serialize_document(doc1, accesses),
indexer.serialize_document(doc2, accesses),
indexer.serialize_document(doc3, accesses),
],
key=sortkey,
key=itemgetter('id'),
)
# The debounce counters should be reset

View File

@@ -4,77 +4,154 @@ from functools import partial
from unittest.mock import patch
from django.contrib.auth.models import AnonymousUser
from django.core.exceptions import ImproperlyConfigured
from django.utils.module_loading import import_string
import pytest
from core import factories, models, utils
from core.services.search_indexers import (
BaseDocumentIndexer,
FindDocumentIndexer,
get_document_indexer_class,
get_visited_document_ids_of,
)
pytestmark = pytest.mark.django_db
def test_push_raises_error_if_search_indexer_url_is_none(settings):
class FakeDocumentIndexer(BaseDocumentIndexer):
"""Fake indexer for test purpose"""
def serialize_document(self, document, accesses):
return {}
def push(self, data):
pass
def search_query(self, data, token):
return {}
def format_response(self, data: dict):
return {}
@pytest.fixture(name="fake_indexer_settings")
def fake_indexer_settings_fixture(settings):
"""Fixture to switch the indexer to the FakeDocumentIndexer."""
_original_backend = str(settings.SEARCH_INDEXER_CLASS)
settings.SEARCH_INDEXER_CLASS = (
"core.tests.test_services_search_indexers.FakeDocumentIndexer"
)
get_document_indexer_class.cache_clear()
yield settings
settings.SEARCH_INDEXER_CLASS = _original_backend
# clear cache to prevent issues with other tests
get_document_indexer_class.cache_clear()
def test_services_search_indexer_class_is_empty(fake_indexer_settings):
"""
Should raise ImproperlyConfigured if SEARCH_INDEXER_CLASS is None or empty.
"""
fake_indexer_settings.SEARCH_INDEXER_CLASS = None
with pytest.raises(ImproperlyConfigured) as exc_info:
get_document_indexer_class()
assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value)
fake_indexer_settings.SEARCH_INDEXER_CLASS = ""
# clear cache again
get_document_indexer_class.cache_clear()
with pytest.raises(ImproperlyConfigured) as exc_info:
get_document_indexer_class()
assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value)
def test_services_search_indexer_class_invalid(fake_indexer_settings):
"""
Should raise RuntimeError if SEARCH_INDEXER_CLASS cannot be imported.
"""
fake_indexer_settings.SEARCH_INDEXER_CLASS = "unknown.Unknown"
with pytest.raises(ImproperlyConfigured) as exc_info:
get_document_indexer_class()
assert (
"SEARCH_INDEXER_CLASS setting is not valid : No module named 'unknown'"
in str(exc_info.value)
)
def test_services_search_indexer_class(fake_indexer_settings):
"""
Import indexer class defined in setting SEARCH_INDEXER_CLASS.
"""
fake_indexer_settings.SEARCH_INDEXER_CLASS = (
"core.tests.test_services_search_indexers.FakeDocumentIndexer"
)
assert get_document_indexer_class() == import_string(
"core.tests.test_services_search_indexers.FakeDocumentIndexer"
)
def test_services_search_indexer_url_is_none(settings):
"""
Indexer should raise RuntimeError if SEARCH_INDEXER_URL is None or empty.
"""
settings.SEARCH_INDEXER_URL = None
indexer = FindDocumentIndexer()
with pytest.raises(RuntimeError) as exc_info:
indexer.push([])
with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer()
assert "SEARCH_INDEXER_URL must be set in Django settings before indexing." in str(
exc_info.value
)
assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)
def test_push_raises_error_if_search_indexer_url_is_empty(settings):
def test_services_search_indexer_url_is_empty(settings):
"""
Indexer should raise RuntimeError if SEARCH_INDEXER_URL is empty string.
"""
settings.SEARCH_INDEXER_URL = ""
indexer = FindDocumentIndexer()
with pytest.raises(RuntimeError) as exc_info:
indexer.push([])
with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer()
assert "SEARCH_INDEXER_URL must be set in Django settings before indexing." in str(
exc_info.value
)
assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)
def test_push_raises_error_if_search_indexer_secret_is_none(settings):
def test_services_search_indexer_secret_is_none(settings):
"""
Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is None or empty.
"""
settings.SEARCH_INDEXER_SECRET = None
indexer = FindDocumentIndexer()
with pytest.raises(RuntimeError) as exc_info:
indexer.push([])
with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer()
assert (
"SEARCH_INDEXER_SECRET must be set in Django settings before indexing."
in str(exc_info.value)
assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str(
exc_info.value
)
def test_push_raises_error_if_search_indexer_secret_is_empty(settings):
def test_services_search_indexer_secret_is_empty(settings):
"""
Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is empty string.
"""
settings.SEARCH_INDEXER_SECRET = ""
indexer = FindDocumentIndexer()
with pytest.raises(RuntimeError) as exc_info:
indexer.push([])
with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer()
assert (
"SEARCH_INDEXER_SECRET must be set in Django settings before indexing."
in str(exc_info.value)
assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str(
exc_info.value
)
@@ -333,13 +410,10 @@ def test_search_query_raises_error_if_search_endpoint_is_none(settings):
Indexer should raise RuntimeError if SEARCH_INDEXER_QUERY_URL is None or empty.
"""
settings.SEARCH_INDEXER_QUERY_URL = None
indexer = FindDocumentIndexer()
user = factories.UserFactory()
with pytest.raises(RuntimeError) as exc_info:
indexer.search("alpha", user=user, token="mytoken")
with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer()
assert (
"SEARCH_INDEXER_QUERY_URL must be set in Django settings before search."
in str(exc_info.value)
assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str(
exc_info.value
)

View File

@@ -100,6 +100,11 @@ class Base(Configuration):
DEFAULT_AUTO_FIELD = "django.db.models.AutoField"
# Search
SEARCH_INDEXER_CLASS = values.Value(
default="core.services.search_indexers.FindDocumentIndexer",
environ_name="SEARCH_INDEXER_CLASS",
environ_prefix=None,
)
SEARCH_INDEXER_BATCH_SIZE = values.IntegerValue(
default=100_000, environ_name="SEARCH_INDEXER_BATCH_SIZE", environ_prefix=None
)
@@ -977,6 +982,11 @@ class Test(Base):
# Tests are raising warnings because the /data/static directory does not exist
STATIC_ROOT = None
# Setup indexer configuration to make test working on the CI.
SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest" # noqa
SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/"
SEARCH_INDEXER_QUERY_URL = "http://localhost:8081/api/v1.0/documents/search/"
CELERY_TASK_ALWAYS_EAGER = values.BooleanValue(True)
def __init__(self):