(backend) some refactor of indexer classes & modules

Rename FindDocumentIndexer to SearchIndexer
Rename FindDocumentSerializer to SearchDocumentSerializer
Rename package core.tasks.find to core.tasks.search
Remove logs on http errors in SearchIndexer
Factorise some code in search API view.

Signed-off-by: Fabre Florian <ffabre@hybird.org>
This commit is contained in:
Fabre Florian
2025-10-07 20:54:21 +02:00
committed by Quentin BEY
parent 6f282ec5d6
commit 044c1495a9
10 changed files with 112 additions and 111 deletions

View File

@@ -79,7 +79,7 @@ Y_PROVIDER_API_KEY=yprovider-api-key
THEME_CUSTOMIZATION_CACHE_TIMEOUT=15
# Indexer
SEARCH_INDEXER_CLASS="core.services.search_indexers.FindDocumentIndexer"
SEARCH_INDEXER_CLASS="core.services.search_indexers.SearchIndexer"
SEARCH_INDEXER_SECRET=find-api-key-for-docs-with-exactly-50-chars-length # Key generated by create_demo in Find app.
SEARCH_INDEXER_URL="http://find:8000/api/v1.0/documents/index/"
SEARCH_INDEXER_QUERY_URL="http://find:8000/api/v1.0/documents/search/"

View File

@@ -1015,8 +1015,8 @@ class ThreadSerializer(serializers.ModelSerializer):
return {}
class FindDocumentSerializer(serializers.Serializer):
"""Serializer for Find search requests"""
class SearchDocumentSerializer(serializers.Serializer):
"""Serializer for fulltext search requests through Find application"""
q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True)
page_size = serializers.IntegerField(

View File

@@ -1085,16 +1085,49 @@ class DocumentViewSet(
{"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED
)
def _simple_search_queryset(self, params):
"""
Returns a queryset filtered by the content of the document title
"""
text = params.validated_data["q"]
# As the 'list' view we get a prefiltered queryset (deleted docs are excluded)
queryset = self.get_queryset()
filterset = DocumentFilter({"title": text}, queryset=queryset)
if not filterset.is_valid():
raise drf.exceptions.ValidationError(filterset.errors)
return filterset.filter_queryset(queryset)
def _fulltext_search_queryset(self, indexer, token, user, params):
"""
Returns a queryset built from the results of the Find fulltext search
"""
text = params.validated_data["q"]
queryset = models.Document.objects.all()
# Retrieve the documents ids from Find.
results = indexer.search(
text=text,
token=token,
visited=get_visited_document_ids_of(queryset, user),
page=params.validated_data.get("page", 1),
page_size=params.validated_data.get("page_size", 20),
)
return queryset.filter(pk__in=results)
@drf.decorators.action(detail=False, methods=["get"], url_path="search")
@method_decorator(refresh_oidc_access_token)
def search(self, request, *args, **kwargs):
"""
Returns a DRF response containing the filtered, annotated and ordered document list.
Applies filtering based on request parameter 'q' from `FindDocumentSerializer`.
Applies filtering based on request parameter 'q' from `SearchDocumentSerializer`.
Depending on the configuration it can be:
- A fulltext search through the opensearch indexation app "find" if the backend is
enabled (see SEARCH_BACKEND_CLASS)
enabled (see SEARCH_INDEXER_CLASS)
- A filtering by the model field 'title'.
The ordering is always by the most recent first.
@@ -1102,46 +1135,22 @@ class DocumentViewSet(
access_token = request.session.get("oidc_access_token")
user = request.user
serializer = serializers.FindDocumentSerializer(data=request.query_params)
serializer.is_valid(raise_exception=True)
params = serializers.SearchDocumentSerializer(data=request.query_params)
params.is_valid(raise_exception=True)
indexer = get_document_indexer()
text = serializer.validated_data["q"]
# The indexer is not configured, so we fallback on a simple filter on the
# model field 'title'.
if not indexer:
# As the 'list' view we get a prefiltered queryset (deleted docs are excluded)
queryset = self.get_queryset()
filterset = DocumentFilter({"title": text}, queryset=queryset)
if not filterset.is_valid():
raise drf.exceptions.ValidationError(filterset.errors)
queryset = filterset.filter_queryset(queryset).order_by("-updated_at")
return self.get_response_for_queryset(
queryset,
context={
"request": request,
},
if indexer:
queryset = self._fulltext_search_queryset(
indexer, token=access_token, user=user, params=params
)
queryset = models.Document.objects.all()
# Retrieve the documents ids from Find.
results = indexer.search(
text=text,
token=access_token,
visited=get_visited_document_ids_of(queryset, user),
page=serializer.validated_data.get("page", 1),
page_size=serializer.validated_data.get("page_size", 20),
)
queryset = queryset.filter(pk__in=results).order_by("-updated_at")
else:
# The indexer is not configured, we fallback on a simple icontains filter by the
# model field 'title'.
queryset = self._simple_search_queryset(params)
return self.get_response_for_queryset(
queryset,
queryset.order_by("-updated_at"),
context={
"request": request,
},

View File

@@ -223,7 +223,7 @@ class BaseDocumentIndexer(ABC):
"""
class FindDocumentIndexer(BaseDocumentIndexer):
class SearchIndexer(BaseDocumentIndexer):
"""
Document indexer that pushes documents to La Suite Find app.
"""
@@ -270,18 +270,14 @@ class FindDocumentIndexer(BaseDocumentIndexer):
Returns:
dict: A JSON-serializable dictionary.
"""
try:
response = requests.post(
self.search_url,
json=data,
headers={"Authorization": f"Bearer {token}"},
timeout=10,
)
response.raise_for_status()
return response.json()
except requests.exceptions.HTTPError as e:
logger.error("HTTPError: %s", e)
raise
response = requests.post(
self.search_url,
json=data,
headers={"Authorization": f"Bearer {token}"},
timeout=10,
)
response.raise_for_status()
return response.json()
def push(self, data):
"""
@@ -290,14 +286,10 @@ class FindDocumentIndexer(BaseDocumentIndexer):
Args:
data (list): List of document dictionaries.
"""
try:
response = requests.post(
self.indexer_url,
json=data,
headers={"Authorization": f"Bearer {self.indexer_secret}"},
timeout=10,
)
response.raise_for_status()
except requests.exceptions.HTTPError as e:
logger.error("HTTPError: %s", e)
raise
response = requests.post(
self.indexer_url,
json=data,
headers={"Authorization": f"Bearer {self.indexer_secret}"},
timeout=10,
)
response.raise_for_status()

View File

@@ -9,7 +9,7 @@ from django.db.models import signals
from django.dispatch import receiver
from . import models
from .tasks.find import trigger_document_indexer
from .tasks.search import trigger_document_indexer
@receiver(signals.post_save, sender=models.Document)

View File

@@ -11,7 +11,7 @@ from django.db import transaction
import pytest
from core import factories
from core.services.search_indexers import FindDocumentIndexer
from core.services.search_indexers import SearchIndexer
@pytest.mark.django_db
@@ -19,7 +19,7 @@ from core.services.search_indexers import FindDocumentIndexer
def test_index():
"""Test the command `index` that runs the Find app indexer for all the available documents."""
user = factories.UserFactory()
indexer = FindDocumentIndexer()
indexer = SearchIndexer()
with transaction.atomic():
doc = factories.DocumentFactory()
@@ -36,7 +36,7 @@ def test_index():
str(no_title_doc.path): {"users": [user.sub]},
}
with mock.patch.object(FindDocumentIndexer, "push") as mock_push:
with mock.patch.object(SearchIndexer, "push") as mock_push:
call_command("index")
push_call_args = [call.args[0] for call in mock_push.call_args_list]

View File

@@ -39,7 +39,7 @@ def indexer_settings_fixture(settings):
get_document_indexer.cache_clear()
settings.SEARCH_INDEXER_CLASS = "core.services.search_indexers.FindDocumentIndexer"
settings.SEARCH_INDEXER_CLASS = "core.services.search_indexers.SearchIndexer"
settings.SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest"
settings.SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/"
settings.SEARCH_INDEXER_QUERY_URL = (

View File

@@ -21,7 +21,7 @@ from django.utils import timezone
import pytest
from core import factories, models
from core.services.search_indexers import FindDocumentIndexer
from core.services.search_indexers import SearchIndexer
pytestmark = pytest.mark.django_db
@@ -1622,7 +1622,7 @@ def test_models_documents_compute_ancestors_links_paths_mapping_structure(
}
@mock.patch.object(FindDocumentIndexer, "push")
@mock.patch.object(SearchIndexer, "push")
@pytest.mark.django_db(transaction=True)
def test_models_documents_post_save_indexer(mock_push, indexer_settings):
"""Test indexation task on document creation"""
@@ -1634,7 +1634,7 @@ def test_models_documents_post_save_indexer(mock_push, indexer_settings):
accesses = {}
data = [call.args[0] for call in mock_push.call_args_list]
indexer = FindDocumentIndexer()
indexer = SearchIndexer()
assert sorted(data, key=itemgetter("id")) == sorted(
[
@@ -1651,7 +1651,7 @@ def test_models_documents_post_save_indexer(mock_push, indexer_settings):
assert cache.get(f"doc-indexer-debounce-{doc3.pk}") == 0
@mock.patch.object(FindDocumentIndexer, "push")
@mock.patch.object(SearchIndexer, "push")
@pytest.mark.django_db(transaction=True)
def test_models_documents_post_save_indexer_not_configured(mock_push, indexer_settings):
"""Task should not start an indexation when disabled"""
@@ -1664,7 +1664,7 @@ def test_models_documents_post_save_indexer_not_configured(mock_push, indexer_se
assert mock_push.call_args_list == []
@mock.patch.object(FindDocumentIndexer, "push")
@mock.patch.object(SearchIndexer, "push")
@pytest.mark.django_db(transaction=True)
def test_models_documents_post_save_indexer_with_accesses(mock_push, indexer_settings):
"""Test indexation task on document creation"""
@@ -1687,7 +1687,7 @@ def test_models_documents_post_save_indexer_with_accesses(mock_push, indexer_set
data = [call.args[0] for call in mock_push.call_args_list]
indexer = FindDocumentIndexer()
indexer = SearchIndexer()
assert sorted(data, key=itemgetter("id")) == sorted(
[
@@ -1704,7 +1704,7 @@ def test_models_documents_post_save_indexer_with_accesses(mock_push, indexer_set
assert cache.get(f"doc-indexer-debounce-{doc3.pk}") == 0
@mock.patch.object(FindDocumentIndexer, "push")
@mock.patch.object(SearchIndexer, "push")
@pytest.mark.django_db(transaction=True)
def test_models_documents_post_save_indexer_deleted(mock_push, indexer_settings):
"""Indexation task on deleted or ancestor_deleted documents"""
@@ -1747,7 +1747,7 @@ def test_models_documents_post_save_indexer_deleted(mock_push, indexer_settings)
data = [call.args[0] for call in mock_push.call_args_list]
indexer = FindDocumentIndexer()
indexer = SearchIndexer()
# Even deleted document are re-indexed : only update their status in the future ?
assert sorted(data, key=itemgetter("id")) == sorted(
@@ -1766,7 +1766,7 @@ def test_models_documents_post_save_indexer_deleted(mock_push, indexer_settings)
assert cache.get(f"doc-indexer-debounce-{doc_ancestor_deleted.pk}") == 0
@mock.patch.object(FindDocumentIndexer, "push")
@mock.patch.object(SearchIndexer, "push")
@pytest.mark.django_db(transaction=True)
def test_models_documents_post_save_indexer_restored(mock_push, indexer_settings):
"""Restart indexation task on restored documents"""
@@ -1820,7 +1820,7 @@ def test_models_documents_post_save_indexer_restored(mock_push, indexer_settings
data = [call.args[0] for call in mock_push.call_args_list]
indexer = FindDocumentIndexer()
indexer = SearchIndexer()
# All docs are re-indexed
assert sorted(data, key=itemgetter("id")) == sorted(
@@ -1840,10 +1840,10 @@ def test_models_documents_post_save_indexer_debounce(indexer_settings):
"""Test indexation task skipping on document update"""
indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
indexer = FindDocumentIndexer()
indexer = SearchIndexer()
user = factories.UserFactory()
with mock.patch.object(FindDocumentIndexer, "push"):
with mock.patch.object(SearchIndexer, "push"):
with transaction.atomic():
doc = factories.DocumentFactory()
factories.UserDocumentAccessFactory(document=doc, user=user)
@@ -1852,7 +1852,7 @@ def test_models_documents_post_save_indexer_debounce(indexer_settings):
str(doc.path): {"users": [user.sub]},
}
with mock.patch.object(FindDocumentIndexer, "push") as mock_push:
with mock.patch.object(SearchIndexer, "push") as mock_push:
# Simulate 1 waiting task
cache.set(f"doc-indexer-debounce-{doc.pk}", 1)
@@ -1863,7 +1863,7 @@ def test_models_documents_post_save_indexer_debounce(indexer_settings):
assert [call.args[0] for call in mock_push.call_args_list] == []
with mock.patch.object(FindDocumentIndexer, "push") as mock_push:
with mock.patch.object(SearchIndexer, "push") as mock_push:
# No waiting task
cache.set(f"doc-indexer-debounce-{doc.pk}", 0)
@@ -1881,10 +1881,10 @@ def test_models_documents_access_post_save_indexer(indexer_settings):
"""Test indexation task on DocumentAccess update"""
indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
indexer = FindDocumentIndexer()
indexer = SearchIndexer()
user = factories.UserFactory()
with mock.patch.object(FindDocumentIndexer, "push"):
with mock.patch.object(SearchIndexer, "push"):
with transaction.atomic():
doc = factories.DocumentFactory()
doc_access = factories.UserDocumentAccessFactory(document=doc, user=user)
@@ -1893,9 +1893,9 @@ def test_models_documents_access_post_save_indexer(indexer_settings):
str(doc.path): {"users": [user.sub]},
}
indexer = FindDocumentIndexer()
indexer = SearchIndexer()
with mock.patch.object(FindDocumentIndexer, "push") as mock_push:
with mock.patch.object(SearchIndexer, "push") as mock_push:
with transaction.atomic():
doc_access.save()

View File

@@ -15,7 +15,7 @@ from requests import HTTPError
from core import factories, models, utils
from core.services.search_indexers import (
BaseDocumentIndexer,
FindDocumentIndexer,
SearchIndexer,
get_document_indexer,
get_visited_document_ids_of,
)
@@ -78,7 +78,7 @@ def test_services_search_indexer_is_configured(indexer_settings):
# Valid class
indexer_settings.SEARCH_INDEXER_CLASS = (
"core.services.search_indexers.FindDocumentIndexer"
"core.services.search_indexers.SearchIndexer"
)
get_document_indexer.cache_clear()
@@ -98,7 +98,7 @@ def test_services_search_indexer_url_is_none(indexer_settings):
indexer_settings.SEARCH_INDEXER_URL = None
with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer()
SearchIndexer()
assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)
@@ -110,7 +110,7 @@ def test_services_search_indexer_url_is_empty(indexer_settings):
indexer_settings.SEARCH_INDEXER_URL = ""
with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer()
SearchIndexer()
assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)
@@ -122,7 +122,7 @@ def test_services_search_indexer_secret_is_none(indexer_settings):
indexer_settings.SEARCH_INDEXER_SECRET = None
with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer()
SearchIndexer()
assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str(
exc_info.value
@@ -136,7 +136,7 @@ def test_services_search_indexer_secret_is_empty(indexer_settings):
indexer_settings.SEARCH_INDEXER_SECRET = ""
with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer()
SearchIndexer()
assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str(
exc_info.value
@@ -150,7 +150,7 @@ def test_services_search_endpoint_is_none(indexer_settings):
indexer_settings.SEARCH_INDEXER_QUERY_URL = None
with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer()
SearchIndexer()
assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str(
exc_info.value
@@ -164,7 +164,7 @@ def test_services_search_endpoint_is_empty(indexer_settings):
indexer_settings.SEARCH_INDEXER_QUERY_URL = ""
with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer()
SearchIndexer()
assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str(
exc_info.value
@@ -192,7 +192,7 @@ def test_services_search_indexers_serialize_document_returns_expected_json():
}
}
indexer = FindDocumentIndexer()
indexer = SearchIndexer()
result = indexer.serialize_document(document, accesses)
assert set(result.pop("users")) == {str(user_a.sub), str(user_b.sub)}
@@ -221,7 +221,7 @@ def test_services_search_indexers_serialize_document_deleted():
parent.soft_delete()
document.refresh_from_db()
indexer = FindDocumentIndexer()
indexer = SearchIndexer()
result = indexer.serialize_document(document, {})
assert result["is_active"] is False
@@ -232,7 +232,7 @@ def test_services_search_indexers_serialize_document_empty():
"""Empty documents returns empty content in the serialized json."""
document = factories.DocumentFactory(content="", title=None)
indexer = FindDocumentIndexer()
indexer = SearchIndexer()
result = indexer.serialize_document(document, {})
assert result["content"] == ""
@@ -256,10 +256,10 @@ def test_services_search_indexers_index_errors(indexer_settings):
)
with pytest.raises(HTTPError):
FindDocumentIndexer().index()
SearchIndexer().index()
@patch.object(FindDocumentIndexer, "push")
@patch.object(SearchIndexer, "push")
def test_services_search_indexers_batches_pass_only_batch_accesses(
mock_push, indexer_settings
):
@@ -276,7 +276,7 @@ def test_services_search_indexers_batches_pass_only_batch_accesses(
access = factories.UserDocumentAccessFactory(document=document)
expected_user_subs[str(document.id)] = str(access.user.sub)
assert FindDocumentIndexer().index() == 5
assert SearchIndexer().index() == 5
# Should be 3 batches: 2 + 2 + 1
assert mock_push.call_count == 3
@@ -299,7 +299,7 @@ def test_services_search_indexers_batches_pass_only_batch_accesses(
assert seen_doc_ids == {str(d.id) for d in documents}
@patch.object(FindDocumentIndexer, "push")
@patch.object(SearchIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_ignore_empty_documents(mock_push):
"""
@@ -311,7 +311,7 @@ def test_services_search_indexers_ignore_empty_documents(mock_push):
empty_title = factories.DocumentFactory(title="")
empty_content = factories.DocumentFactory(content="")
assert FindDocumentIndexer().index() == 3
assert SearchIndexer().index() == 3
assert mock_push.call_count == 1
@@ -327,7 +327,7 @@ def test_services_search_indexers_ignore_empty_documents(mock_push):
}
@patch.object(FindDocumentIndexer, "push")
@patch.object(SearchIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_ancestors_link_reach(mock_push):
"""Document accesses and reach should take into account ancestors link reaches."""
@@ -338,7 +338,7 @@ def test_services_search_indexers_ancestors_link_reach(mock_push):
parent = factories.DocumentFactory(parent=grand_parent, link_reach="public")
document = factories.DocumentFactory(parent=parent, link_reach="restricted")
assert FindDocumentIndexer().index() == 4
assert SearchIndexer().index() == 4
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
assert len(results) == 4
@@ -348,7 +348,7 @@ def test_services_search_indexers_ancestors_link_reach(mock_push):
assert results[str(document.id)]["reach"] == "public"
@patch.object(FindDocumentIndexer, "push")
@patch.object(SearchIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_ancestors_users(mock_push):
"""Document accesses and reach should include users from ancestors."""
@@ -358,7 +358,7 @@ def test_services_search_indexers_ancestors_users(mock_push):
parent = factories.DocumentFactory(parent=grand_parent, users=[user_p])
document = factories.DocumentFactory(parent=parent, users=[user_d])
assert FindDocumentIndexer().index() == 3
assert SearchIndexer().index() == 3
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
assert len(results) == 3
@@ -371,7 +371,7 @@ def test_services_search_indexers_ancestors_users(mock_push):
}
@patch.object(FindDocumentIndexer, "push")
@patch.object(SearchIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_ancestors_teams(mock_push):
"""Document accesses and reach should include teams from ancestors."""
@@ -379,7 +379,7 @@ def test_services_search_indexers_ancestors_teams(mock_push):
parent = factories.DocumentFactory(parent=grand_parent, teams=["team_p"])
document = factories.DocumentFactory(parent=parent, teams=["team_d"])
assert FindDocumentIndexer().index() == 3
assert SearchIndexer().index() == 3
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
assert len(results) == 3
@@ -396,7 +396,7 @@ def test_push_uses_correct_url_and_data(mock_post, indexer_settings):
"""
indexer_settings.SEARCH_INDEXER_URL = "http://example.com/index"
indexer = FindDocumentIndexer()
indexer = SearchIndexer()
sample_data = [{"id": "123", "title": "Test"}]
mock_response = mock_post.return_value
@@ -497,7 +497,7 @@ def test_services_search_indexers_search_errors(indexer_settings):
)
with pytest.raises(HTTPError):
FindDocumentIndexer().search("alpha", token="mytoken")
SearchIndexer().search("alpha", token="mytoken")
@patch("requests.post")
@@ -507,7 +507,7 @@ def test_services_search_indexers_search(mock_post, indexer_settings):
document ids from linktraces.
"""
user = factories.UserFactory()
indexer = FindDocumentIndexer()
indexer = SearchIndexer()
mock_response = mock_post.return_value
mock_response.raise_for_status.return_value = None # No error