(backend) add fallback search & default ordering

Filter deleted documents from visited ones.
Set the default ordering of the Find API search call to -updated_at.
BaseDocumentIndexer.search now returns a list of document ids instead of models.
Do not call the indexer in signals when SEARCH_INDEXER_CLASS is not defined
or is not properly configured.

Signed-off-by: Fabre Florian <ffabre@hybird.org>
This commit is contained in:
Fabre Florian
2025-09-17 07:47:15 +02:00
committed by Quentin BEY
parent bf978b5376
commit 01c31ddd74
11 changed files with 558 additions and 153 deletions

View File

@@ -1019,3 +1019,7 @@ class FindDocumentSerializer(serializers.Serializer):
"""Serializer for Find search requests""" """Serializer for Find search requests"""
q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True) q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True)
page_size = serializers.IntegerField(
required=False, min_value=1, max_value=50, default=20
)
page = serializers.IntegerField(required=False, min_value=1, default=1)

View File

@@ -13,7 +13,7 @@ from django.conf import settings
from django.contrib.postgres.aggregates import ArrayAgg from django.contrib.postgres.aggregates import ArrayAgg
from django.contrib.postgres.search import TrigramSimilarity from django.contrib.postgres.search import TrigramSimilarity
from django.core.cache import cache from django.core.cache import cache
from django.core.exceptions import ImproperlyConfigured, ValidationError from django.core.exceptions import ValidationError
from django.core.files.storage import default_storage from django.core.files.storage import default_storage
from django.core.validators import URLValidator from django.core.validators import URLValidator
from django.db import connection, transaction from django.db import connection, transaction
@@ -52,7 +52,10 @@ from core.services.converter_services import (
from core.services.converter_services import ( from core.services.converter_services import (
YdocConverter, YdocConverter,
) )
from core.services.search_indexers import get_document_indexer_class from core.services.search_indexers import (
default_document_indexer,
get_visited_document_ids_of,
)
from core.tasks.mail import send_ask_for_access_mail from core.tasks.mail import send_ask_for_access_mail
from core.utils import extract_attachments, filter_descendants from core.utils import extract_attachments, filter_descendants
@@ -1090,23 +1093,42 @@ class DocumentViewSet(
The filtering allows full text search through the opensearch indexation app "find". The filtering allows full text search through the opensearch indexation app "find".
""" """
access_token = request.session.get("oidc_access_token") access_token = request.session.get("oidc_access_token")
user = request.user
serializer = serializers.FindDocumentSerializer(data=request.query_params) serializer = serializers.FindDocumentSerializer(data=request.query_params)
serializer.is_valid(raise_exception=True) serializer.is_valid(raise_exception=True)
try: indexer = default_document_indexer()
indexer = get_document_indexer_class()()
queryset = indexer.search( if not indexer:
text=serializer.validated_data.get("q", ""), queryset = self.get_queryset()
user=request.user, filterset = DocumentFilter(
token=access_token, {"title": serializer.validated_data.get("q", "")}, queryset=queryset
) )
except ImproperlyConfigured:
return drf.response.Response( if not filterset.is_valid():
{"detail": "The service is not properly configured."}, raise drf.exceptions.ValidationError(filterset.errors)
status=status.HTTP_401_UNAUTHORIZED,
queryset = filterset.filter_queryset(queryset).order_by("-updated_at")
return self.get_response_for_queryset(
queryset,
context={
"request": request,
},
) )
queryset = models.Document.objects.all()
results = indexer.search(
text=serializer.validated_data.get("q", ""),
token=access_token,
visited=get_visited_document_ids_of(queryset, user),
page=serializer.validated_data.get("page", 1),
page_size=serializer.validated_data.get("page_size", 20),
)
queryset = queryset.filter(pk__in=results)
return self.get_response_for_queryset( return self.get_response_for_queryset(
queryset, queryset,
context={ context={

View File

@@ -8,6 +8,7 @@ from functools import cache
from django.conf import settings from django.conf import settings
from django.contrib.auth.models import AnonymousUser from django.contrib.auth.models import AnonymousUser
from django.core.exceptions import ImproperlyConfigured from django.core.exceptions import ImproperlyConfigured
from django.db.models import Subquery
from django.utils.module_loading import import_string from django.utils.module_loading import import_string
import requests import requests
@@ -18,7 +19,23 @@ logger = logging.getLogger(__name__)
@cache @cache
def get_document_indexer_class() -> "BaseDocumentIndexer": def default_document_indexer():
"""Returns default indexer service is enabled and properly configured."""
# For this usecase an empty indexer class is not an issue but a feature.
if not getattr(settings, "SEARCH_INDEXER_CLASS", None):
logger.info("Document indexer is not configured (see SEARCH_INDEXER_CLASS)")
return None
try:
return get_document_indexer_class()()
except ImproperlyConfigured as err:
logger.error("Document indexer is not properly configured : %s", err)
return None
@cache
def get_document_indexer_class():
"""Return the indexer backend class based on the settings.""" """Return the indexer backend class based on the settings."""
classpath = settings.SEARCH_INDEXER_CLASS classpath = settings.SEARCH_INDEXER_CLASS
@@ -65,7 +82,7 @@ def get_batch_accesses_by_users_and_teams(paths):
return dict(access_by_document_path) return dict(access_by_document_path)
def get_visited_document_ids_of(user): def get_visited_document_ids_of(queryset, user):
""" """
Returns the ids of the documents that have a linktrace to the user and NOT owned. Returns the ids of the documents that have a linktrace to the user and NOT owned.
It will be use to limit the opensearch responses to the public documents already It will be use to limit the opensearch responses to the public documents already
@@ -74,11 +91,18 @@ def get_visited_document_ids_of(user):
if isinstance(user, AnonymousUser): if isinstance(user, AnonymousUser):
return [] return []
qs = models.LinkTrace.objects.filter(user=user).exclude( qs = models.LinkTrace.objects.filter(user=user)
document__accesses__user=user,
docs = (
queryset.exclude(accesses__user=user)
.filter(
deleted_at__isnull=True,
ancestors_deleted_at__isnull=True,
)
.filter(pk__in=Subquery(qs.values("document_id")))
) )
return list({str(id) for id in qs.values_list("document_id", flat=True)}) return list({str(id) for id in docs.values_list("pk", flat=True)})
class BaseDocumentIndexer(ABC): class BaseDocumentIndexer(ABC):
@@ -159,22 +183,41 @@ class BaseDocumentIndexer(ABC):
Must be implemented by subclasses. Must be implemented by subclasses.
""" """
def search(self, text, user, token): # pylint: disable-next=too-many-arguments,too-many-positional-arguments
def search(self, text, token, visited=(), page=1, page_size=50):
""" """
Search for documents in Find app. Search for documents in Find app.
""" Ensure the same default ordering as "Docs" list : -updated_at
visited_ids = get_visited_document_ids_of(user)
Returns ids of the documents
Args:
text (str): Text search content.
token (str): OIDC Authentication token.
visited (list, optional):
List of ids of active public documents with LinkTrace
Defaults to settings.SEARCH_INDEXER_BATCH_SIZE.
page (int, optional):
The page number to retrieve.
Defaults to 1 if not specified.
page_size (int, optional):
The number of results to return per page.
Defaults to 50 if not specified.
"""
response = self.search_query( response = self.search_query(
data={ data={
"q": text, "q": text,
"visited": visited_ids, "visited": visited,
"services": ["docs"], "services": ["docs"],
"page_number": page,
"page_size": page_size,
"order_by": "updated_at",
"order_direction": "desc",
}, },
token=token, token=token,
) )
return self.format_response(response) return [d["_id"] for d in response]
@abstractmethod @abstractmethod
def search_query(self, data, token) -> dict: def search_query(self, data, token) -> dict:
@@ -184,14 +227,6 @@ class BaseDocumentIndexer(ABC):
Must be implemented by subclasses. Must be implemented by subclasses.
""" """
@abstractmethod
def format_response(self, data: dict):
"""
Convert the JSON response from Find app as document queryset.
Must be implemented by subclasses.
"""
class FindDocumentIndexer(BaseDocumentIndexer): class FindDocumentIndexer(BaseDocumentIndexer):
""" """
@@ -253,12 +288,6 @@ class FindDocumentIndexer(BaseDocumentIndexer):
logger.error("HTTPError: %s", e) logger.error("HTTPError: %s", e)
raise raise
def format_response(self, data: dict):
"""
Retrieve documents ids from Find app response and return a queryset.
"""
return models.Document.objects.filter(pk__in=[d["_id"] for d in data])
def push(self, data): def push(self, data):
""" """
Push a batch of documents to the Find backend. Push a batch of documents to the Find backend.
@@ -266,7 +295,6 @@ class FindDocumentIndexer(BaseDocumentIndexer):
Args: Args:
data (list): List of document dictionaries. data (list): List of document dictionaries.
""" """
try: try:
response = requests.post( response = requests.post(
self.indexer_url, self.indexer_url,

View File

@@ -2,10 +2,14 @@
Declare and configure the signals for the impress core application Declare and configure the signals for the impress core application
""" """
from functools import partial
from django.db import transaction
from django.db.models import signals from django.db.models import signals
from django.dispatch import receiver from django.dispatch import receiver
from . import models from . import models
from .services.search_indexers import default_document_indexer
from .tasks.find import trigger_document_indexer from .tasks.find import trigger_document_indexer
@@ -16,7 +20,8 @@ def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-ar
Note : Within the transaction we can have an empty content and a serialization Note : Within the transaction we can have an empty content and a serialization
error. error.
""" """
trigger_document_indexer(instance, on_commit=True) if default_document_indexer() is not None:
transaction.on_commit(partial(trigger_document_indexer, instance))
@receiver(signals.post_save, sender=models.DocumentAccess) @receiver(signals.post_save, sender=models.DocumentAccess)
@@ -24,5 +29,5 @@ def document_access_post_save(sender, instance, created, **kwargs): # pylint: d
""" """
Asynchronous call to the document indexer at the end of the transaction. Asynchronous call to the document indexer at the end of the transaction.
""" """
if not created: if not created and default_document_indexer() is not None:
trigger_document_indexer(instance.document, on_commit=True) transaction.on_commit(partial(trigger_document_indexer, instance.document))

View File

@@ -1,11 +1,9 @@
"""Trigger document indexation using celery task.""" """Trigger document indexation using celery task."""
from functools import partial
from logging import getLogger from logging import getLogger
from django.conf import settings from django.conf import settings
from django.core.cache import cache from django.core.cache import cache
from django.db import transaction
from impress.celery_app import app from impress.celery_app import app
@@ -37,7 +35,7 @@ def decr_counter(key):
@app.task @app.task
def document_indexer_task(document_id): def document_indexer_task(document_id):
"""Send indexation query for a document using celery task.""" """Celery Task : Sends indexation query for a document."""
key = document_indexer_debounce_key(document_id) key = document_indexer_debounce_key(document_id)
# check if the counter : if still up, skip the task. only the last one # check if the counter : if still up, skip the task. only the last one
@@ -46,6 +44,7 @@ def document_indexer_task(document_id):
logger.info("Skip document %s indexation", document_id) logger.info("Skip document %s indexation", document_id)
return return
# Prevents some circular imports
# pylint: disable=import-outside-toplevel # pylint: disable=import-outside-toplevel
from core import models # noqa: PLC0415 from core import models # noqa: PLC0415
from core.services.search_indexers import ( # noqa: PLC0415 from core.services.search_indexers import ( # noqa: PLC0415
@@ -63,35 +62,27 @@ def document_indexer_task(document_id):
indexer.push(data) indexer.push(data)
def trigger_document_indexer(document, on_commit=False): def trigger_document_indexer(document):
""" """
Trigger indexation task with debounce a delay set by the SEARCH_INDEXER_COUNTDOWN setting. Trigger indexation task with debounce a delay set by the SEARCH_INDEXER_COUNTDOWN setting.
Args: Args:
document (Document): The document instance. document (Document): The document instance.
on_commit (bool): Wait for the end of the transaction before starting the task
(some fields may be in wrong state within the transaction)
""" """
if document.deleted_at or document.ancestors_deleted_at: if document.deleted_at or document.ancestors_deleted_at:
pass return
if on_commit: key = document_indexer_debounce_key(document.pk)
transaction.on_commit( countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1)
partial(trigger_document_indexer, document, on_commit=False)
)
else:
key = document_indexer_debounce_key(document.pk)
countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1)
logger.info( logger.info(
"Add task for document %s indexation in %.2f seconds", "Add task for document %s indexation in %.2f seconds",
document.pk, document.pk,
countdown, countdown,
) )
# Each time this method is called during the countdown, we increment the # Each time this method is called during the countdown, we increment the
# counter and each task decrease it, so the index be run only once. # counter and each task decrease it, so the index be run only once.
incr_counter(key) incr_counter(key)
document_indexer_task.apply_async(args=[document.pk], countdown=countdown) document_indexer_task.apply_async(args=[document.pk], countdown=countdown)

View File

@@ -15,6 +15,7 @@ from core.services.search_indexers import FindDocumentIndexer
@pytest.mark.django_db @pytest.mark.django_db
@pytest.mark.usefixtures("indexer_settings")
def test_index(): def test_index():
"""Test the command `index` that run the Find app indexer for all the available documents.""" """Test the command `index` that run the Find app indexer for all the available documents."""
user = factories.UserFactory() user = factories.UserFactory()

View File

@@ -24,3 +24,32 @@ def mock_user_teams():
"core.models.User.teams", new_callable=mock.PropertyMock "core.models.User.teams", new_callable=mock.PropertyMock
) as mock_teams: ) as mock_teams:
yield mock_teams yield mock_teams
@pytest.fixture(name="indexer_settings")
def indexer_settings_fixture(settings):
"""
Setup valid settings for the document indexer. Clear the indexer cache.
"""
# pylint: disable-next=import-outside-toplevel
from core.services.search_indexers import ( # noqa: PLC0415
default_document_indexer,
get_document_indexer_class,
)
default_document_indexer.cache_clear()
get_document_indexer_class.cache_clear()
settings.SEARCH_INDEXER_CLASS = "core.services.search_indexers.FindDocumentIndexer"
settings.SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest"
settings.SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/"
settings.SEARCH_INDEXER_QUERY_URL = (
"http://localhost:8081/api/v1.0/documents/search/"
)
yield settings
# clear cache to prevent issues with other tests
default_document_indexer.cache_clear()
get_document_indexer_class.cache_clear()

View File

@@ -2,12 +2,15 @@
Tests for Documents API endpoint in impress's core app: list Tests for Documents API endpoint in impress's core app: list
""" """
from json import loads as json_loads
import pytest import pytest
import responses import responses
from faker import Faker from faker import Faker
from rest_framework.test import APIClient from rest_framework.test import APIClient
from core import factories, models from core import factories, models
from core.services.search_indexers import default_document_indexer
fake = Faker() fake = Faker()
pytestmark = pytest.mark.django_db pytestmark = pytest.mark.django_db
@@ -16,13 +19,12 @@ pytestmark = pytest.mark.django_db
@pytest.mark.parametrize("role", models.LinkRoleChoices.values) @pytest.mark.parametrize("role", models.LinkRoleChoices.values)
@pytest.mark.parametrize("reach", models.LinkReachChoices.values) @pytest.mark.parametrize("reach", models.LinkReachChoices.values)
@responses.activate @responses.activate
def test_api_documents_search_anonymous(reach, role, settings): def test_api_documents_search_anonymous(reach, role, indexer_settings):
""" """
Anonymous users should not be allowed to search documents whatever the Anonymous users should not be allowed to search documents whatever the
link reach and link role link reach and link role
""" """
factories.DocumentFactory(link_reach=reach, link_role=role) indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"
settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"
factories.DocumentFactory(link_reach=reach, link_role=role) factories.DocumentFactory(link_reach=reach, link_role=role)
@@ -45,46 +47,90 @@ def test_api_documents_search_anonymous(reach, role, settings):
} }
def test_api_documents_search_endpoint_is_none(settings): def test_api_documents_search_endpoint_is_none(indexer_settings):
"""Missing SEARCH_INDEXER_QUERY_URL should throw an error""" """
settings.SEARCH_INDEXER_QUERY_URL = None Missing SEARCH_INDEXER_QUERY_URL, so the indexer is not properly configured.
Should fallback on title filter
"""
indexer_settings.SEARCH_INDEXER_QUERY_URL = None
assert default_document_indexer() is None
user = factories.UserFactory() user = factories.UserFactory()
document = factories.DocumentFactory(title="alpha")
access = factories.UserDocumentAccessFactory(document=document, user=user)
client = APIClient() client = APIClient()
client.force_login(user) client.force_login(user)
response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"}) response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"})
assert response.status_code == 401 assert response.status_code == 200
assert response.json() == {"detail": "The service is not properly configured."} content = response.json()
results = content.pop("results")
assert content == {
"count": 1,
"next": None,
"previous": None,
}
assert len(results) == 1
assert results[0] == {
"id": str(document.id),
"abilities": document.get_abilities(user),
"ancestors_link_reach": None,
"ancestors_link_role": None,
"computed_link_reach": document.computed_link_reach,
"computed_link_role": document.computed_link_role,
"created_at": document.created_at.isoformat().replace("+00:00", "Z"),
"creator": str(document.creator.id),
"depth": 1,
"excerpt": document.excerpt,
"link_reach": document.link_reach,
"link_role": document.link_role,
"nb_accesses_ancestors": 1,
"nb_accesses_direct": 1,
"numchild": 0,
"path": document.path,
"title": document.title,
"updated_at": document.updated_at.isoformat().replace("+00:00", "Z"),
"user_role": access.role,
}
@responses.activate @responses.activate
def test_api_documents_search_invalid_params(settings): def test_api_documents_search_invalid_params(indexer_settings):
"""Validate the format of documents as returned by the search view.""" """Validate the format of documents as returned by the search view."""
settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"
user = factories.UserFactory() user = factories.UserFactory()
client = APIClient() client = APIClient()
client.force_login(user) client.force_login(user)
response = APIClient().get("/api/v1.0/documents/search/") response = client.get("/api/v1.0/documents/search/")
assert response.status_code == 400 assert response.status_code == 400
assert response.json() == {"q": ["This field is required."]} assert response.json() == {"q": ["This field is required."]}
response = APIClient().get("/api/v1.0/documents/search/", data={"q": " "}) response = client.get("/api/v1.0/documents/search/", data={"q": " "})
assert response.status_code == 400 assert response.status_code == 400
assert response.json() == {"q": ["This field may not be blank."]} assert response.json() == {"q": ["This field may not be blank."]}
response = client.get(
"/api/v1.0/documents/search/", data={"q": "any", "page": "NaN"}
)
assert response.status_code == 400
assert response.json() == {"page": ["A valid integer is required."]}
@responses.activate @responses.activate
def test_api_documents_search_format(settings): def test_api_documents_search_format(indexer_settings):
"""Validate the format of documents as returned by the search view.""" """Validate the format of documents as returned by the search view."""
settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"
assert default_document_indexer() is not None
user = factories.UserFactory() user = factories.UserFactory()
@@ -140,3 +186,49 @@ def test_api_documents_search_format(settings):
"updated_at": document.updated_at.isoformat().replace("+00:00", "Z"), "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"),
"user_role": access.role, "user_role": access.role,
} }
@responses.activate
def test_api_documents_search_pagination(indexer_settings):
"""Documents should be ordered by descending "updated_at" by default"""
indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"
assert default_document_indexer() is not None
user = factories.UserFactory()
client = APIClient()
client.force_login(user)
docs = factories.DocumentFactory.create_batch(10)
# Find response
# pylint: disable-next=assignment-from-none
api_search = responses.add(
responses.POST,
"http://find/api/v1.0/search",
json=[{"_id": str(doc.pk)} for doc in docs],
status=200,
)
response = client.get(
"/api/v1.0/documents/search/", data={"q": "alpha", "page": 2, "page_size": 5}
)
assert response.status_code == 200
content = response.json()
results = content.pop("results")
assert len(results) == 5
# Check the query parameters.
assert api_search.call_count == 1
assert api_search.calls[0].response.status_code == 200
assert json_loads(api_search.calls[0].request.body) == {
"q": "alpha",
"visited": [],
"services": ["docs"],
"page_number": 2,
"page_size": 5,
"order_by": "updated_at",
"order_direction": "desc",
}

View File

@@ -1626,9 +1626,9 @@ def test_models_documents_compute_ancestors_links_paths_mapping_structure(
@mock.patch.object(FindDocumentIndexer, "push") @mock.patch.object(FindDocumentIndexer, "push")
@pytest.mark.django_db(transaction=True) @pytest.mark.django_db(transaction=True)
def test_models_documents_post_save_indexer(mock_push, settings): def test_models_documents_post_save_indexer(mock_push, indexer_settings):
"""Test indexation task on document creation""" """Test indexation task on document creation"""
settings.SEARCH_INDEXER_COUNTDOWN = 0 indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
user = factories.UserFactory() user = factories.UserFactory()
@@ -1666,10 +1666,127 @@ def test_models_documents_post_save_indexer(mock_push, settings):
assert cache.get(document_indexer_debounce_key(doc3.pk)) == 0 assert cache.get(document_indexer_debounce_key(doc3.pk)) == 0
@mock.patch.object(FindDocumentIndexer, "push")
@pytest.mark.django_db(transaction=True) @pytest.mark.django_db(transaction=True)
def test_models_documents_post_save_indexer_debounce(settings): def test_models_documents_post_save_indexer_deleted(mock_push, indexer_settings):
"""Skip indexation task on deleted or ancestor_deleted documents"""
indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
user = factories.UserFactory()
with transaction.atomic():
doc = factories.DocumentFactory()
doc_deleted = factories.DocumentFactory()
doc_ancestor_deleted = factories.DocumentFactory(parent=doc_deleted)
doc_deleted.soft_delete()
doc_ancestor_deleted.ancestors_deleted_at = doc_deleted.deleted_at
factories.UserDocumentAccessFactory(document=doc, user=user)
factories.UserDocumentAccessFactory(document=doc_deleted, user=user)
factories.UserDocumentAccessFactory(document=doc_ancestor_deleted, user=user)
doc_deleted.refresh_from_db()
doc_ancestor_deleted.refresh_from_db()
assert doc_deleted.deleted_at is not None
assert doc_deleted.ancestors_deleted_at is not None
assert doc_ancestor_deleted.deleted_at is None
assert doc_ancestor_deleted.ancestors_deleted_at is not None
time.sleep(0.2) # waits for the end of the tasks
accesses = {
str(doc.path): {"users": [user.sub]},
str(doc_deleted.path): {"users": [user.sub]},
str(doc_ancestor_deleted.path): {"users": [user.sub]},
}
data = [call.args[0] for call in mock_push.call_args_list]
indexer = FindDocumentIndexer()
# Only the not deleted document is indexed
assert data == [
indexer.serialize_document(doc, accesses),
]
# The debounce counters should be reset
assert cache.get(document_indexer_debounce_key(doc.pk)) == 0
# These caches are not filled
assert cache.get(document_indexer_debounce_key(doc_deleted.pk)) is None
assert cache.get(document_indexer_debounce_key(doc_ancestor_deleted.pk)) is None
@mock.patch.object(FindDocumentIndexer, "push")
@pytest.mark.django_db(transaction=True)
def test_models_documents_post_save_indexer_restored(mock_push, indexer_settings):
"""Restart indexation task on restored documents"""
indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
user = factories.UserFactory()
with transaction.atomic():
doc = factories.DocumentFactory()
doc_deleted = factories.DocumentFactory()
doc_ancestor_deleted = factories.DocumentFactory(parent=doc_deleted)
doc_deleted.soft_delete()
doc_ancestor_deleted.ancestors_deleted_at = doc_deleted.deleted_at
factories.UserDocumentAccessFactory(document=doc, user=user)
factories.UserDocumentAccessFactory(document=doc_deleted, user=user)
factories.UserDocumentAccessFactory(document=doc_ancestor_deleted, user=user)
doc_deleted.refresh_from_db()
doc_ancestor_deleted.refresh_from_db()
assert doc_deleted.deleted_at is not None
assert doc_deleted.ancestors_deleted_at is not None
assert doc_ancestor_deleted.deleted_at is None
assert doc_ancestor_deleted.ancestors_deleted_at is not None
time.sleep(0.2) # waits for the end of the tasks
doc_deleted.restore()
doc_deleted.refresh_from_db()
doc_ancestor_deleted.refresh_from_db()
assert doc_deleted.deleted_at is None
assert doc_deleted.ancestors_deleted_at is None
assert doc_ancestor_deleted.deleted_at is None
assert doc_ancestor_deleted.ancestors_deleted_at is None
time.sleep(0.2)
accesses = {
str(doc.path): {"users": [user.sub]},
str(doc_deleted.path): {"users": [user.sub]},
str(doc_ancestor_deleted.path): {"users": [user.sub]},
}
data = [call.args[0] for call in mock_push.call_args_list]
indexer = FindDocumentIndexer()
# All docs are re-indexed
assert sorted(data, key=itemgetter("id")) == sorted(
[
indexer.serialize_document(doc, accesses),
indexer.serialize_document(doc_deleted, accesses),
# The restored document child is not saved so no indexation.
],
key=itemgetter("id"),
)
@pytest.mark.django_db(transaction=True)
def test_models_documents_post_save_indexer_debounce(indexer_settings):
"""Test indexation task skipping on document update""" """Test indexation task skipping on document update"""
settings.SEARCH_INDEXER_COUNTDOWN = 0 indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
indexer = FindDocumentIndexer() indexer = FindDocumentIndexer()
user = factories.UserFactory() user = factories.UserFactory()
@@ -1714,9 +1831,9 @@ def test_models_documents_post_save_indexer_debounce(settings):
@pytest.mark.django_db(transaction=True) @pytest.mark.django_db(transaction=True)
def test_models_documents_access_post_save_indexer(settings): def test_models_documents_access_post_save_indexer(indexer_settings):
"""Test indexation task on DocumentAccess update""" """Test indexation task on DocumentAccess update"""
settings.SEARCH_INDEXER_COUNTDOWN = 0 indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
indexer = FindDocumentIndexer() indexer = FindDocumentIndexer()
user = factories.UserFactory() user = factories.UserFactory()

View File

@@ -1,6 +1,7 @@
"""Tests for Documents search indexers""" """Tests for Documents search indexers"""
from functools import partial from functools import partial
from json import dumps as json_dumps
from unittest.mock import patch from unittest.mock import patch
from django.contrib.auth.models import AnonymousUser from django.contrib.auth.models import AnonymousUser
@@ -8,11 +9,14 @@ from django.core.exceptions import ImproperlyConfigured
from django.utils.module_loading import import_string from django.utils.module_loading import import_string
import pytest import pytest
import responses
from requests import HTTPError
from core import factories, models, utils from core import factories, models, utils
from core.services.search_indexers import ( from core.services.search_indexers import (
BaseDocumentIndexer, BaseDocumentIndexer,
FindDocumentIndexer, FindDocumentIndexer,
default_document_indexer,
get_document_indexer_class, get_document_indexer_class,
get_visited_document_ids_of, get_visited_document_ids_of,
) )
@@ -32,39 +36,19 @@ class FakeDocumentIndexer(BaseDocumentIndexer):
def search_query(self, data, token): def search_query(self, data, token):
return {} return {}
def format_response(self, data: dict):
return {}
def test_services_search_indexer_class_is_empty(indexer_settings):
@pytest.fixture(name="fake_indexer_settings")
def fake_indexer_settings_fixture(settings):
"""Fixture to switch the indexer to the FakeDocumentIndexer."""
_original_backend = str(settings.SEARCH_INDEXER_CLASS)
settings.SEARCH_INDEXER_CLASS = (
"core.tests.test_services_search_indexers.FakeDocumentIndexer"
)
get_document_indexer_class.cache_clear()
yield settings
settings.SEARCH_INDEXER_CLASS = _original_backend
# clear cache to prevent issues with other tests
get_document_indexer_class.cache_clear()
def test_services_search_indexer_class_is_empty(fake_indexer_settings):
""" """
Should raise ImproperlyConfigured if SEARCH_INDEXER_CLASS is None or empty. Should raise ImproperlyConfigured if SEARCH_INDEXER_CLASS is None or empty.
""" """
fake_indexer_settings.SEARCH_INDEXER_CLASS = None indexer_settings.SEARCH_INDEXER_CLASS = None
with pytest.raises(ImproperlyConfigured) as exc_info: with pytest.raises(ImproperlyConfigured) as exc_info:
get_document_indexer_class() get_document_indexer_class()
assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value) assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value)
fake_indexer_settings.SEARCH_INDEXER_CLASS = "" indexer_settings.SEARCH_INDEXER_CLASS = ""
# clear cache again # clear cache again
get_document_indexer_class.cache_clear() get_document_indexer_class.cache_clear()
@@ -75,11 +59,11 @@ def test_services_search_indexer_class_is_empty(fake_indexer_settings):
assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value) assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value)
def test_services_search_indexer_class_invalid(fake_indexer_settings): def test_services_search_indexer_class_invalid(indexer_settings):
""" """
Should raise RuntimeError if SEARCH_INDEXER_CLASS cannot be imported. Should raise RuntimeError if SEARCH_INDEXER_CLASS cannot be imported.
""" """
fake_indexer_settings.SEARCH_INDEXER_CLASS = "unknown.Unknown" indexer_settings.SEARCH_INDEXER_CLASS = "unknown.Unknown"
with pytest.raises(ImproperlyConfigured) as exc_info: with pytest.raises(ImproperlyConfigured) as exc_info:
get_document_indexer_class() get_document_indexer_class()
@@ -90,11 +74,11 @@ def test_services_search_indexer_class_invalid(fake_indexer_settings):
) )
def test_services_search_indexer_class(fake_indexer_settings): def test_services_search_indexer_class(indexer_settings):
""" """
Import indexer class defined in setting SEARCH_INDEXER_CLASS. Import indexer class defined in setting SEARCH_INDEXER_CLASS.
""" """
fake_indexer_settings.SEARCH_INDEXER_CLASS = ( indexer_settings.SEARCH_INDEXER_CLASS = (
"core.tests.test_services_search_indexers.FakeDocumentIndexer" "core.tests.test_services_search_indexers.FakeDocumentIndexer"
) )
@@ -103,11 +87,43 @@ def test_services_search_indexer_class(fake_indexer_settings):
) )
def test_services_search_indexer_is_configured(indexer_settings):
    """
    Should return true only when the indexer class and other configuration settings
    are valid.
    """
    # A missing or blank class setting disables the default indexer entirely.
    for bad_class in (None, ""):
        indexer_settings.SEARCH_INDEXER_CLASS = bad_class
        default_document_indexer.cache_clear()
        assert not default_document_indexer()

    # A resolvable class combined with valid settings yields an indexer instance.
    indexer_settings.SEARCH_INDEXER_CLASS = (
        "core.services.search_indexers.FindDocumentIndexer"
    )
    default_document_indexer.cache_clear()
    assert default_document_indexer() is not None

    # Blanking the indexer URL invalidates the configuration again.
    indexer_settings.SEARCH_INDEXER_URL = ""
    default_document_indexer.cache_clear()
    assert not default_document_indexer()
def test_services_search_indexer_url_is_none(indexer_settings):
""" """
Indexer should raise RuntimeError if SEARCH_INDEXER_URL is None or empty. Indexer should raise RuntimeError if SEARCH_INDEXER_URL is None or empty.
""" """
settings.SEARCH_INDEXER_URL = None indexer_settings.SEARCH_INDEXER_URL = None
with pytest.raises(ImproperlyConfigured) as exc_info: with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer() FindDocumentIndexer()
@@ -115,11 +131,11 @@ def test_services_search_indexer_url_is_none(settings):
assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value) assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)
def test_services_search_indexer_url_is_empty(settings): def test_services_search_indexer_url_is_empty(indexer_settings):
""" """
Indexer should raise RuntimeError if SEARCH_INDEXER_URL is empty string. Indexer should raise RuntimeError if SEARCH_INDEXER_URL is empty string.
""" """
settings.SEARCH_INDEXER_URL = "" indexer_settings.SEARCH_INDEXER_URL = ""
with pytest.raises(ImproperlyConfigured) as exc_info: with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer() FindDocumentIndexer()
@@ -127,11 +143,11 @@ def test_services_search_indexer_url_is_empty(settings):
assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value) assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)
def test_services_search_indexer_secret_is_none(settings): def test_services_search_indexer_secret_is_none(indexer_settings):
""" """
Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is None or empty. Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is None.
""" """
settings.SEARCH_INDEXER_SECRET = None indexer_settings.SEARCH_INDEXER_SECRET = None
with pytest.raises(ImproperlyConfigured) as exc_info: with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer() FindDocumentIndexer()
@@ -141,11 +157,11 @@ def test_services_search_indexer_secret_is_none(settings):
) )
def test_services_search_indexer_secret_is_empty(settings): def test_services_search_indexer_secret_is_empty(indexer_settings):
""" """
Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is empty string. Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is empty string.
""" """
settings.SEARCH_INDEXER_SECRET = "" indexer_settings.SEARCH_INDEXER_SECRET = ""
with pytest.raises(ImproperlyConfigured) as exc_info: with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer() FindDocumentIndexer()
@@ -155,6 +171,35 @@ def test_services_search_indexer_secret_is_empty(settings):
) )
def test_services_search_endpoint_is_none(indexer_settings):
    """
    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_QUERY_URL is None.
    """
    indexer_settings.SEARCH_INDEXER_QUERY_URL = None
    with pytest.raises(ImproperlyConfigured) as exc_info:
        FindDocumentIndexer()

    assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str(
        exc_info.value
    )
def test_services_search_endpoint_is_empty(indexer_settings):
    """
    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_QUERY_URL is empty.
    """
    indexer_settings.SEARCH_INDEXER_QUERY_URL = ""
    with pytest.raises(ImproperlyConfigured) as exc_info:
        FindDocumentIndexer()

    assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str(
        exc_info.value
    )
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_serialize_document_returns_expected_json(): def test_services_search_indexers_serialize_document_returns_expected_json():
""" """
It should serialize documents with correct metadata and access control. It should serialize documents with correct metadata and access control.
@@ -195,6 +240,7 @@ def test_services_search_indexers_serialize_document_returns_expected_json():
} }
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_serialize_document_deleted(): def test_services_search_indexers_serialize_document_deleted():
"""Deleted documents are marked as just in the serialized json.""" """Deleted documents are marked as just in the serialized json."""
parent = factories.DocumentFactory() parent = factories.DocumentFactory()
@@ -209,6 +255,7 @@ def test_services_search_indexers_serialize_document_deleted():
assert result["is_active"] is False assert result["is_active"] is False
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_serialize_document_empty(): def test_services_search_indexers_serialize_document_empty():
"""Empty documents returns empty content in the serialized json.""" """Empty documents returns empty content in the serialized json."""
document = factories.DocumentFactory(content="", title=None) document = factories.DocumentFactory(content="", title=None)
@@ -220,13 +267,35 @@ def test_services_search_indexers_serialize_document_empty():
assert result["title"] == "" assert result["title"] == ""
@responses.activate
def test_services_search_indexers_index_errors(indexer_settings):
    """
    Documents indexing response handling on Find API HTTP errors.
    """
    index_url = "http://app-find/api/v1.0/documents/index/"
    factories.DocumentFactory()
    indexer_settings.SEARCH_INDEXER_URL = index_url

    # Simulate the Find API rejecting the push with an authentication failure.
    responses.add(
        responses.POST,
        index_url,
        body=json_dumps({"message": "Authentication failed."}),
        status=401,
    )

    with pytest.raises(HTTPError):
        FindDocumentIndexer().index()
@patch.object(FindDocumentIndexer, "push") @patch.object(FindDocumentIndexer, "push")
def test_services_search_indexers_batches_pass_only_batch_accesses(mock_push, settings): def test_services_search_indexers_batches_pass_only_batch_accesses(
mock_push, indexer_settings
):
""" """
Documents indexing should be processed in batches, Documents indexing should be processed in batches,
and only the access data relevant to each batch should be used. and only the access data relevant to each batch should be used.
""" """
settings.SEARCH_INDEXER_BATCH_SIZE = 2 indexer_settings.SEARCH_INDEXER_BATCH_SIZE = 2
documents = factories.DocumentFactory.create_batch(5) documents = factories.DocumentFactory.create_batch(5)
# Attach a single user access to each document # Attach a single user access to each document
@@ -259,6 +328,7 @@ def test_services_search_indexers_batches_pass_only_batch_accesses(mock_push, se
@patch.object(FindDocumentIndexer, "push") @patch.object(FindDocumentIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_ancestors_link_reach(mock_push): def test_services_search_indexers_ancestors_link_reach(mock_push):
"""Document accesses and reach should take into account ancestors link reaches.""" """Document accesses and reach should take into account ancestors link reaches."""
great_grand_parent = factories.DocumentFactory(link_reach="restricted") great_grand_parent = factories.DocumentFactory(link_reach="restricted")
@@ -279,6 +349,7 @@ def test_services_search_indexers_ancestors_link_reach(mock_push):
@patch.object(FindDocumentIndexer, "push") @patch.object(FindDocumentIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_ancestors_users(mock_push): def test_services_search_indexers_ancestors_users(mock_push):
"""Document accesses and reach should include users from ancestors.""" """Document accesses and reach should include users from ancestors."""
user_gp, user_p, user_d = factories.UserFactory.create_batch(3) user_gp, user_p, user_d = factories.UserFactory.create_batch(3)
@@ -301,6 +372,7 @@ def test_services_search_indexers_ancestors_users(mock_push):
@patch.object(FindDocumentIndexer, "push") @patch.object(FindDocumentIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_ancestors_teams(mock_push): def test_services_search_indexers_ancestors_teams(mock_push):
"""Document accesses and reach should include teams from ancestors.""" """Document accesses and reach should include teams from ancestors."""
grand_parent = factories.DocumentFactory(teams=["team_gp"]) grand_parent = factories.DocumentFactory(teams=["team_gp"])
@@ -317,12 +389,12 @@ def test_services_search_indexers_ancestors_teams(mock_push):
@patch("requests.post") @patch("requests.post")
def test_push_uses_correct_url_and_data(mock_post, settings): def test_push_uses_correct_url_and_data(mock_post, indexer_settings):
""" """
push() should call requests.post with the correct URL from settings push() should call requests.post with the correct URL from settings
the timeout set to 10 seconds and the data as JSON. the timeout set to 10 seconds and the data as JSON.
""" """
settings.SEARCH_INDEXER_URL = "http://example.com/index" indexer_settings.SEARCH_INDEXER_URL = "http://example.com/index"
indexer = FindDocumentIndexer() indexer = FindDocumentIndexer()
sample_data = [{"id": "123", "title": "Test"}] sample_data = [{"id": "123", "title": "Test"}]
@@ -335,7 +407,7 @@ def test_push_uses_correct_url_and_data(mock_post, settings):
mock_post.assert_called_once() mock_post.assert_called_once()
args, kwargs = mock_post.call_args args, kwargs = mock_post.call_args
assert args[0] == settings.SEARCH_INDEXER_URL assert args[0] == indexer_settings.SEARCH_INDEXER_URL
assert kwargs.get("json") == sample_data assert kwargs.get("json") == sample_data
assert kwargs.get("timeout") == 10 assert kwargs.get("timeout") == 10
@@ -348,9 +420,10 @@ def test_get_visited_document_ids_of():
user = factories.UserFactory() user = factories.UserFactory()
other = factories.UserFactory() other = factories.UserFactory()
anonymous = AnonymousUser() anonymous = AnonymousUser()
queryset = models.Document.objects.all()
assert not get_visited_document_ids_of(anonymous) assert not get_visited_document_ids_of(queryset, anonymous)
assert not get_visited_document_ids_of(user) assert not get_visited_document_ids_of(queryset, user)
doc1, doc2, _ = factories.DocumentFactory.create_batch(3) doc1, doc2, _ = factories.DocumentFactory.create_batch(3)
@@ -360,7 +433,7 @@ def test_get_visited_document_ids_of():
create_link(document=doc2) create_link(document=doc2)
# The third document is not visited # The third document is not visited
assert sorted(get_visited_document_ids_of(user)) == sorted( assert sorted(get_visited_document_ids_of(queryset, user)) == sorted(
[str(doc1.pk), str(doc2.pk)] [str(doc1.pk), str(doc2.pk)]
) )
@@ -368,11 +441,67 @@ def test_get_visited_document_ids_of():
factories.UserDocumentAccessFactory(user=user, document=doc2) factories.UserDocumentAccessFactory(user=user, document=doc2)
# The second document have an access for the user # The second document have an access for the user
assert get_visited_document_ids_of(user) == [str(doc1.pk)] assert get_visited_document_ids_of(queryset, user) == [str(doc1.pk)]
@pytest.mark.usefixtures("indexer_settings")
def test_get_visited_document_ids_of_deleted():
    """
    get_visited_document_ids_of() returns the ids of the documents viewed
    by the user if they are not deleted.
    """
    user = factories.UserFactory()
    queryset = models.Document.objects.all()

    # No link traces exist yet: nothing is visited, anonymous or not.
    assert not get_visited_document_ids_of(queryset, AnonymousUser())
    assert not get_visited_document_ids_of(queryset, user)

    doc = factories.DocumentFactory()
    doc_deleted = factories.DocumentFactory()
    doc_ancestor_deleted = factories.DocumentFactory(parent=doc_deleted)

    for visited in (doc, doc_deleted, doc_ancestor_deleted):
        models.LinkTrace.objects.create(user=user, is_masked=False, document=visited)

    # All three documents are visited while none is deleted.
    expected = sorted(str(d.pk) for d in (doc, doc_deleted, doc_ancestor_deleted))
    assert sorted(get_visited_document_ids_of(queryset, user)) == expected

    doc_deleted.soft_delete()

    # Soft-deleting the parent excludes both it and its descendant.
    assert get_visited_document_ids_of(queryset, user) == [str(doc.pk)]
@responses.activate
def test_services_search_indexers_search_errors(indexer_settings):
    """
    Documents search response handling on Find API HTTP errors.
    """
    factories.DocumentFactory()
    indexer_settings.SEARCH_INDEXER_QUERY_URL = (
        "http://app-find/api/v1.0/documents/search/"
    )
    # Simulate the Find API rejecting the query with an authentication failure.
    responses.add(
        responses.POST,
        "http://app-find/api/v1.0/documents/search/",
        status=401,
        body=json_dumps({"message": "Authentication failed."}),
    )
    with pytest.raises(HTTPError):
        FindDocumentIndexer().search("alpha", token="mytoken")
@patch("requests.post") @patch("requests.post")
def test_services_search_indexers_search(mock_post, settings): def test_services_search_indexers_search(mock_post, indexer_settings):
""" """
search() should call requests.post to SEARCH_INDEXER_QUERY_URL with the search() should call requests.post to SEARCH_INDEXER_QUERY_URL with the
document ids from linktraces. document ids from linktraces.
@@ -390,30 +519,22 @@ def test_services_search_indexers_search(mock_post, settings):
create_link(document=doc1) create_link(document=doc1)
create_link(document=doc2) create_link(document=doc2)
indexer.search("alpha", user=user, token="mytoken") visited = get_visited_document_ids_of(models.Document.objects.all(), user)
indexer.search("alpha", visited=visited, token="mytoken")
args, kwargs = mock_post.call_args args, kwargs = mock_post.call_args
assert args[0] == settings.SEARCH_INDEXER_QUERY_URL assert args[0] == indexer_settings.SEARCH_INDEXER_QUERY_URL
query_data = kwargs.get("json") query_data = kwargs.get("json")
assert query_data["q"] == "alpha" assert query_data["q"] == "alpha"
assert sorted(query_data["visited"]) == sorted([str(doc1.pk), str(doc2.pk)]) assert sorted(query_data["visited"]) == sorted([str(doc1.pk), str(doc2.pk)])
assert query_data["services"] == ["docs"] assert query_data["services"] == ["docs"]
assert query_data["page_number"] == 1
assert query_data["page_size"] == 50
assert query_data["order_by"] == "updated_at"
assert query_data["order_direction"] == "desc"
assert kwargs.get("headers") == {"Authorization": "Bearer mytoken"} assert kwargs.get("headers") == {"Authorization": "Bearer mytoken"}
assert kwargs.get("timeout") == 10 assert kwargs.get("timeout") == 10
def test_search_query_raises_error_if_search_endpoint_is_none(settings):
"""
Indexer should raise RuntimeError if SEARCH_INDEXER_QUERY_URL is None or empty.
"""
settings.SEARCH_INDEXER_QUERY_URL = None
with pytest.raises(ImproperlyConfigured) as exc_info:
FindDocumentIndexer()
assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str(
exc_info.value
)

View File

@@ -101,7 +101,7 @@ class Base(Configuration):
# Search # Search
SEARCH_INDEXER_CLASS = values.Value( SEARCH_INDEXER_CLASS = values.Value(
default="core.services.search_indexers.FindDocumentIndexer", default=None,
environ_name="SEARCH_INDEXER_CLASS", environ_name="SEARCH_INDEXER_CLASS",
environ_prefix=None, environ_prefix=None,
) )
@@ -982,11 +982,6 @@ class Test(Base):
# Tests are raising warnings because the /data/static directory does not exist # Tests are raising warnings because the /data/static directory does not exist
STATIC_ROOT = None STATIC_ROOT = None
# Setup indexer configuration to make test working on the CI.
SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest" # noqa
SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/"
SEARCH_INDEXER_QUERY_URL = "http://localhost:8081/api/v1.0/documents/search/"
CELERY_TASK_ALWAYS_EAGER = values.BooleanValue(True) CELERY_TASK_ALWAYS_EAGER = values.BooleanValue(True)
def __init__(self): def __init__(self):