(backend) add document search view

New API view that calls the indexed documents search view
(resource server) of app "Find".

Signed-off-by: Fabre Florian <ffabre@hybird.org>
This commit is contained in:
Fabre Florian
2025-08-13 06:50:58 +02:00
committed by Quentin BEY
parent 3228f65092
commit d721b97f68
10 changed files with 312 additions and 34 deletions

View File

@@ -49,6 +49,11 @@ LOGOUT_REDIRECT_URL=http://localhost:3000
OIDC_REDIRECT_ALLOWED_HOSTS=["http://localhost:8083", "http://localhost:3000"] OIDC_REDIRECT_ALLOWED_HOSTS=["http://localhost:8083", "http://localhost:3000"]
OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"} OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"}
# Store OIDC tokens in the session
OIDC_STORE_ACCESS_TOKEN = True # Store the access token in the session
OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session
OIDC_STORE_REFRESH_TOKEN_KEY = AnExampleKeyForDevPurposeOnly
# AI # AI
AI_FEATURE_ENABLED=true AI_FEATURE_ENABLED=true
AI_BASE_URL=https://openaiendpoint.com AI_BASE_URL=https://openaiendpoint.com

View File

@@ -1013,3 +1013,17 @@ class ThreadSerializer(serializers.ModelSerializer):
if request: if request:
return thread.get_abilities(request.user) return thread.get_abilities(request.user)
return {} return {}
class FindDocumentSerializer(serializers.Serializer):
    """Serializer for Find search requests.

    Validates the query parameters sent to the document search view.

    Fields:
        q: the text to search for. Required, and must contain at least one
            non-whitespace character.
    """

    q = serializers.CharField(required=True)

    def validate_q(self, value):
        """Ensure the text field is not empty.

        Args:
            value: the search text received for `q`.

        Returns:
            The value, unchanged.

        Raises:
            serializers.ValidationError: if the value is empty or contains
                only whitespace.
        """
        # NOTE(review): CharField presumably already rejects blank input with its
        # default options; this check is kept as an explicit guard.
        if not value.strip():
            raise serializers.ValidationError("Text field cannot be empty.")

        return value

View File

@@ -23,6 +23,7 @@ from django.db.models.functions import Greatest, Left, Length
from django.http import Http404, StreamingHttpResponse from django.http import Http404, StreamingHttpResponse
from django.urls import reverse from django.urls import reverse
from django.utils import timezone from django.utils import timezone
from django.utils.decorators import method_decorator
from django.utils.functional import cached_property from django.utils.functional import cached_property
from django.utils.text import capfirst, slugify from django.utils.text import capfirst, slugify
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
@@ -33,6 +34,7 @@ from botocore.exceptions import ClientError
from csp.constants import NONE from csp.constants import NONE
from csp.decorators import csp_update from csp.decorators import csp_update
from lasuite.malware_detection import malware_detection from lasuite.malware_detection import malware_detection
from lasuite.oidc_login.decorators import refresh_oidc_access_token
from rest_framework import filters, status, viewsets from rest_framework import filters, status, viewsets
from rest_framework import response as drf_response from rest_framework import response as drf_response
from rest_framework.permissions import AllowAny from rest_framework.permissions import AllowAny
@@ -50,6 +52,7 @@ from core.services.converter_services import (
from core.services.converter_services import ( from core.services.converter_services import (
YdocConverter, YdocConverter,
) )
from core.services.search_indexers import FindDocumentIndexer
from core.tasks.mail import send_ask_for_access_mail from core.tasks.mail import send_ask_for_access_mail
from core.utils import extract_attachments, filter_descendants from core.utils import extract_attachments, filter_descendants
@@ -387,6 +390,7 @@ class DocumentViewSet(
list_serializer_class = serializers.ListDocumentSerializer list_serializer_class = serializers.ListDocumentSerializer
trashbin_serializer_class = serializers.ListDocumentSerializer trashbin_serializer_class = serializers.ListDocumentSerializer
tree_serializer_class = serializers.ListDocumentSerializer tree_serializer_class = serializers.ListDocumentSerializer
search_serializer_class = serializers.ListDocumentSerializer
def get_queryset(self): def get_queryset(self):
"""Get queryset performing all annotation and filtering on the document tree structure.""" """Get queryset performing all annotation and filtering on the document tree structure."""
@@ -1078,10 +1082,37 @@ class DocumentViewSet(
{"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED {"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED
) )
@drf.decorators.action(detail=False, methods=["get"], url_path="search")
@method_decorator(refresh_oidc_access_token)
def search(self, request, *args, **kwargs):
    """
    Returns a DRF response containing the filtered, annotated and ordered document list.
    The filtering allows full text search through the opensearch indexation app "find".

    The search text is read from the `q` query parameter and validated with
    `FindDocumentSerializer`; invalid parameters raise a validation error.
    The OIDC access token stored in the session is forwarded to the indexer
    together with the requesting user.
    """
    # `.get()` yields None when the session holds no access token.
    access_token = request.session.get("oidc_access_token")

    serializer = serializers.FindDocumentSerializer(data=request.query_params)
    serializer.is_valid(raise_exception=True)

    try:
        indexer = FindDocumentIndexer()
        queryset = indexer.search(
            text=serializer.validated_data.get("q", ""),
            user=request.user,
            token=access_token,
        )
    except RuntimeError:
        # NOTE(review): a RuntimeError here signals a configuration problem
        # (e.g. SEARCH_INDEXER_QUERY_URL not set), yet it is reported as
        # 401 Unauthorized. The tests pin this status; confirm whether a 5xx
        # status would be more appropriate before changing it.
        return drf.response.Response(
            {"detail": "The service is not configured properly."},
            status=status.HTTP_401_UNAUTHORIZED,
        )

    # Reuse the viewset's common queryset rendering for the matching documents.
    return self.get_response_for_queryset(
        queryset,
        context={
            "request": request,
        },
    )
@drf.decorators.action(detail=True, methods=["get"], url_path="versions") @drf.decorators.action(detail=True, methods=["get"], url_path="versions")
def versions_list(self, request, *args, **kwargs): def versions_list(self, request, *args, **kwargs):

View File

@@ -41,8 +41,8 @@ from .choices import (
RoleChoices, RoleChoices,
get_equivalent_link_definition, get_equivalent_link_definition,
) )
from .validators import sub_validator
from .tasks.find import trigger_document_indexer from .tasks.find import trigger_document_indexer
from .validators import sub_validator
logger = getLogger(__name__) logger = getLogger(__name__)

View File

@@ -48,19 +48,19 @@ def get_batch_accesses_by_users_and_teams(paths):
def get_visited_document_ids_of(user):
    """
    Returns the ids of the documents that have a linktrace to the user and on
    which the user has NO explicit access.
    It will be used to limit the opensearch responses to the public documents
    already "visited" by the user.

    Args:
        user: the requesting user. An AnonymousUser has no visited documents.

    Returns:
        A list of unique document ids as strings, in no particular order.
    """
    if isinstance(user, AnonymousUser):
        return []

    qs = models.LinkTrace.objects.filter(user=user).exclude(
        document__accesses__user=user,
    )

    # The set drops any duplicated id before the list is built.
    return list(
        {
            str(document_id)
            for document_id in qs.values_list("document_id", flat=True)
        }
    )
class BaseDocumentIndexer(ABC): class BaseDocumentIndexer(ABC):
@@ -129,13 +129,14 @@ class BaseDocumentIndexer(ABC):
""" """
visited_ids = get_visited_document_ids_of(user) visited_ids = get_visited_document_ids_of(user)
response = self.search_query(data={ response = self.search_query(
"q": text, data={
"visited": visited_ids, "q": text,
"services": ["docs"], "visited": visited_ids,
}, token=token) "services": ["docs"],
},
print(response) token=token,
)
return self.format_response(response) return self.format_response(response)
@@ -207,7 +208,7 @@ class FindDocumentIndexer(BaseDocumentIndexer):
if not url: if not url:
raise RuntimeError( raise RuntimeError(
"SEARCH_INDEXER_QUERY_URL must be set in Django settings before indexing." "SEARCH_INDEXER_QUERY_URL must be set in Django settings before search."
) )
try: try:
@@ -228,9 +229,7 @@ class FindDocumentIndexer(BaseDocumentIndexer):
""" """
Retrieve documents ids from Find app response and return a queryset. Retrieve documents ids from Find app response and return a queryset.
""" """
return models.Document.objects.filter(pk__in=[ return models.Document.objects.filter(pk__in=[d["_id"] for d in data])
d['_id'] for d in data
])
def push(self, data): def push(self, data):
""" """

View File

@@ -86,7 +86,8 @@ def trigger_document_indexer(document, on_commit=False):
logger.info( logger.info(
"Add task for document %s indexation in %.2f seconds", "Add task for document %s indexation in %.2f seconds",
document.pk, countdown document.pk,
countdown,
) )
# Each time this method is called during the countdown, we increment the # Each time this method is called during the countdown, we increment the

View File

@@ -21,7 +21,7 @@ def test_index():
with transaction.atomic(): with transaction.atomic():
doc = factories.DocumentFactory() doc = factories.DocumentFactory()
empty_doc = factories.DocumentFactory(title=None, content='') empty_doc = factories.DocumentFactory(title=None, content="")
no_title_doc = factories.DocumentFactory(title=None) no_title_doc = factories.DocumentFactory(title=None)
factories.UserDocumentAccessFactory(document=doc, user=user) factories.UserDocumentAccessFactory(document=doc, user=user)
@@ -43,7 +43,10 @@ def test_index():
push_call_args = [call.args[0] for call in mock_push.call_args_list] push_call_args = [call.args[0] for call in mock_push.call_args_list]
assert len(push_call_args) == 1 # called once but with a batch of docs assert len(push_call_args) == 1 # called once but with a batch of docs
assert sorted(push_call_args[0], key=sortkey) == sorted([ assert sorted(push_call_args[0], key=sortkey) == sorted(
indexer.serialize_document(doc, accesses), [
indexer.serialize_document(no_title_doc, accesses), indexer.serialize_document(doc, accesses),
], key=sortkey) indexer.serialize_document(no_title_doc, accesses),
],
key=sortkey,
)

View File

@@ -0,0 +1,137 @@
"""
Tests for Documents API endpoint in impress's core app: search
"""
import pytest
import responses
from faker import Faker
from rest_framework.test import APIClient
from core import factories, models
fake = Faker()
pytestmark = pytest.mark.django_db
@pytest.mark.parametrize("role", models.LinkRoleChoices.values)
@pytest.mark.parametrize("reach", models.LinkReachChoices.values)
@responses.activate
def test_api_documents_search_anonymous(reach, role, settings):
    """
    Anonymous users searching documents should get an empty result page,
    whatever the link reach and link role of the existing documents.
    """
    find_url = "http://find/api/v1.0/search"

    factories.DocumentFactory(link_reach=reach, link_role=role)
    settings.SEARCH_INDEXER_QUERY_URL = find_url
    factories.DocumentFactory(link_reach=reach, link_role=role)

    # Mocked Find response: no document matches
    responses.add(responses.POST, find_url, json=[], status=200)

    anonymous_client = APIClient()
    response = anonymous_client.get(
        "/api/v1.0/documents/search/", data={"q": "alpha"}
    )

    expected_page = {
        "count": 0,
        "next": None,
        "previous": None,
        "results": [],
    }
    assert response.status_code == 200
    assert response.json() == expected_page
def test_api_documents_search_endpoint_is_none(settings):
    """Missing SEARCH_INDEXER_QUERY_URL should make the search view return an error."""
    settings.SEARCH_INDEXER_QUERY_URL = None

    user = factories.UserFactory()

    client = APIClient()
    client.force_login(user)

    # Send the request with the logged-in client: a fresh APIClient() would be
    # anonymous and leave the login above unused.
    response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"})

    assert response.status_code == 401
    assert response.json() == {"detail": "The service is not configured properly."}
@responses.activate
def test_api_documents_search_invalid_params(settings):
    """A search request without the required `q` parameter should be rejected."""
    settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"

    user = factories.UserFactory()

    client = APIClient()
    client.force_login(user)

    # Send the request with the logged-in client: a fresh APIClient() would be
    # anonymous and leave the login above unused.
    response = client.get("/api/v1.0/documents/search/")

    assert response.status_code == 400
    assert response.json() == {"q": ["This field is required."]}
@responses.activate
def test_api_documents_search_format(settings):
    """Check the exact payload returned by the search view for one matching document."""
    find_url = "http://find/api/v1.0/search"
    settings.SEARCH_INDEXER_QUERY_URL = find_url

    user = factories.UserFactory()

    client = APIClient()
    client.force_login(user)

    user_a, user_b, user_c = factories.UserFactory.create_batch(3)
    document = factories.DocumentFactory(
        title="alpha",
        users=(user_a, user_c),
        link_traces=(user, user_b),
    )
    access = factories.UserDocumentAccessFactory(document=document, user=user)

    # Mocked Find response: a single hit pointing to the document
    find_hits = [{"_id": str(document.pk)}]
    responses.add(responses.POST, find_url, json=find_hits, status=200)

    response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"})

    assert response.status_code == 200

    payload = response.json()
    documents = payload.pop("results")

    # What remains after removing the results is the pagination envelope
    assert payload == {
        "count": 1,
        "next": None,
        "previous": None,
    }
    assert len(documents) == 1

    expected_document = {
        "id": str(document.id),
        "abilities": document.get_abilities(user),
        "ancestors_link_reach": None,
        "ancestors_link_role": None,
        "computed_link_reach": document.computed_link_reach,
        "computed_link_role": document.computed_link_role,
        "created_at": document.created_at.isoformat().replace("+00:00", "Z"),
        "creator": str(document.creator.id),
        "depth": 1,
        "excerpt": document.excerpt,
        "link_reach": document.link_reach,
        "link_role": document.link_role,
        "nb_accesses_ancestors": 3,
        "nb_accesses_direct": 3,
        "numchild": 0,
        "path": document.path,
        "title": document.title,
        "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"),
        "user_role": access.role,
    }
    assert documents[0] == expected_document

View File

@@ -1,11 +1,17 @@
"""Tests for Documents search indexers""" """Tests for Documents search indexers"""
from functools import partial
from unittest.mock import patch from unittest.mock import patch
from django.contrib.auth.models import AnonymousUser
import pytest import pytest
from core import factories, utils from core import factories, models, utils
from core.services.search_indexers import FindDocumentIndexer from core.services.search_indexers import (
FindDocumentIndexer,
get_visited_document_ids_of,
)
pytestmark = pytest.mark.django_db pytestmark = pytest.mark.django_db
@@ -187,7 +193,6 @@ def test_services_search_indexers_ancestors_link_reach(mock_push):
FindDocumentIndexer().index() FindDocumentIndexer().index()
seen_doc_ids = set()
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
assert len(results) == 4 assert len(results) == 4
assert results[str(great_grand_parent.id)]["reach"] == "restricted" assert results[str(great_grand_parent.id)]["reach"] == "restricted"
@@ -207,7 +212,6 @@ def test_services_search_indexers_ancestors_users(mock_push):
FindDocumentIndexer().index() FindDocumentIndexer().index()
seen_doc_ids = set()
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
assert len(results) == 3 assert len(results) == 3
assert results[str(grand_parent.id)]["users"] == [str(user_gp.sub)] assert results[str(grand_parent.id)]["users"] == [str(user_gp.sub)]
@@ -228,7 +232,6 @@ def test_services_search_indexers_ancestors_teams(mock_push):
FindDocumentIndexer().index() FindDocumentIndexer().index()
seen_doc_ids = set()
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
assert len(results) == 3 assert len(results) == 3
assert results[str(grand_parent.id)]["groups"] == ["team_gp"] assert results[str(grand_parent.id)]["groups"] == ["team_gp"]
@@ -258,3 +261,85 @@ def test_push_uses_correct_url_and_data(mock_post, settings):
assert args[0] == settings.SEARCH_INDEXER_URL assert args[0] == settings.SEARCH_INDEXER_URL
assert kwargs.get("json") == sample_data assert kwargs.get("json") == sample_data
assert kwargs.get("timeout") == 10 assert kwargs.get("timeout") == 10
def test_get_visited_document_ids_of():
    """
    get_visited_document_ids_of() should return the ids of the documents the
    user has a link trace on, leaving out those on which the user has an access.
    """
    user = factories.UserFactory()
    other = factories.UserFactory()
    anonymous = AnonymousUser()

    # Nothing has been visited yet
    assert not get_visited_document_ids_of(anonymous)
    assert not get_visited_document_ids_of(user)

    doc1, doc2, _ = factories.DocumentFactory.create_batch(3)

    # Only the first two documents are visited by the user
    for visited_doc in (doc1, doc2):
        models.LinkTrace.objects.create(
            document=visited_doc, user=user, is_masked=False
        )

    expected_ids = sorted([str(doc1.pk), str(doc2.pk)])
    assert sorted(get_visited_document_ids_of(user)) == expected_ids

    factories.UserDocumentAccessFactory(user=other, document=doc1)
    factories.UserDocumentAccessFactory(user=user, document=doc2)

    # The user now has an access on the second document: only the first remains
    assert get_visited_document_ids_of(user) == [str(doc1.pk)]
@patch("requests.post")
def test_services_search_indexers_search(mock_post, settings):
    """
    search() should POST to SEARCH_INDEXER_QUERY_URL the query text, the ids of
    the documents from the user's link traces and the bearer token.
    """
    user = factories.UserFactory()
    indexer = FindDocumentIndexer()

    # The mocked HTTP response reports no error
    mock_post.return_value.raise_for_status.return_value = None

    doc1, doc2, _ = factories.DocumentFactory.create_batch(3)

    for visited_doc in (doc1, doc2):
        models.LinkTrace.objects.create(
            document=visited_doc, user=user, is_masked=False
        )

    indexer.search("alpha", user=user, token="mytoken")

    call_args, call_kwargs = mock_post.call_args
    assert call_args[0] == settings.SEARCH_INDEXER_QUERY_URL

    sent_payload = call_kwargs.get("json")
    expected_visited = sorted([str(doc1.pk), str(doc2.pk)])
    assert sent_payload["q"] == "alpha"
    assert sorted(sent_payload["visited"]) == expected_visited
    assert sent_payload["services"] == ["docs"]

    assert call_kwargs.get("headers") == {"Authorization": "Bearer mytoken"}
    assert call_kwargs.get("timeout") == 10
def test_search_query_raises_error_if_search_endpoint_is_none(settings):
    """
    search() should raise a RuntimeError when SEARCH_INDEXER_QUERY_URL is not set.
    """
    settings.SEARCH_INDEXER_QUERY_URL = None

    user = factories.UserFactory()
    indexer = FindDocumentIndexer()

    with pytest.raises(RuntimeError) as exc_info:
        indexer.search("alpha", user=user, token="mytoken")

    expected_message = (
        "SEARCH_INDEXER_QUERY_URL must be set in Django settings before search."
    )
    assert expected_message in str(exc_info.value)

View File

@@ -109,6 +109,9 @@ class Base(Configuration):
SEARCH_INDEXER_SECRET = values.Value( SEARCH_INDEXER_SECRET = values.Value(
default=None, environ_name="SEARCH_INDEXER_SECRET", environ_prefix=None default=None, environ_name="SEARCH_INDEXER_SECRET", environ_prefix=None
) )
SEARCH_INDEXER_QUERY_URL = values.Value(
default=None, environ_name="SEARCH_INDEXER_QUERY_URL", environ_prefix=None
)
# Static files (CSS, JavaScript, Images) # Static files (CSS, JavaScript, Images)
STATIC_URL = "/static/" STATIC_URL = "/static/"