(backend) add document search view

New API view that calls the indexed documents search view
(resource server) of app "Find".

Signed-off-by: Fabre Florian <ffabre@hybird.org>
This commit is contained in:
Fabre Florian
2025-08-13 06:50:58 +02:00
committed by Quentin BEY
parent 3228f65092
commit d721b97f68
10 changed files with 312 additions and 34 deletions

View File

@@ -49,6 +49,11 @@ LOGOUT_REDIRECT_URL=http://localhost:3000
OIDC_REDIRECT_ALLOWED_HOSTS=["http://localhost:8083", "http://localhost:3000"]
OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"}
# Store OIDC tokens in the session
# Store the access token in the session
OIDC_STORE_ACCESS_TOKEN=True
# Store the encrypted refresh token in the session
OIDC_STORE_REFRESH_TOKEN=True
OIDC_STORE_REFRESH_TOKEN_KEY=AnExampleKeyForDevPurposeOnly
# AI
AI_FEATURE_ENABLED=true
AI_BASE_URL=https://openaiendpoint.com

View File

@@ -1013,3 +1013,17 @@ class ThreadSerializer(serializers.ModelSerializer):
if request:
return thread.get_abilities(request.user)
return {}
class FindDocumentSerializer(serializers.Serializer):
    """Validate the query parameters accepted by the Find search endpoint."""

    # Full-text query forwarded to the search service; mandatory.
    q = serializers.CharField(required=True)

    def validate_q(self, value):
        """Reject a query made only of whitespace and return it unchanged otherwise."""
        # NOTE(review): DRF's CharField trims whitespace and refuses blank
        # strings by default, so this guard may never trigger — confirm
        # before relying on its error message.
        if not value.strip():
            raise serializers.ValidationError("Text field cannot be empty.")
        return value

View File

@@ -23,6 +23,7 @@ from django.db.models.functions import Greatest, Left, Length
from django.http import Http404, StreamingHttpResponse
from django.urls import reverse
from django.utils import timezone
from django.utils.decorators import method_decorator
from django.utils.functional import cached_property
from django.utils.text import capfirst, slugify
from django.utils.translation import gettext_lazy as _
@@ -33,6 +34,7 @@ from botocore.exceptions import ClientError
from csp.constants import NONE
from csp.decorators import csp_update
from lasuite.malware_detection import malware_detection
from lasuite.oidc_login.decorators import refresh_oidc_access_token
from rest_framework import filters, status, viewsets
from rest_framework import response as drf_response
from rest_framework.permissions import AllowAny
@@ -50,6 +52,7 @@ from core.services.converter_services import (
from core.services.converter_services import (
YdocConverter,
)
from core.services.search_indexers import FindDocumentIndexer
from core.tasks.mail import send_ask_for_access_mail
from core.utils import extract_attachments, filter_descendants
@@ -387,6 +390,7 @@ class DocumentViewSet(
list_serializer_class = serializers.ListDocumentSerializer
trashbin_serializer_class = serializers.ListDocumentSerializer
tree_serializer_class = serializers.ListDocumentSerializer
search_serializer_class = serializers.ListDocumentSerializer
def get_queryset(self):
"""Get queryset performing all annotation and filtering on the document tree structure."""
@@ -1078,10 +1082,37 @@ class DocumentViewSet(
{"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED
)
# TODO
# @drf.decorators.action(detail=False, methods=["get"])
# def search(self, request, *args, **kwargs):
# index.search()
@drf.decorators.action(detail=False, methods=["get"], url_path="search")
@method_decorator(refresh_oidc_access_token)
def search(self, request, *args, **kwargs):
    """
    Full text search of documents through the opensearch indexation app "find".

    The `q` query parameter is validated with `FindDocumentSerializer`, then
    handed to `FindDocumentIndexer.search()` together with the requesting user
    and the OIDC access token read from the session. The resulting queryset is
    rendered by `get_response_for_queryset()`.

    Invalid query parameters raise a validation error (`raise_exception=True`).
    A `RuntimeError` coming from the indexer is turned into an HTTP 401
    response.
    """
    token = request.session.get("oidc_access_token")

    params = serializers.FindDocumentSerializer(data=request.query_params)
    params.is_valid(raise_exception=True)
    query_text = params.validated_data.get("q", "")

    try:
        documents = FindDocumentIndexer().search(
            text=query_text,
            user=request.user,
            token=token,
        )
    except RuntimeError:
        # NOTE(review): a misconfigured service is a server-side problem; a
        # 5xx status looks more appropriate than 401, but the API tests pin
        # 401 today — change both together if this is revisited.
        error_body = {"detail": "The service is not configured properly."}
        return drf.response.Response(
            error_body,
            status=status.HTTP_401_UNAUTHORIZED,
        )

    serializer_context = {"request": request}
    return self.get_response_for_queryset(documents, context=serializer_context)
@drf.decorators.action(detail=True, methods=["get"], url_path="versions")
def versions_list(self, request, *args, **kwargs):

View File

@@ -41,8 +41,8 @@ from .choices import (
RoleChoices,
get_equivalent_link_definition,
)
from .validators import sub_validator
from .tasks.find import trigger_document_indexer
from .validators import sub_validator
logger = getLogger(__name__)

View File

@@ -48,19 +48,19 @@ def get_batch_accesses_by_users_and_teams(paths):
def get_visited_document_ids_of(user):
    """
    Return the ids of the documents that have a link trace for the user and on
    which the user has no direct access.

    It is used to limit the opensearch responses to the public documents
    already "visited" by the user.

    Returns a list of unique document ids as strings; the list is empty for an
    anonymous user.
    """
    if isinstance(user, AnonymousUser):
        return []

    # TODO: exclude links when the user already has a specific access to the doc
    traces = models.LinkTrace.objects.filter(user=user).exclude(
        document__accesses__user=user,
    )

    # The intermediate set removes duplicated ids before building the list.
    # `doc_id` (rather than `id`) avoids shadowing the builtin.
    return list(
        {str(doc_id) for doc_id in traces.values_list("document_id", flat=True)}
    )
class BaseDocumentIndexer(ABC):
@@ -129,13 +129,14 @@ class BaseDocumentIndexer(ABC):
"""
visited_ids = get_visited_document_ids_of(user)
response = self.search_query(data={
"q": text,
"visited": visited_ids,
"services": ["docs"],
}, token=token)
print(response)
response = self.search_query(
data={
"q": text,
"visited": visited_ids,
"services": ["docs"],
},
token=token,
)
return self.format_response(response)
@@ -207,7 +208,7 @@ class FindDocumentIndexer(BaseDocumentIndexer):
if not url:
raise RuntimeError(
"SEARCH_INDEXER_QUERY_URL must be set in Django settings before indexing."
"SEARCH_INDEXER_QUERY_URL must be set in Django settings before search."
)
try:
@@ -228,9 +229,7 @@ class FindDocumentIndexer(BaseDocumentIndexer):
"""
Retrieve documents ids from Find app response and return a queryset.
"""
return models.Document.objects.filter(pk__in=[
d['_id'] for d in data
])
return models.Document.objects.filter(pk__in=[d["_id"] for d in data])
def push(self, data):
"""

View File

@@ -86,7 +86,8 @@ def trigger_document_indexer(document, on_commit=False):
logger.info(
"Add task for document %s indexation in %.2f seconds",
document.pk, countdown
document.pk,
countdown,
)
# Each time this method is called during the countdown, we increment the

View File

@@ -21,7 +21,7 @@ def test_index():
with transaction.atomic():
doc = factories.DocumentFactory()
empty_doc = factories.DocumentFactory(title=None, content='')
empty_doc = factories.DocumentFactory(title=None, content="")
no_title_doc = factories.DocumentFactory(title=None)
factories.UserDocumentAccessFactory(document=doc, user=user)
@@ -43,7 +43,10 @@ def test_index():
push_call_args = [call.args[0] for call in mock_push.call_args_list]
assert len(push_call_args) == 1 # called once but with a batch of docs
assert sorted(push_call_args[0], key=sortkey) == sorted([
indexer.serialize_document(doc, accesses),
indexer.serialize_document(no_title_doc, accesses),
], key=sortkey)
assert sorted(push_call_args[0], key=sortkey) == sorted(
[
indexer.serialize_document(doc, accesses),
indexer.serialize_document(no_title_doc, accesses),
],
key=sortkey,
)

View File

@@ -0,0 +1,137 @@
"""
Tests for Documents API endpoint in impress's core app: search
"""
import pytest
import responses
from faker import Faker
from rest_framework.test import APIClient
from core import factories, models
fake = Faker()
pytestmark = pytest.mark.django_db
@pytest.mark.parametrize("role", models.LinkRoleChoices.values)
@pytest.mark.parametrize("reach", models.LinkReachChoices.values)
@responses.activate
def test_api_documents_search_anonymous(reach, role, settings):
    """
    Anonymous users can call the search endpoint and get an empty page when the
    Find service returns no match, whatever the link reach and link role of the
    existing document.
    """
    settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"

    # One document is enough: the mocked Find service never returns it.
    factories.DocumentFactory(link_reach=reach, link_role=role)

    # Find response
    responses.add(
        responses.POST,
        "http://find/api/v1.0/search",
        json=[],
        status=200,
    )

    response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"})

    assert response.status_code == 200
    assert response.json() == {
        "count": 0,
        "next": None,
        "previous": None,
        "results": [],
    }
def test_api_documents_search_endpoint_is_none(settings):
    """Missing SEARCH_INDEXER_QUERY_URL should be reported as an error response."""
    settings.SEARCH_INDEXER_QUERY_URL = None

    user = factories.UserFactory()
    client = APIClient()
    client.force_login(user)

    # Use the logged-in client: a fresh APIClient() would send the request
    # anonymously and leave the authenticated path untested.
    response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"})

    assert response.status_code == 401
    assert response.json() == {"detail": "The service is not configured properly."}
@responses.activate
def test_api_documents_search_invalid_params(settings):
    """A search request without the `q` parameter should be rejected with a 400."""
    settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"

    user = factories.UserFactory()
    client = APIClient()
    client.force_login(user)

    # Use the logged-in client: a fresh APIClient() would send the request
    # anonymously and leave the authenticated path untested.
    response = client.get("/api/v1.0/documents/search/")

    assert response.status_code == 400
    assert response.json() == {"q": ["This field is required."]}
@responses.activate
def test_api_documents_search_format(settings):
    """
    Check the payload of the search view: pagination envelope plus the full
    list representation of the single document returned by Find.
    """
    settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"

    user = factories.UserFactory()
    client = APIClient()
    client.force_login(user)

    user_a, user_b, user_c = factories.UserFactory.create_batch(3)
    document = factories.DocumentFactory(
        title="alpha",
        users=(user_a, user_c),
        link_traces=(user, user_b),
    )
    access = factories.UserDocumentAccessFactory(document=document, user=user)

    # Mocked Find answer: a single hit pointing at the document above.
    responses.add(
        responses.POST,
        "http://find/api/v1.0/search",
        json=[{"_id": str(document.pk)}],
        status=200,
    )

    response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"})
    assert response.status_code == 200

    payload = response.json()
    results = payload.pop("results")
    assert payload == {"count": 1, "next": None, "previous": None}
    assert len(results) == 1

    expected_document = {
        "id": str(document.id),
        "abilities": document.get_abilities(user),
        "ancestors_link_reach": None,
        "ancestors_link_role": None,
        "computed_link_reach": document.computed_link_reach,
        "computed_link_role": document.computed_link_role,
        "created_at": document.created_at.isoformat().replace("+00:00", "Z"),
        "creator": str(document.creator.id),
        "depth": 1,
        "excerpt": document.excerpt,
        "link_reach": document.link_reach,
        "link_role": document.link_role,
        "nb_accesses_ancestors": 3,
        "nb_accesses_direct": 3,
        "numchild": 0,
        "path": document.path,
        "title": document.title,
        "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"),
        "user_role": access.role,
    }
    assert results[0] == expected_document

View File

@@ -1,11 +1,17 @@
"""Tests for Documents search indexers"""
from functools import partial
from unittest.mock import patch
from django.contrib.auth.models import AnonymousUser
import pytest
from core import factories, utils
from core.services.search_indexers import FindDocumentIndexer
from core import factories, models, utils
from core.services.search_indexers import (
FindDocumentIndexer,
get_visited_document_ids_of,
)
pytestmark = pytest.mark.django_db
@@ -187,7 +193,6 @@ def test_services_search_indexers_ancestors_link_reach(mock_push):
FindDocumentIndexer().index()
seen_doc_ids = set()
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
assert len(results) == 4
assert results[str(great_grand_parent.id)]["reach"] == "restricted"
@@ -207,7 +212,6 @@ def test_services_search_indexers_ancestors_users(mock_push):
FindDocumentIndexer().index()
seen_doc_ids = set()
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
assert len(results) == 3
assert results[str(grand_parent.id)]["users"] == [str(user_gp.sub)]
@@ -228,7 +232,6 @@ def test_services_search_indexers_ancestors_teams(mock_push):
FindDocumentIndexer().index()
seen_doc_ids = set()
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
assert len(results) == 3
assert results[str(grand_parent.id)]["groups"] == ["team_gp"]
@@ -258,3 +261,85 @@ def test_push_uses_correct_url_and_data(mock_post, settings):
assert args[0] == settings.SEARCH_INDEXER_URL
assert kwargs.get("json") == sample_data
assert kwargs.get("timeout") == 10
def test_get_visited_document_ids_of():
    """
    get_visited_document_ids_of() lists the documents the user has a link trace
    on, leaving out those on which the user has a specific access.
    """
    user = factories.UserFactory()
    other = factories.UserFactory()
    anonymous = AnonymousUser()

    # Nothing visited yet, neither anonymously nor as a user.
    assert not get_visited_document_ids_of(anonymous)
    assert not get_visited_document_ids_of(user)

    doc1, doc2, _ = factories.DocumentFactory.create_batch(3)
    for visited_doc in (doc1, doc2):
        models.LinkTrace.objects.create(
            user=user, is_masked=False, document=visited_doc
        )

    # The third document is not visited
    expected_ids = sorted([str(doc1.pk), str(doc2.pk)])
    assert sorted(get_visited_document_ids_of(user)) == expected_ids

    factories.UserDocumentAccessFactory(user=other, document=doc1)
    factories.UserDocumentAccessFactory(user=user, document=doc2)

    # The user now has an access on the second document, so only the first
    # one is still reported.
    assert get_visited_document_ids_of(user) == [str(doc1.pk)]
@patch("requests.post")
def test_services_search_indexers_search(mock_post, settings):
    """
    search() should post to SEARCH_INDEXER_QUERY_URL a payload carrying the
    query text, the ids of the documents visited by the user and the service
    name, authenticated with the given bearer token.
    """
    user = factories.UserFactory()
    indexer = FindDocumentIndexer()

    # The mocked HTTP response never reports an error.
    mock_post.return_value.raise_for_status.return_value = None

    doc1, doc2, _ = factories.DocumentFactory.create_batch(3)
    for visited_doc in (doc1, doc2):
        models.LinkTrace.objects.create(
            user=user, is_masked=False, document=visited_doc
        )

    indexer.search("alpha", user=user, token="mytoken")

    call_args, call_kwargs = mock_post.call_args
    assert call_args[0] == settings.SEARCH_INDEXER_QUERY_URL

    payload = call_kwargs.get("json")
    assert payload["q"] == "alpha"
    assert sorted(payload["visited"]) == sorted([str(doc1.pk), str(doc2.pk)])
    assert payload["services"] == ["docs"]

    assert call_kwargs.get("headers") == {"Authorization": "Bearer mytoken"}
    assert call_kwargs.get("timeout") == 10
def test_search_query_raises_error_if_search_endpoint_is_none(settings):
    """
    search() should raise a RuntimeError with an explicit message when
    SEARCH_INDEXER_QUERY_URL is None.
    """
    settings.SEARCH_INDEXER_QUERY_URL = None

    indexer = FindDocumentIndexer()
    user = factories.UserFactory()
    expected_message = (
        "SEARCH_INDEXER_QUERY_URL must be set in Django settings before search."
    )

    with pytest.raises(RuntimeError) as exc_info:
        indexer.search("alpha", user=user, token="mytoken")

    assert expected_message in str(exc_info.value)

View File

@@ -109,6 +109,9 @@ class Base(Configuration):
SEARCH_INDEXER_SECRET = values.Value(
default=None, environ_name="SEARCH_INDEXER_SECRET", environ_prefix=None
)
SEARCH_INDEXER_QUERY_URL = values.Value(
default=None, environ_name="SEARCH_INDEXER_QUERY_URL", environ_prefix=None
)
# Static files (CSS, JavaScript, Images)
STATIC_URL = "/static/"