🚸(backend) sort user search results by proximity with the active user (#1802)

## Purpose
Allows a user to more easily find the users they search for, with the
following order of priority:
- users they already share documents with (more recent first)
- users that share the same full email domain
- ~~users that share the same partial email domain (last two parts)~~
- ~~other users~~

Edit: We need to filter out other users in order not to reveal the email
addresses of members of other organisations. It's still possible to
invite them by email.

Solves #1521

## Proposal
- [x] Add a new function in `core/utils.py`:
`users_sharing_documents_with()`
- [x] Use it as a key to sort the results of a basic user search
- [x] Filter user results to avoid revealing users (and email addresses)
of other orgs, or users that have not been interacted with.
- [x] User search by "full" email address (containing the '@') is
left unaffected.

---------

Co-authored-by: Anthony LC <anthony.le-courric@mail.numerique.gouv.fr>
This commit is contained in:
Sylvain Boissel
2026-02-11 18:51:45 +01:00
committed by GitHub
parent 9af540de35
commit 685464f2d7
14 changed files with 416 additions and 35 deletions

View File

@@ -13,6 +13,7 @@ and this project adheres to
### Changed
- ♿️(frontend) Focus main container after navigation #1854
- 🚸(backend) sort user search results by proximity with the active user #1802
### Fixed

View File

@@ -17,6 +17,7 @@ These are the environment variables you can set for the `impress-backend` contai
| API_USERS_LIST_LIMIT | Limit on API users | 5 |
| API_USERS_LIST_THROTTLE_RATE_BURST | Throttle rate for api on burst | 30/minute |
| API_USERS_LIST_THROTTLE_RATE_SUSTAINED | Throttle rate for api | 180/hour |
| API_USERS_SEARCH_QUERY_MIN_LENGTH | Minimum characters to insert to search a user | 3 |
| AWS_S3_ACCESS_KEY_ID | Access id for s3 endpoint | |
| AWS_S3_ENDPOINT_URL | S3 endpoint | |
| AWS_S3_REGION_NAME | Region name for s3 endpoint | |

View File

@@ -2,6 +2,7 @@
import unicodedata
from django.conf import settings
from django.utils.translation import gettext_lazy as _
import django_filters
@@ -135,4 +136,6 @@ class UserSearchFilter(django_filters.FilterSet):
Custom filter for searching users.
"""
q = django_filters.CharFilter(min_length=5, max_length=254)
q = django_filters.CharFilter(
min_length=settings.API_USERS_SEARCH_QUERY_MIN_LENGTH, max_length=254
)

View File

@@ -37,6 +37,7 @@ from csp.constants import NONE
from csp.decorators import csp_update
from lasuite.malware_detection import malware_detection
from lasuite.oidc_login.decorators import refresh_oidc_access_token
from lasuite.tools.email import get_domain_from_email
from rest_framework import filters, status, viewsets
from rest_framework import response as drf_response
from rest_framework.permissions import AllowAny
@@ -61,7 +62,11 @@ from core.services.search_indexers import (
get_visited_document_ids_of,
)
from core.tasks.mail import send_ask_for_access_mail
from core.utils import extract_attachments, filter_descendants
from core.utils import (
extract_attachments,
filter_descendants,
users_sharing_documents_with,
)
from . import permissions, serializers, utils
from .filters import DocumentFilter, ListDocumentFilter, UserSearchFilter
@@ -220,18 +225,80 @@ class UserViewSet(
# Use trigram similarity for non-email-like queries
# For performance reasons we filter first by similarity, which relies on an
# index, then only calculate precise similarity scores for sorting purposes
# index, then only calculate precise similarity scores for sorting purposes.
#
# Additionally results are reordered to prefer users "closer" to the current
# user: users they recently shared documents with, then same email domain.
# To achieve that without complex SQL, we build a proximity score in Python
# and return the top N results.
# For security reasons, users that match neither of these proximity criteria
# are not returned at all, to prevent email enumeration.
current_user = self.request.user
shared_map = users_sharing_documents_with(current_user)
return (
user_email_domain = get_domain_from_email(current_user.email) or ""
candidates = list(
queryset.annotate(
sim_email=TrigramSimilarity("email", query),
sim_name=TrigramSimilarity("full_name", query),
)
.annotate(similarity=Greatest("sim_email", "sim_name"))
.filter(similarity__gt=0.2)
.order_by("-similarity")[: settings.API_USERS_LIST_LIMIT]
.order_by("-similarity")
)
# Keep only users that either share documents with the current user
# or have an email with the same domain as the current user.
filtered_candidates = []
for u in candidates:
candidate_domain = get_domain_from_email(u.email) or ""
if shared_map.get(u.id) or (
user_email_domain and candidate_domain == user_email_domain
):
filtered_candidates.append(u)
candidates = filtered_candidates
# Build ordering key for each candidate
def _sort_key(u):
# shared priority: most recent first
# Use shared_last_at timestamp numeric for secondary ordering when shared.
shared_last_at = shared_map.get(u.id)
if shared_last_at:
is_shared = 1
shared_score = int(shared_last_at.timestamp())
else:
is_shared = 0
shared_score = 0
# domain proximity
candidate_email_domain = get_domain_from_email(u.email) or ""
same_full_domain = (
1
if candidate_email_domain
and candidate_email_domain == user_email_domain
else 0
)
# similarity fallback
sim = getattr(u, "similarity", 0) or 0
return (
is_shared,
shared_score,
same_full_domain,
sim,
)
# Sort candidates by the key descending and return top N as a queryset-like
# list. Keep return type consistent with previous behavior (QuerySet slice
# was returned) by returning a list of model instances.
candidates.sort(key=_sort_key, reverse=True)
return candidates[: settings.API_USERS_LIST_LIMIT]
@drf.decorators.action(
detail=False,
methods=["get"],
@@ -2338,6 +2405,7 @@ class ConfigView(drf.views.APIView):
"""
array_settings = [
"AI_FEATURE_ENABLED",
"API_USERS_SEARCH_QUERY_MIN_LENGTH",
"COLLABORATION_WS_URL",
"COLLABORATION_WS_NOT_CONNECTED_READY_ONLY",
"CONVERSION_FILE_EXTENSIONS_ALLOWED",

View File

@@ -4,12 +4,14 @@ Declare and configure the signals for the impress core application
from functools import partial
from django.core.cache import cache
from django.db import transaction
from django.db.models import signals
from django.dispatch import receiver
from . import models
from .tasks.search import trigger_batch_document_indexer
from core import models
from core.tasks.search import trigger_batch_document_indexer
from core.utils import get_users_sharing_documents_with_cache_key
@receiver(signals.post_save, sender=models.Document)
@@ -26,8 +28,24 @@ def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-ar
def document_access_post_save(sender, instance, created, **kwargs):  # pylint: disable=unused-argument
    """
    Handle the creation or update of a document access.

    - When an existing access is updated (``created`` is False), schedule an
      asynchronous call to the document indexer at the end of the transaction.
    - Clear the cached "users sharing documents" map of the user attached to
      this access and of every other user holding an access on the same
      document.
    """
    if not created:
        transaction.on_commit(
            partial(trigger_batch_document_indexer, instance.document)
        )

    # Accesses without a user leave the per-user caches untouched.
    if not instance.user:
        return

    # The cached map is symmetrical: when this user gains an access on a
    # document, they also start appearing in the map of every other user of
    # that document. Clearing only this user's entry would leave the others
    # with a stale map until their cache entry expires.
    cache_keys = {get_users_sharing_documents_with_cache_key(instance.user)}
    co_accesses = models.DocumentAccess.objects.filter(
        document_id=instance.document_id, user__isnull=False
    ).select_related("user")
    cache_keys.update(
        get_users_sharing_documents_with_cache_key(access.user)
        for access in co_accesses
    )
    cache.delete_many(list(cache_keys))
@receiver(signals.post_delete, sender=models.DocumentAccess)
def document_access_post_delete(sender, instance, **kwargs):  # pylint: disable=unused-argument
    """
    Handle the deletion of a document access.

    Clear the cached "users sharing documents" map of the user attached to the
    deleted access and of every other user still holding an access on the same
    document.
    """
    # Accesses without a user leave the per-user caches untouched.
    if not instance.user:
        return

    # The cached map is symmetrical: once this access is gone, the user may no
    # longer belong in the map of the other users of the document. Clearing
    # only this user's entry would leave the others with a stale map until
    # their cache entry expires.
    cache_keys = {get_users_sharing_documents_with_cache_key(instance.user)}
    remaining_accesses = models.DocumentAccess.objects.filter(
        document_id=instance.document_id, user__isnull=False
    ).select_related("user")
    cache_keys.update(
        get_users_sharing_documents_with_cache_key(access.user)
        for access in remaining_accesses
    )
    cache.delete_many(list(cache_keys))

View File

@@ -20,6 +20,7 @@ pytestmark = pytest.mark.django_db
@override_settings(
AI_FEATURE_ENABLED=False,
API_USERS_SEARCH_QUERY_MIN_LENGTH=6,
COLLABORATION_WS_URL="http://testcollab/",
COLLABORATION_WS_NOT_CONNECTED_READY_ONLY=True,
CRISP_WEBSITE_ID="123",
@@ -44,6 +45,7 @@ def test_api_config(is_authenticated):
assert response.status_code == HTTP_200_OK
assert response.json() == {
"AI_FEATURE_ENABLED": False,
"API_USERS_SEARCH_QUERY_MIN_LENGTH": 6,
"COLLABORATION_WS_URL": "http://testcollab/",
"COLLABORATION_WS_NOT_CONNECTED_READY_ONLY": True,
"CONVERSION_FILE_EXTENSIONS_ALLOWED": [".docx", ".md"],

View File

@@ -2,6 +2,8 @@
Test users API endpoints in the impress core app.
"""
from django.utils import timezone
import pytest
from rest_framework.test import APIClient
@@ -121,12 +123,12 @@ def test_api_users_list_query_full_name():
Authenticated users should be able to list users and filter by full name.
Only results with a Trigram similarity greater than 0.2 with the query should be returned.
"""
user = factories.UserFactory()
user = factories.UserFactory(email="user@example.com")
client = APIClient()
client.force_login(user)
dave = factories.UserFactory(email="contact@work.com", full_name="David Bowman")
dave = factories.UserFactory(email="contact@example.com", full_name="David Bowman")
response = client.get(
"/api/v1.0/users/?q=David",
@@ -166,13 +168,13 @@ def test_api_users_list_query_accented_full_name():
Authenticated users should be able to list users and filter by full name with accents.
Only results with a Trigram similarity greater than 0.2 with the query should be returned.
"""
user = factories.UserFactory()
user = factories.UserFactory(email="user@example.com")
client = APIClient()
client.force_login(user)
fred = factories.UserFactory(
email="contact@work.com", full_name="Frédérique Lefèvre"
email="contact@example.com", full_name="Frédérique Lefèvre"
)
response = client.get("/api/v1.0/users/?q=Frédérique")
@@ -201,12 +203,82 @@ def test_api_users_list_query_accented_full_name():
assert users == []
def test_api_users_list_sorted_by_closest_match():
    """
    Authenticated users should be able to list users and the results should be
    sorted by closest match to the query.

    Sorting criteria are:
    - Shared documents with the user (most recent first)
    - Same full email domain (example.gouv.fr)

    Addresses that match neither criterion should be excluded from the results.

    Case in point: the logged-in user has recently shared documents
    with pierre.dupont@beta.gouv.fr and less recently with pierre.durand@impots.gouv.fr.

    Other users named Pierre also exist:
    - pierre.thomas@example.com
    - pierre.petit@anct.gouv.fr
    - pierre.robert@culture.gouv.fr

    The search results should be ordered as follows:

    # Shared with first
    - pierre.dupont@beta.gouv.fr  # Most recent first
    - pierre.durand@impots.gouv.fr
    # Same full domain second
    - pierre.petit@anct.gouv.fr
    """
    user = factories.UserFactory(
        email="martin.bernard@anct.gouv.fr", full_name="Martin Bernard"
    )

    client = APIClient()
    client.force_login(user)

    pierre_1 = factories.UserFactory(email="pierre.dupont@beta.gouv.fr")
    pierre_2 = factories.UserFactory(email="pierre.durand@impots.gouv.fr")
    _pierre_3 = factories.UserFactory(email="pierre.thomas@example.com")
    pierre_4 = factories.UserFactory(email="pierre.petit@anct.gouv.fr")
    _pierre_5 = factories.UserFactory(email="pierre.robert@culture.gouv.fr")

    document_1 = factories.DocumentFactory(creator=user)
    document_2 = factories.DocumentFactory(creator=user)
    factories.UserDocumentAccessFactory(user=user, document=document_1)
    factories.UserDocumentAccessFactory(user=user, document=document_2)

    now = timezone.now()
    last_week = now - timezone.timedelta(days=7)
    last_month = now - timezone.timedelta(days=30)

    # The factory cannot set the created_at directly, so we force it after creation
    p1_d1 = factories.UserDocumentAccessFactory(user=pierre_1, document=document_1)
    p1_d1.created_at = last_week
    p1_d1.save()

    p2_d2 = factories.UserDocumentAccessFactory(user=pierre_2, document=document_2)
    p2_d2.created_at = last_month
    p2_d2.save()

    response = client.get("/api/v1.0/users/?q=Pierre")
    assert response.status_code == 200

    # Compare on emails: they identify each expected user unambiguously and
    # make a failing assertion readable.
    emails = [result["email"] for result in response.json()]
    assert emails == [
        str(pierre_1.email),
        str(pierre_2.email),
        str(pierre_4.email),
    ]
def test_api_users_list_limit(settings):
"""
Authenticated users should be able to list users and the number of results
should be limited to 10.
should be limited to API_USERS_LIST_LIMIT (by default 5).
"""
user = factories.UserFactory()
user = factories.UserFactory(email="user@example.com")
client = APIClient()
client.force_login(user)
@@ -309,28 +381,16 @@ def test_api_users_list_query_email_exclude_doc_user():
def test_api_users_list_query_short_queries():
"""
Queries shorter than 5 characters should return an empty result set.
If API_USERS_SEARCH_QUERY_MIN_LENGTH is not set, the default minimum length should be 3.
"""
user = factories.UserFactory(email="paul@example.com", full_name="Paul")
client = APIClient()
client.force_login(user)
factories.UserFactory(email="john.doe@example.com")
factories.UserFactory(email="john.lennon@example.com")
factories.UserFactory(email="john.doe@example.com", full_name="John Doe")
factories.UserFactory(email="john.lennon@example.com", full_name="John Lennon")
response = client.get("/api/v1.0/users/?q=jo")
assert response.status_code == 400
assert response.json() == {
"q": ["Ensure this value has at least 5 characters (it has 2)."]
}
response = client.get("/api/v1.0/users/?q=john")
assert response.status_code == 400
assert response.json() == {
"q": ["Ensure this value has at least 5 characters (it has 4)."]
}
response = client.get("/api/v1.0/users/?q=john.")
response = client.get("/api/v1.0/users/?q=joh")
assert response.status_code == 200
assert len(response.json()) == 2
@@ -356,7 +416,7 @@ def test_api_users_list_query_long_queries():
def test_api_users_list_query_inactive():
"""Inactive users should not be listed."""
user = factories.UserFactory()
user = factories.UserFactory(email="user@example.com")
client = APIClient()
client.force_login(user)

View File

@@ -3,9 +3,14 @@
import base64
import uuid
import pycrdt
from django.core.cache import cache
from core import utils
import pycrdt
import pytest
from core import factories, utils
pytestmark = pytest.mark.django_db
# This base64 string is an example of what is saved in the database.
# This base64 is generated from the blocknote editor, it contains
@@ -100,3 +105,103 @@ def test_utils_get_ancestor_to_descendants_map_multiple_paths():
"000100020005": {"000100020005"},
"00010003": {"00010003"},
}
def test_utils_users_sharing_documents_with_cache_miss():
    """Test cache miss: should query database and cache result."""
    user1 = factories.UserFactory()
    user2 = factories.UserFactory()
    user3 = factories.UserFactory()
    doc1 = factories.DocumentFactory()
    doc2 = factories.DocumentFactory()

    factories.UserDocumentAccessFactory(user=user1, document=doc1)
    factories.UserDocumentAccessFactory(user=user2, document=doc1)
    factories.UserDocumentAccessFactory(user=user3, document=doc2)

    # Start from an empty cache entry so the function has to compute the map.
    cache_key = utils.get_users_sharing_documents_with_cache_key(user1)
    cache.delete(cache_key)

    result = utils.users_sharing_documents_with(user1)

    # user2 shares doc1 with user1...
    assert user2.id in result
    # ...whereas user3 only has access to doc2, and a user is never listed
    # in their own map.
    assert user3.id not in result
    assert user1.id not in result

    # The computed map must have been stored in the cache.
    cached_data = cache.get(cache_key)
    assert cached_data == result
def test_utils_users_sharing_documents_with_cache_hit():
    """A populated cache entry should be returned as is."""
    owner = factories.UserFactory()
    collaborator = factories.UserFactory()
    document = factories.DocumentFactory()

    for member in (owner, collaborator):
        factories.UserDocumentAccessFactory(user=member, document=document)

    key = utils.get_users_sharing_documents_with_cache_key(owner)
    # The stored value is a plain string: getting it back unchanged shows the
    # result was read from the cache.
    expected = {collaborator.id: "2025-02-10"}
    cache.set(key, expected, 86400)

    assert utils.users_sharing_documents_with(owner) == expected
def test_utils_users_sharing_documents_with_cache_invalidation_on_create():
    """Test that cache is invalidated when a DocumentAccess is created."""
    # Create test data
    user1 = factories.UserFactory()
    user2 = factories.UserFactory()
    doc1 = factories.DocumentFactory()

    # Pre-populate cache
    cache_key = utils.get_users_sharing_documents_with_cache_key(user1)
    cache.set(cache_key, {}, 86400)

    # Verify cache exists
    assert cache.get(cache_key) is not None

    # Create a DocumentAccess for user2 on a document user1 has no access to
    factories.UserDocumentAccessFactory(user=user2, document=doc1)

    # user1 is not concerned by this new access: their cache entry must
    # still be there, untouched.
    assert cache.get(cache_key) == {}

    # Now give user1 an access on the document: their cache must be cleared
    cache.set(cache_key, {"test": "data"}, 86400)
    factories.UserDocumentAccessFactory(user=user1, document=doc1)

    # Cache for user1 should be invalidated (cleared)
    assert cache.get(cache_key) is None
def test_utils_users_sharing_documents_with_cache_invalidation_on_delete():
    """Deleting a DocumentAccess should clear the cache entry of its user."""
    owner = factories.UserFactory()
    other_user = factories.UserFactory()
    document = factories.DocumentFactory()
    access = factories.UserDocumentAccessFactory(user=owner, document=document)

    # Populate the cache after the access creation, then check it is in place.
    key = utils.get_users_sharing_documents_with_cache_key(owner)
    cache.set(key, {other_user.id: "2025-02-10"}, 86400)
    assert cache.get(key) is not None

    access.delete()

    assert cache.get(key) is None
def test_utils_users_sharing_documents_with_empty_result():
    """A user sharing no document should get an empty map, which is cached too."""
    lonely_user = factories.UserFactory()
    key = utils.get_users_sharing_documents_with_cache_key(lonely_user)
    cache.delete(key)

    shared_map = utils.users_sharing_documents_with(lonely_user)

    assert shared_map == {}
    # The empty map itself is stored, not left out of the cache.
    assert cache.get(key) == {}

View File

@@ -0,0 +1,62 @@
"""Tests for utils.users_sharing_documents_with function."""
from django.utils import timezone
import pytest
from core import factories, utils
pytestmark = pytest.mark.django_db
def test_utils_users_sharing_documents_with():
    """
    The map should contain each user sharing a document with the given user,
    associated with the date of their most recent access.
    """
    user = factories.UserFactory(
        email="martin.bernard@anct.gouv.fr", full_name="Martin Bernard"
    )

    pierre_1 = factories.UserFactory(
        email="pierre.dupont@beta.gouv.fr", full_name="Pierre Dupont"
    )
    pierre_2 = factories.UserFactory(
        email="pierre.durand@impots.gouv.fr", full_name="Pierre Durand"
    )

    now = timezone.now()
    yesterday = now - timezone.timedelta(days=1)
    last_week = now - timezone.timedelta(days=7)
    last_month = now - timezone.timedelta(days=30)

    document_1 = factories.DocumentFactory(creator=user)
    document_2 = factories.DocumentFactory(creator=user)
    document_3 = factories.DocumentFactory(creator=user)

    factories.UserDocumentAccessFactory(user=user, document=document_1)
    factories.UserDocumentAccessFactory(user=user, document=document_2)
    factories.UserDocumentAccessFactory(user=user, document=document_3)

    # The factory cannot set the created_at directly, so we force it after creation
    doc_1_pierre_1 = factories.UserDocumentAccessFactory(
        user=pierre_1, document=document_1
    )
    doc_1_pierre_1.created_at = last_week
    doc_1_pierre_1.save()

    doc_2_pierre_2 = factories.UserDocumentAccessFactory(
        user=pierre_2, document=document_2
    )
    doc_2_pierre_2.created_at = last_month
    doc_2_pierre_2.save()

    doc_3_pierre_2 = factories.UserDocumentAccessFactory(
        user=pierre_2, document=document_3
    )
    doc_3_pierre_2.created_at = yesterday
    doc_3_pierre_2.save()

    shared_map = utils.users_sharing_documents_with(user)

    # pierre_2 has two accesses: only the most recent one must be kept.
    assert shared_map == {
        pierre_1.id: last_week,
        pierre_2.id: yesterday,
    }

View File

@@ -1,13 +1,21 @@
"""Utils for the core app."""
import base64
import logging
import re
import time
from collections import defaultdict
from django.core.cache import cache
from django.db import models as db
from django.db.models import Subquery
import pycrdt
from bs4 import BeautifulSoup
from core import enums
from core import enums, models
logger = logging.getLogger(__name__)
def get_ancestor_to_descendants_map(paths, steplen):
@@ -96,3 +104,46 @@ def extract_attachments(content):
xml_content = base64_yjs_to_xml(content)
return re.findall(enums.MEDIA_STORAGE_URL_EXTRACT, xml_content)
def get_users_sharing_documents_with_cache_key(user):
    """Build the per-user cache key, derived from the user's id."""
    user_id = user.id
    return f"users_sharing_documents_with_{user_id}"
def users_sharing_documents_with(user):
    """
    Return a map of the users sharing documents with the given user.

    The result is a dict whose keys are the ids of the other users holding an
    access on at least one document the given user also has an access on, and
    whose values are the most recent `created_at` among those accesses.
    No ordering is applied here: callers that need "most recent first" must
    sort on the values themselves.

    The map is cached per user for one day; a cache hit is returned as is.
    """
    # perf_counter is monotonic: unlike time.time(), the measured duration
    # cannot be skewed by a system clock adjustment.
    start_time = time.perf_counter()
    cache_key = get_users_sharing_documents_with_cache_key(user)
    cached_result = cache.get(cache_key)

    # An empty dict is a valid cached value, hence the explicit None check.
    if cached_result is not None:
        elapsed = time.perf_counter() - start_time
        logger.info(
            "users_sharing_documents_with cache hit for user %s (took %.3fs)",
            user.id,
            elapsed,
        )
        return cached_result

    # Documents the given user has an access on...
    user_docs_qs = models.DocumentAccess.objects.filter(user=user).values_list(
        "document_id", flat=True
    )
    # ...and, for each other user with an access on one of them, the date of
    # their most recent access.
    shared_qs = (
        models.DocumentAccess.objects.filter(document_id__in=Subquery(user_docs_qs))
        .exclude(user=user)
        .values("user")
        .annotate(last_shared=db.Max("created_at"))
    )
    result = {item["user"]: item["last_shared"] for item in shared_qs}
    cache.set(cache_key, result, 86400)  # Cache for 1 day

    elapsed = time.perf_counter() - start_time
    logger.info(
        "users_sharing_documents_with cache miss for user %s (took %.3fs)",
        user.id,
        elapsed,
    )
    return result

View File

@@ -842,6 +842,11 @@ class Base(Configuration):
environ_name="API_USERS_LIST_LIMIT",
environ_prefix=None,
)
API_USERS_SEARCH_QUERY_MIN_LENGTH = values.PositiveIntegerValue(
default=3,
environ_name="API_USERS_SEARCH_QUERY_MIN_LENGTH",
environ_prefix=None,
)
# Content Security Policy
# See https://content-security-policy.com/ for more information.

View File

@@ -8,6 +8,7 @@ export const BROWSERS: BrowserName[] = ['chromium', 'webkit', 'firefox'];
export const CONFIG = {
AI_FEATURE_ENABLED: true,
API_USERS_SEARCH_QUERY_MIN_LENGTH: 3,
CRISP_WEBSITE_ID: null,
COLLABORATION_WS_URL: 'ws://localhost:4444/collaboration/ws/',
COLLABORATION_WS_NOT_CONNECTED_READY_ONLY: true,

View File

@@ -16,6 +16,7 @@ interface ThemeCustomization {
export interface ConfigResponse {
AI_FEATURE_ENABLED?: boolean;
API_USERS_SEARCH_QUERY_MIN_LENGTH?: number;
COLLABORATION_WS_URL?: string;
COLLABORATION_WS_NOT_CONNECTED_READY_ONLY?: boolean;
CONVERSION_FILE_EXTENSIONS_ALLOWED: string[];

View File

@@ -11,6 +11,7 @@ import {
QuickSearchData,
QuickSearchGroup,
} from '@/components/quick-search/';
import { useConfig } from '@/core';
import { Doc } from '@/docs/doc-management';
import { User } from '@/features/auth';
import { useResponsiveStore } from '@/stores';
@@ -57,6 +58,9 @@ export const DocShareModal = ({ doc, onClose, isRootDoc = true }: Props) => {
const { t } = useTranslation();
const selectedUsersRef = useRef<HTMLDivElement>(null);
const queryClient = useQueryClient();
const { data: config } = useConfig();
const API_USERS_SEARCH_QUERY_MIN_LENGTH =
config?.API_USERS_SEARCH_QUERY_MIN_LENGTH || 5;
const { isDesktop } = useResponsiveStore();
@@ -83,7 +87,6 @@ export const DocShareModal = ({ doc, onClose, isRootDoc = true }: Props) => {
const canViewAccesses = doc.abilities.accesses_view;
const showMemberSection = inputValue === '' && selectedUsers.length === 0;
const showFooter = selectedUsers.length === 0 && !inputValue;
const MIN_CHARACTERS_FOR_SEARCH = 4;
const onSelect = (user: User) => {
setSelectedUsers((prev) => [...prev, user]);
@@ -111,7 +114,7 @@ export const DocShareModal = ({ doc, onClose, isRootDoc = true }: Props) => {
const searchUsersQuery = useUsers(
{ query: userQuery, docId: doc.id },
{
enabled: userQuery?.length > MIN_CHARACTERS_FOR_SEARCH,
enabled: userQuery?.length >= API_USERS_SEARCH_QUERY_MIN_LENGTH,
queryKey: [KEY_LIST_USER, { query: userQuery }],
},
);