🚸(backend) use unaccented full name for user search

We have the user full name through OIDC in the database, but the search only
used the email field.
This change allows searching for a user by their first and/or
last name (fix #929).
Given that user names are more likely than emails to include diacritics, it
unaccents both the query and the database entry for search (fix #1091).
It also unaccents for email so that internationalized domain names are
managed whether or not the accent is included in the search.
An unaccented gin index is added on the users' full_name and email fields.
Using a manual migration because a wrapper around unaccent is necessary
to make it IMMUTABLE (cf.
https://stackoverflow.com/questions/9063402/ )
This commit is contained in:
Sylvain Boissel
2025-11-19 14:49:24 +01:00
parent 52bd31c0d5
commit 96299f4b7f
4 changed files with 177 additions and 6 deletions

View File

@@ -64,6 +64,7 @@ and this project adheres to
- ♻️(frontend) preserve @ character when esc is pressed after typing it #1512
- ♻️(frontend) make summary button fixed to remain visible during scroll #1581
- ♻️(frontend) pdf embed use full width #1526
- 🚸(backend) use unaccented full name for user search #1637
### Fixed

View File

@@ -1,4 +1,5 @@
"""API endpoints"""
# pylint: disable=too-many-lines
import base64
@@ -18,7 +19,7 @@ from django.core.validators import URLValidator
from django.db import connection, transaction
from django.db import models as db
from django.db.models.expressions import RawSQL
from django.db.models.functions import Left, Length
from django.db.models.functions import Greatest, Left, Length
from django.http import Http404, StreamingHttpResponse
from django.urls import reverse
from django.utils import timezone
@@ -37,6 +38,7 @@ from rest_framework import response as drf_response
from rest_framework.permissions import AllowAny
from core import authentication, choices, enums, models
from core.api.filters import remove_accents
from core.services.ai_services import AIService
from core.services.collaboration_services import CollaborationService
from core.services.converter_services import (
@@ -188,13 +190,15 @@ class UserViewSet(
queryset = queryset.exclude(documentaccess__document_id=document_id)
filter_data = filterset.form.cleaned_data
query = filter_data["q"]
query = remove_accents(filter_data["q"])
# For emails, match emails by Levenshtein distance to prevent typing errors
if "@" in query:
return (
queryset.annotate(
distance=RawSQL("levenshtein(email::text, %s::text)", (query,))
distance=RawSQL(
"levenshtein(unaccent(email::text), %s::text)", (query,)
)
)
.filter(distance__lte=3)
.order_by("distance", "email")[: settings.API_USERS_LIST_LIMIT]
@@ -203,11 +207,15 @@ class UserViewSet(
# Use trigram similarity for non-email-like queries
# For performance reasons we filter first by similarity, which relies on an
# index, then only calculate precise similarity scores for sorting purposes
return (
queryset.filter(email__trigram_word_similar=query)
.annotate(similarity=TrigramSimilarity("email", query))
queryset.annotate(
sim_email=TrigramSimilarity("email", query),
sim_name=TrigramSimilarity("full_name", query),
)
.annotate(similarity=Greatest("sim_email", "sim_name"))
.filter(similarity__gt=0.2)
.order_by("-similarity", "email")[: settings.API_USERS_LIST_LIMIT]
.order_by("-similarity")[: settings.API_USERS_LIST_LIMIT]
)
@drf.decorators.action(

View File

@@ -0,0 +1,37 @@
# Generated by Django 5.2.8 on 2025-11-20 09:56
from django.db import migrations
class Migration(migrations.Migration):
    """Add unaccented trigram GIN indexes on the user email and full name.

    PostgreSQL only accepts IMMUTABLE functions in index expressions, so the
    migration first creates an immutable wrapper around the ``unaccent``
    dictionary function and then builds both indexes on top of it.

    The reverse operation removes everything the forward one created: the
    two indexes first, then the helper functions that the indexes relied on.
    """

    dependencies = [
        ("core", "0026_comments"),
    ]

    operations = [
        # Written as raw SQL because the immutable wrapper functions have to
        # exist before the expression indexes that call them can be created.
        migrations.RunSQL(
            sql="""
            CREATE OR REPLACE FUNCTION public.immutable_unaccent(regdictionary, text)
              RETURNS text
              LANGUAGE c IMMUTABLE PARALLEL SAFE STRICT AS
            '$libdir/unaccent', 'unaccent_dict';

            CREATE OR REPLACE FUNCTION public.f_unaccent(text)
              RETURNS text
              LANGUAGE sql IMMUTABLE PARALLEL SAFE STRICT
            RETURN public.immutable_unaccent(regdictionary 'public.unaccent', $1);

            CREATE INDEX IF NOT EXISTS user_email_unaccent_trgm_idx
              ON impress_user
              USING gin (f_unaccent(email) gin_trgm_ops);

            CREATE INDEX IF NOT EXISTS user_full_name_unaccent_trgm_idx
              ON impress_user
              USING gin (f_unaccent(full_name) gin_trgm_ops);
            """,
            # Drop order matters: the indexes depend on f_unaccent, and
            # f_unaccent depends on immutable_unaccent.
            reverse_sql="""
            DROP INDEX IF EXISTS user_email_unaccent_trgm_idx;
            DROP INDEX IF EXISTS user_full_name_unaccent_trgm_idx;
            DROP FUNCTION IF EXISTS public.f_unaccent(text);
            DROP FUNCTION IF EXISTS public.immutable_unaccent(regdictionary, text);
            """,
        ),
    ]

View File

@@ -76,6 +76,131 @@ def test_api_users_list_query_email():
assert user_ids == []
def test_api_users_list_query_email_with_internationalized_domain_names():
    """
    Filtering users by email must cope with internationalized domain names.

    The expected user is returned whether the accent appears in the stored
    address, in the query, in both or in neither; a domain written in a
    non-latin script is matched as well.
    """
    requester = factories.UserFactory()
    api_client = APIClient()
    api_client.force_login(requester)

    jean = factories.UserFactory(email="jean.martin@éducation.fr")
    marie = factories.UserFactory(email="marie.durand@education.fr")
    kurokawa = factories.UserFactory(email="contact@黒川.日本")

    # (query, the single user expected back), requested in this order
    expectations = (
        ("jean.martin@education.fr", jean),
        ("jean.martin@éducation.fr", jean),
        ("marie.durand@education.fr", marie),
        ("marie.durand@éducation.fr", marie),
        ("contact@黒川.日本", kurokawa),
    )

    for query, expected in expectations:
        response = api_client.get(f"/api/v1.0/users/?q={query}")
        assert response.status_code == 200
        returned_ids = [entry["id"] for entry in response.json()]
        assert returned_ids == [str(expected.id)]
def test_api_users_list_query_full_name():
    """
    Filtering users by full name works on either name part, in any casing.

    A query that is not close enough to the stored name (trigram similarity
    of 0.2 or less) must return nothing.
    """
    requester = factories.UserFactory()
    api_client = APIClient()
    api_client.force_login(requester)

    dave = factories.UserFactory(email="contact@work.com", full_name="David Bowman")
    only_dave = [str(dave.id)]

    # (query, expected list of ids), requested in this order
    expectations = (
        ("David", only_dave),
        ("Bowman", only_dave),
        ("bowman", only_dave),
        ("BOWMAN", only_dave),
        ("BoWmAn", only_dave),
        ("Bovin", []),
    )

    for query, expected_ids in expectations:
        response = api_client.get(f"/api/v1.0/users/?q={query}")
        assert response.status_code == 200
        returned_ids = [entry["id"] for entry in response.json()]
        assert returned_ids == expected_ids
def test_api_users_list_query_accented_full_name():
    """
    Filtering users by full name must ignore diacritics.

    The user is found whether or not the query carries the accents of the
    stored name, and a different name that merely looks similar must not
    match.
    """
    requester = factories.UserFactory()
    api_client = APIClient()
    api_client.force_login(requester)

    fred = factories.UserFactory(
        email="contact@work.com", full_name="Frédérique Lefèvre"
    )

    # Accented and unaccented spellings of each name part all find the user.
    for query in ("Frédérique", "Frederique", "Lefèvre", "Lefevre"):
        response = api_client.get(f"/api/v1.0/users/?q={query}")
        assert response.status_code == 200
        returned_ids = [entry["id"] for entry in response.json()]
        assert returned_ids == [str(fred.id)]

    # A different person's name yields no result at all.
    response = api_client.get("/api/v1.0/users/?q=François Lorfebvre")
    assert response.status_code == 200
    returned_names = [entry["full_name"] for entry in response.json()]
    assert returned_names == []
def test_api_users_list_limit(settings):
"""
Authenticated users should be able to list users and the number of results