From 7d65de1938777252a39a9a172046d3c5640a1a89 Mon Sep 17 00:00:00 2001 From: Lebaud Antoine Date: Wed, 6 Mar 2024 23:32:25 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8(backend)=20search=20user=20on=20her?= =?UTF-8?q?=20email=20and=20name?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compute Trigram similarity on user's name, and sum it up with existing one based on user's email. This approach is inspired by Contact search feature, which computes a Trigram similarity score on first name and last name, to sum up their scores. With a similarity score influenced by both email and name, API results would reflect both email and name user's attributes. As we sum up similarities, I increased the similarity threshold. Its value is empirical, and was finetuned to avoid breaking existing tests. Please note, the updated value is closer to the threshold used to search contacts. Email or Name can be None. Summing two similarity scores with one of them None, results in a None total score. To mitigate this issue, I added a default empty string value, to replace None values. Thus, the similarity score on this default empty string value is equal to 0 and not to None anymore. --- src/backend/core/api/viewsets.py | 17 +++- src/backend/core/tests/test_api_users.py | 112 ++++++++++++++++++++++- 2 files changed, 120 insertions(+), 9 deletions(-) diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index 82f151c..1dcfa84 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -2,6 +2,7 @@ from django.contrib.postgres.search import TrigramSimilarity from django.db.models import Func, Max, OuterRef, Prefetch, Q, Subquery, Value +from django.db.models.functions import Coalesce from rest_framework import ( decorators, @@ -18,9 +19,7 @@ from core import models from . import permissions, serializers -EMAIL_SIMILARITY_THRESHOLD = 0.01 -# TrigramSimilarity threshold is lower for searching email than for names, -# to improve matching results +SIMILARITY_THRESHOLD = 0.04 class NestedGenericViewSet(viewsets.GenericViewSet): @@ -212,13 +211,21 @@ class UserViewSet( if query := self.request.GET.get("q", ""): similarity = Max( TrigramSimilarity( - Func("identities__email", function="unaccent"), + Coalesce( + Func("identities__email", function="unaccent"), Value("") + ), + Func(Value(query), function="unaccent"), + ) + + TrigramSimilarity( + Coalesce( + Func("identities__name", function="unaccent"), Value("") + ), Func(Value(query), function="unaccent"), ) ) queryset = ( queryset.annotate(similarity=similarity) - .filter(similarity__gte=EMAIL_SIMILARITY_THRESHOLD) + .filter(similarity__gte=SIMILARITY_THRESHOLD) .order_by("-similarity") ) diff --git a/src/backend/core/tests/test_api_users.py b/src/backend/core/tests/test_api_users.py index fe48572..f457020 100644 --- a/src/backend/core/tests/test_api_users.py +++ b/src/backend/core/tests/test_api_users.py @@ -59,10 +59,16 @@ def test_api_users_authenticated_list_by_email(): client = APIClient() client.force_login(user) - dave = factories.IdentityFactory(email="david.bowman@work.com", name="david bowman", is_main=True) - nicole = factories.IdentityFactory(email="nicole_foole@work.com", name="nicole foole", is_main=True) - frank = factories.IdentityFactory(email="frank_poole@work.com", name="frank poole", is_main=True) - factories.IdentityFactory(email="heywood_floyd@work.com", name="heywood floyd", is_main=True) + dave = factories.IdentityFactory( + email="david.bowman@work.com", name=None, is_main=True + ) + nicole = factories.IdentityFactory( + email="nicole_foole@work.com", name=None, is_main=True + ) + frank = factories.IdentityFactory( + email="frank_poole@work.com", name=None, is_main=True + ) + factories.IdentityFactory(email="heywood_floyd@work.com", name=None, is_main=True) # Full query should work response = client.get( @@ -114,6 +120,104 @@ def test_api_users_authenticated_list_by_email(): ] +def test_api_users_authenticated_list_by_name(): + """ + Authenticated users should be able to search users with a case-insensitive and + partial query on the name. + """ + user = factories.UserFactory(email="tester@ministry.fr") + factories.IdentityFactory(user=user, email=user.email, name="john doe") + + client = APIClient() + client.force_login(user) + + dave = factories.IdentityFactory(name="dave bowman", email=None, is_main=True) + nicole = factories.IdentityFactory(name="nicole foole", email=None, is_main=True) + frank = factories.IdentityFactory(name="frank poole", email=None, is_main=True) + factories.IdentityFactory(name="heywood floyd", email=None, is_main=True) + + # Full query should work + response = client.get( + "/api/v1.0/users/?q=david.bowman@work.com", + ) + + assert response.status_code == HTTP_200_OK + user_ids = [user["id"] for user in response.json()["results"]] + assert user_ids[0] == str(dave.user.id) + + # Partial query should work + response = client.get("/api/v1.0/users/?q=fran") + + assert response.status_code == HTTP_200_OK + user_ids = [user["id"] for user in response.json()["results"]] + assert user_ids[0] == str(frank.user.id) + + # Result that matches a trigram twice ranks better than result that matches once + response = client.get("/api/v1.0/users/?q=ole") + + assert response.status_code == HTTP_200_OK + user_ids = [user["id"] for user in response.json()["results"]] + # "Nicole Foole" matches twice on "ole" + assert user_ids == [str(nicole.user.id), str(frank.user.id)] + + # Even with a low similarity threshold, query should match expected user + response = client.get("/api/v1.0/users/?q=ool") + + assert response.status_code == HTTP_200_OK + assert response.json()["results"] == [ + { + "id": str(nicole.user.id), + "email": nicole.email, + "name": nicole.name, + "is_device": nicole.user.is_device, + "is_staff": nicole.user.is_staff, + "language": nicole.user.language, + "timezone": str(nicole.user.timezone), + }, + { + "id": str(frank.user.id), + "email": frank.email, + "name": frank.name, + "is_device": frank.user.is_device, + "is_staff": frank.user.is_staff, + "language": frank.user.language, + "timezone": str(frank.user.timezone), + }, + ] + + +def test_api_users_authenticated_list_by_name_and_email(): + """ + Authenticated users should be able to search users with a case-insensitive and + partial query on the name and email. + """ + + user = factories.UserFactory(email="tester@ministry.fr") + factories.IdentityFactory(user=user, email=user.email, name="john doe") + + client = APIClient() + client.force_login(user) + + nicole = factories.IdentityFactory( + email="nicole_foole@work.com", name="nicole foole", is_main=True + ) + frank = factories.IdentityFactory( + email="oleg_poole@work.com", name=None, is_main=True + ) + david = factories.IdentityFactory(email=None, name="david role", is_main=True) + + # Result that matches a trigram in name and email ranks better than result that matches once + response = client.get("/api/v1.0/users/?q=ole") + + assert response.status_code == HTTP_200_OK + user_ids = [user["id"] for user in response.json()["results"]] + + # "Nicole Foole" matches twice on "ole" in her name and twice on "ole" in her email + # "Oleg poole" matches twice on "ole" in her email + # "David role" matches once on "ole" in his name + assert user_ids == [str(nicole.user.id), str(frank.user.id), str(david.user.id)] + + def test_api_users_authenticated_list_multiple_identities_single_user(): """ User with multiple identities should appear only once in results.