(backend) search user on her email and name

Compute Trigram similarity on user's name, and sum it up
with existing one based on user's email.

This approach is inspired by Contact search feature, which
computes a Trigram similarity score on first name and last
name, to sum up their scores.

With a similarity score influenced by both email and name,
API results would reflect both email and name user's attributes.

As we sum up similarities, I increased the similarity threshold.
Its value is empirical, and was finetuned to avoid breaking
existing tests. Please note, the updated value is closer to the
threshold used to search contacts.

Email or Name can be None. Summing two similarity scores with
one of them None, results in a None total score. To mitigate
this issue, I added a default empty string value, to replace
None values. Thus, the similarity score on this default empty
string value is equal to 0 and not to None anymore.
This commit is contained in:
Lebaud Antoine
2024-03-06 23:32:25 +01:00
committed by aleb_the_flash
parent b2d68df646
commit 7d65de1938
2 changed files with 120 additions and 9 deletions

View File

@@ -2,6 +2,7 @@
from django.contrib.postgres.search import TrigramSimilarity
from django.db.models import Func, Max, OuterRef, Prefetch, Q, Subquery, Value
from django.db.models.functions import Coalesce
from rest_framework import (
decorators,
@@ -18,9 +19,7 @@ from core import models
from . import permissions, serializers
EMAIL_SIMILARITY_THRESHOLD = 0.01
# TrigramSimilarity threshold is lower for searching email than for names,
# to improve matching results
SIMILARITY_THRESHOLD = 0.04
class NestedGenericViewSet(viewsets.GenericViewSet):
@@ -212,13 +211,21 @@ class UserViewSet(
if query := self.request.GET.get("q", ""):
similarity = Max(
TrigramSimilarity(
Func("identities__email", function="unaccent"),
Coalesce(
Func("identities__email", function="unaccent"), Value("")
),
Func(Value(query), function="unaccent"),
)
+ TrigramSimilarity(
Coalesce(
Func("identities__name", function="unaccent"), Value("")
),
Func(Value(query), function="unaccent"),
)
)
queryset = (
queryset.annotate(similarity=similarity)
.filter(similarity__gte=EMAIL_SIMILARITY_THRESHOLD)
.filter(similarity__gte=SIMILARITY_THRESHOLD)
.order_by("-similarity")
)

View File

@@ -59,10 +59,16 @@ def test_api_users_authenticated_list_by_email():
client = APIClient()
client.force_login(user)
dave = factories.IdentityFactory(email="david.bowman@work.com", name="david bowman", is_main=True)
nicole = factories.IdentityFactory(email="nicole_foole@work.com", name="nicole foole", is_main=True)
frank = factories.IdentityFactory(email="frank_poole@work.com", name="frank poole", is_main=True)
factories.IdentityFactory(email="heywood_floyd@work.com", name="heywood floyd", is_main=True)
dave = factories.IdentityFactory(
email="david.bowman@work.com", name=None, is_main=True
)
nicole = factories.IdentityFactory(
email="nicole_foole@work.com", name=None, is_main=True
)
frank = factories.IdentityFactory(
email="frank_poole@work.com", name=None, is_main=True
)
factories.IdentityFactory(email="heywood_floyd@work.com", name=None, is_main=True)
# Full query should work
response = client.get(
@@ -114,6 +120,104 @@ def test_api_users_authenticated_list_by_email():
]
def test_api_users_authenticated_list_by_name():
"""
Authenticated users should be able to search users with a case-insensitive and
partial query on the name.
"""
user = factories.UserFactory(email="tester@ministry.fr")
factories.IdentityFactory(user=user, email=user.email, name="john doe")
client = APIClient()
client.force_login(user)
dave = factories.IdentityFactory(name="dave bowman", email=None, is_main=True)
nicole = factories.IdentityFactory(name="nicole foole", email=None, is_main=True)
frank = factories.IdentityFactory(name="frank poole", email=None, is_main=True)
factories.IdentityFactory(name="heywood floyd", email=None, is_main=True)
# Full query should work
response = client.get(
"/api/v1.0/users/?q=david.bowman@work.com",
)
assert response.status_code == HTTP_200_OK
user_ids = [user["id"] for user in response.json()["results"]]
assert user_ids[0] == str(dave.user.id)
# Partial query should work
response = client.get("/api/v1.0/users/?q=fran")
assert response.status_code == HTTP_200_OK
user_ids = [user["id"] for user in response.json()["results"]]
assert user_ids[0] == str(frank.user.id)
# Result that matches a trigram twice ranks better than result that matches once
response = client.get("/api/v1.0/users/?q=ole")
assert response.status_code == HTTP_200_OK
user_ids = [user["id"] for user in response.json()["results"]]
# "Nicole Foole" matches twice on "ole"
assert user_ids == [str(nicole.user.id), str(frank.user.id)]
# Even with a low similarity threshold, query should match expected user
response = client.get("/api/v1.0/users/?q=ool")
assert response.status_code == HTTP_200_OK
assert response.json()["results"] == [
{
"id": str(nicole.user.id),
"email": nicole.email,
"name": nicole.name,
"is_device": nicole.user.is_device,
"is_staff": nicole.user.is_staff,
"language": nicole.user.language,
"timezone": str(nicole.user.timezone),
},
{
"id": str(frank.user.id),
"email": frank.email,
"name": frank.name,
"is_device": frank.user.is_device,
"is_staff": frank.user.is_staff,
"language": frank.user.language,
"timezone": str(frank.user.timezone),
},
]
def test_api_users_authenticated_list_by_name_and_email():
"""
Authenticated users should be able to search users with a case-insensitive and
partial query on the name and email.
"""
user = factories.UserFactory(email="tester@ministry.fr")
factories.IdentityFactory(user=user, email=user.email, name="john doe")
client = APIClient()
client.force_login(user)
nicole = factories.IdentityFactory(
email="nicole_foole@work.com", name="nicole foole", is_main=True
)
frank = factories.IdentityFactory(
email="oleg_poole@work.com", name=None, is_main=True
)
david = factories.IdentityFactory(email=None, name="david role", is_main=True)
# Result that matches a trigram in name and email ranks better than result that matches once
response = client.get("/api/v1.0/users/?q=ole")
assert response.status_code == HTTP_200_OK
user_ids = [user["id"] for user in response.json()["results"]]
# "Nicole Foole" matches twice on "ole" in her name and twice on "ole" in her email
# "Oleg poole" matches twice on "ole" in her email
# "David role" matches once on "ole" in his name
assert user_ids == [str(nicole.user.id), str(frank.user.id), str(david.user.id)]
def test_api_users_authenticated_list_multiple_identities_single_user():
"""
User with multiple identities should appear only once in results.