🚸(backend) on user search match emails by Levenshtein distance
When the query looks like an email (i.e. it contains "@"), we match by Levenshtein distance, because at that point we are only trying to catch typing errors, not searching anymore. It is important to still propose results within a short Levenshtein distance because it is frequent to forget a double letter in someone's name, for example typing "Pacoud" or even "pacou" instead of "Paccoud", and we want to prevent duplicates or failed invitations. We consider the query string to be an email as soon as it contains a "@" character. Trying harder to identify a string that is really an email would lead to weird behaviors: toto@example.gouv would look like an email, but if we continue typing, toto@example.gouv.f would not look like an email... before toto@example.gouv.fr finally looks like an email again. The result would be jumping from one type of search to the other. As soon as there is a "@" in the query, we can be sure that the user is not looking for a name anymore and we can switch to matching by Levenshtein distance.
This commit is contained in:
committed by
Samuel Paccoud
parent
265a24fe7e
commit
609ff91894
@@ -11,7 +11,7 @@ and this project adheres to
|
||||
|
||||
## Added
|
||||
|
||||
- github actions to managed Crowdin workflow
|
||||
- github actions to manage Crowdin workflow
|
||||
- 📈Integrate Posthog #540
|
||||
- 🏷️(backend) add content-type to uploaded files #552
|
||||
- ✨(frontend) export pdf docx front side #537
|
||||
@@ -21,7 +21,6 @@ and this project adheres to
|
||||
- 💄(frontend) add abilities on doc row #581
|
||||
- 💄(frontend) improve DocsGridItem responsive padding #582
|
||||
|
||||
|
||||
## [2.0.1] - 2025-01-17
|
||||
|
||||
## Fixed
|
||||
|
||||
@@ -20,6 +20,7 @@ from django.db.models import (
|
||||
Subquery,
|
||||
Value,
|
||||
)
|
||||
from django.db.models.expressions import RawSQL
|
||||
from django.http import Http404
|
||||
|
||||
import rest_framework as drf
|
||||
@@ -150,29 +151,35 @@ class UserViewSet(
|
||||
"""
|
||||
queryset = self.queryset
|
||||
|
||||
if self.action == "list":
|
||||
# Exclude all users already in the given document
|
||||
if document_id := self.request.GET.get("document_id", ""):
|
||||
queryset = queryset.exclude(documentaccess__document_id=document_id)
|
||||
if self.action != "list":
|
||||
return queryset
|
||||
|
||||
# Filter users by email similarity
|
||||
if query := self.request.GET.get("q", ""):
|
||||
# For performance reasons we filter first by similarity, which relies on an index,
|
||||
# then only calculate precise similarity scores for sorting purposes
|
||||
queryset = queryset.filter(email__trigram_word_similar=query)
|
||||
# Exclude all users already in the given document
|
||||
if document_id := self.request.GET.get("document_id", ""):
|
||||
queryset = queryset.exclude(documentaccess__document_id=document_id)
|
||||
|
||||
queryset = queryset.annotate(
|
||||
similarity=TrigramSimilarity("email", query)
|
||||
if not (query := self.request.GET.get("q", "")):
|
||||
return queryset
|
||||
|
||||
# For emails, match emails by Levenshtein distance to prevent typing errors
|
||||
if "@" in query:
|
||||
return (
|
||||
queryset.annotate(
|
||||
distance=RawSQL("levenshtein(email::text, %s::text)", (query,))
|
||||
)
|
||||
# When the query only is on the name part, we should try to make many proposals
|
||||
# But when the query looks like an email we should only propose serious matches
|
||||
threshold = 0.6 if "@" in query else 0.1
|
||||
.filter(distance__lte=3)
|
||||
.order_by("distance", "email")
|
||||
)
|
||||
|
||||
queryset = queryset.filter(similarity__gt=threshold).order_by(
|
||||
"-similarity", "email"
|
||||
)
|
||||
|
||||
return queryset
|
||||
# Use trigram similarity for non-email-like queries
|
||||
# For performance reasons we filter first by similarity, which relies on an
|
||||
# index, then only calculate precise similarity scores for sorting purposes
|
||||
return (
|
||||
queryset.filter(email__trigram_word_similar=query)
|
||||
.annotate(similarity=TrigramSimilarity("email", query))
|
||||
.filter(similarity__gt=0.2)
|
||||
.order_by("-similarity", "email")
|
||||
)
|
||||
|
||||
@drf.decorators.action(
|
||||
detail=False,
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
# Generated by Django 5.1.4 on 2025-01-25 08:38
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0012_make_document_creator_and_invitation_issuer_optional'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RunSQL(
|
||||
"CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;",
|
||||
reverse_sql="DROP EXTENSION IF EXISTS fuzzystrmatch;",
|
||||
),
|
||||
]
|
||||
@@ -42,8 +42,9 @@ def test_api_users_list_authenticated():
|
||||
|
||||
def test_api_users_list_query_email():
|
||||
"""
|
||||
Authenticated users should be able to list users
|
||||
and filter by email.
|
||||
Authenticated users should be able to list users and filter by email.
|
||||
Only results with a Levenshtein distance of at most 3 from the query should be returned.
|
||||
We want to match by Levenshtein distance because we want to prevent typing errors.
|
||||
"""
|
||||
user = factories.UserFactory()
|
||||
|
||||
@@ -51,9 +52,7 @@ def test_api_users_list_query_email():
|
||||
client.force_login(user)
|
||||
|
||||
dave = factories.UserFactory(email="david.bowman@work.com")
|
||||
nicole = factories.UserFactory(email="nicole_foole@work.com")
|
||||
frank = factories.UserFactory(email="frank_poole@work.com")
|
||||
factories.UserFactory(email="heywood_floyd@work.com")
|
||||
factories.UserFactory(email="nicole.bowman@work.com")
|
||||
|
||||
response = client.get(
|
||||
"/api/v1.0/users/?q=david.bowman@work.com",
|
||||
@@ -62,59 +61,53 @@ def test_api_users_list_query_email():
|
||||
user_ids = [user["id"] for user in response.json()["results"]]
|
||||
assert user_ids == [str(dave.id)]
|
||||
|
||||
response = client.get("/api/v1.0/users/?q=oole")
|
||||
|
||||
response = client.get(
|
||||
"/api/v1.0/users/?q=davig.bovman@worm.com",
|
||||
)
|
||||
assert response.status_code == 200
|
||||
user_ids = [user["id"] for user in response.json()["results"]]
|
||||
assert user_ids == [str(nicole.id), str(frank.id)]
|
||||
assert user_ids == [str(dave.id)]
|
||||
|
||||
response = client.get(
|
||||
"/api/v1.0/users/?q=davig.bovman@worm.cop",
|
||||
)
|
||||
assert response.status_code == 200
|
||||
user_ids = [user["id"] for user in response.json()["results"]]
|
||||
assert user_ids == []
|
||||
|
||||
|
||||
def test_api_users_list_query_email_matching():
|
||||
"""While filtering by email, results should be filtered and sorted by similarity"""
|
||||
"""While filtering by email, results should be filtered and sorted by Levenshtein distance."""
|
||||
user = factories.UserFactory()
|
||||
|
||||
client = APIClient()
|
||||
client.force_login(user)
|
||||
|
||||
alice = factories.UserFactory(email="alice.johnson@example.gouv.fr")
|
||||
factories.UserFactory(email="jane.smith@example.gouv.fr")
|
||||
michael_wilson = factories.UserFactory(email="michael.wilson@example.gouv.fr")
|
||||
factories.UserFactory(email="david.jones@example.gouv.fr")
|
||||
michael_brown = factories.UserFactory(email="michael.brown@example.gouv.fr")
|
||||
factories.UserFactory(email="sophia.taylor@example.gouv.fr")
|
||||
user1 = factories.UserFactory(email="alice.johnson@example.gouv.fr")
|
||||
user2 = factories.UserFactory(email="alice.johnnson@example.gouv.fr")
|
||||
user3 = factories.UserFactory(email="alice.kohlson@example.gouv.fr")
|
||||
user4 = factories.UserFactory(email="alicia.johnnson@example.gouv.fr")
|
||||
user5 = factories.UserFactory(email="alicia.johnnson@example.gov.uk")
|
||||
factories.UserFactory(email="alice.thomson@example.gouv.fr")
|
||||
|
||||
response = client.get(
|
||||
"/api/v1.0/users/?q=michael.johnson@example.gouv.f",
|
||||
"/api/v1.0/users/?q=alice.johnson@example.gouv.fr",
|
||||
)
|
||||
assert response.status_code == 200
|
||||
user_ids = [user["id"] for user in response.json()["results"]]
|
||||
assert user_ids == [str(michael_wilson.id)]
|
||||
assert user_ids == [str(user1.id), str(user2.id), str(user3.id), str(user4.id)]
|
||||
|
||||
response = client.get("/api/v1.0/users/?q=michael.johnson@example.gouv.fr")
|
||||
response = client.get("/api/v1.0/users/?q=alicia.johnnson@example.gouv.fr")
|
||||
|
||||
assert response.status_code == 200
|
||||
user_ids = [user["id"] for user in response.json()["results"]]
|
||||
assert user_ids == [str(michael_wilson.id), str(alice.id), str(michael_brown.id)]
|
||||
|
||||
response = client.get(
|
||||
"/api/v1.0/users/?q=ajohnson@example.gouv.f",
|
||||
)
|
||||
assert response.status_code == 200
|
||||
user_ids = [user["id"] for user in response.json()["results"]]
|
||||
assert user_ids == [str(alice.id)]
|
||||
|
||||
response = client.get(
|
||||
"/api/v1.0/users/?q=michael.wilson@example.gouv.f",
|
||||
)
|
||||
assert response.status_code == 200
|
||||
user_ids = [user["id"] for user in response.json()["results"]]
|
||||
assert user_ids == [str(michael_wilson.id)]
|
||||
assert user_ids == [str(user4.id), str(user2.id), str(user1.id), str(user5.id)]
|
||||
|
||||
|
||||
def test_api_users_list_query_email_exclude_doc_user():
|
||||
"""
|
||||
Authenticated users should be able to list users
|
||||
and filter by email and exclude users who have access to a document.
|
||||
Authenticated users should be able to list users while filtering by email
|
||||
and excluding users who have access to a document.
|
||||
"""
|
||||
user = factories.UserFactory()
|
||||
document = factories.DocumentFactory()
|
||||
@@ -122,17 +115,19 @@ def test_api_users_list_query_email_exclude_doc_user():
|
||||
client = APIClient()
|
||||
client.force_login(user)
|
||||
|
||||
nicole = factories.UserFactory(email="nicole_foole@work.com")
|
||||
frank = factories.UserFactory(email="frank_poole@work.com")
|
||||
nicole_fool = factories.UserFactory(email="nicole_fool@work.com")
|
||||
nicole_pool = factories.UserFactory(email="nicole_pool@work.com")
|
||||
factories.UserFactory(email="heywood_floyd@work.com")
|
||||
|
||||
factories.UserDocumentAccessFactory(document=document, user=frank)
|
||||
factories.UserDocumentAccessFactory(document=document, user=nicole_pool)
|
||||
|
||||
response = client.get("/api/v1.0/users/?q=oole&document_id=" + str(document.id))
|
||||
response = client.get(
|
||||
"/api/v1.0/users/?q=nicole_fool@work.com&document_id=" + str(document.id)
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
user_ids = [user["id"] for user in response.json()["results"]]
|
||||
assert user_ids == [str(nicole.id)]
|
||||
assert user_ids == [str(nicole_fool.id)]
|
||||
|
||||
|
||||
def test_api_users_retrieve_me_anonymous():
|
||||
|
||||
Reference in New Issue
Block a user