(backend) Index partially empty documents

Only documents with neither a title nor content are ignored by the indexer.
This commit is contained in:
Fabre Florian
2025-09-22 16:05:39 +02:00
committed by Quentin BEY
parent 01c31ddd74
commit 331a94ad2f
3 changed files with 45 additions and 8 deletions

View File

@@ -21,8 +21,11 @@ class Command(BaseCommand):
"""Launch and log search index generation."""
logger.info("Starting to regenerate Find index...")
start = time.perf_counter()
FindDocumentIndexer().index()
count = FindDocumentIndexer().index()
duration = time.perf_counter() - start
logger.info("Search index regenerated in %.2f seconds.", duration)
logger.info(
"Search index regenerated from %d document(s) in %.2f seconds.",
count,
duration,
)

View File

@@ -146,6 +146,8 @@ class BaseDocumentIndexer(ABC):
Fetch documents in batches, serialize them, and push to the search backend.
"""
last_id = 0
count = 0
while True:
documents_batch = list(
models.Document.objects.filter(
@@ -163,9 +165,13 @@ class BaseDocumentIndexer(ABC):
serialized_batch = [
self.serialize_document(document, accesses_by_document_path)
for document in documents_batch
if document.content
if document.content or document.title
]
self.push(serialized_batch)
count += len(serialized_batch)
return count
@abstractmethod
def serialize_document(self, document, accesses):

View File

@@ -304,7 +304,7 @@ def test_services_search_indexers_batches_pass_only_batch_accesses(
access = factories.UserDocumentAccessFactory(document=document)
expected_user_subs[str(document.id)] = str(access.user.sub)
FindDocumentIndexer().index()
assert FindDocumentIndexer().index() == 5
# Should be 3 batches: 2 + 2 + 1
assert mock_push.call_count == 3
@@ -327,6 +327,34 @@ def test_services_search_indexers_batches_pass_only_batch_accesses(
assert seen_doc_ids == {str(d.id) for d in documents}
@patch.object(FindDocumentIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_ignore_empty_documents(mock_push):
"""
Documents with neither a title nor content should be skipped by the indexer,
while documents missing only one of the two should still be indexed.
"""
document = factories.DocumentFactory()
factories.DocumentFactory(content="", title="")
empty_title = factories.DocumentFactory(title="")
empty_content = factories.DocumentFactory(content="")
assert FindDocumentIndexer().index() == 3
assert mock_push.call_count == 1
# Make sure only documents with a title or content are indexed
results = {doc["id"] for doc in mock_push.call_args[0][0]}
assert results == {
str(d.id)
for d in (
document,
empty_content,
empty_title,
)
}
@patch.object(FindDocumentIndexer, "push")
@pytest.mark.usefixtures("indexer_settings")
def test_services_search_indexers_ancestors_link_reach(mock_push):
@@ -338,7 +366,7 @@ def test_services_search_indexers_ancestors_link_reach(mock_push):
parent = factories.DocumentFactory(parent=grand_parent, link_reach="public")
document = factories.DocumentFactory(parent=parent, link_reach="restricted")
FindDocumentIndexer().index()
assert FindDocumentIndexer().index() == 4
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
assert len(results) == 4
@@ -358,7 +386,7 @@ def test_services_search_indexers_ancestors_users(mock_push):
parent = factories.DocumentFactory(parent=grand_parent, users=[user_p])
document = factories.DocumentFactory(parent=parent, users=[user_d])
FindDocumentIndexer().index()
assert FindDocumentIndexer().index() == 3
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
assert len(results) == 3
@@ -379,7 +407,7 @@ def test_services_search_indexers_ancestors_teams(mock_push):
parent = factories.DocumentFactory(parent=grand_parent, teams=["team_p"])
document = factories.DocumentFactory(parent=parent, teams=["team_d"])
FindDocumentIndexer().index()
assert FindDocumentIndexer().index() == 3
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
assert len(results) == 3