diff --git a/src/backend/core/management/commands/index.py b/src/backend/core/management/commands/index.py index b7eab950..6a15faac 100644 --- a/src/backend/core/management/commands/index.py +++ b/src/backend/core/management/commands/index.py @@ -21,8 +21,11 @@ class Command(BaseCommand): """Launch and log search index generation.""" logger.info("Starting to regenerate Find index...") start = time.perf_counter() - - FindDocumentIndexer().index() + count = FindDocumentIndexer().index() duration = time.perf_counter() - start - logger.info("Search index regenerated in %.2f seconds.", duration) + logger.info( + "Search index regenerated from %d document(s) in %.2f seconds.", + count, + duration, + ) diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py index 255aa3bb..69eb1f67 100644 --- a/src/backend/core/services/search_indexers.py +++ b/src/backend/core/services/search_indexers.py @@ -146,6 +146,8 @@ class BaseDocumentIndexer(ABC): Fetch documents in batches, serialize them, and push to the search backend. 
""" last_id = 0 + count = 0 + while True: documents_batch = list( models.Document.objects.filter( @@ -163,9 +165,13 @@ class BaseDocumentIndexer(ABC): serialized_batch = [ self.serialize_document(document, accesses_by_document_path) for document in documents_batch - if document.content + if document.content or document.title ] + self.push(serialized_batch) + count += len(serialized_batch) + + return count @abstractmethod def serialize_document(self, document, accesses): diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py index 63c2d305..14d47d9d 100644 --- a/src/backend/core/tests/test_services_search_indexers.py +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -304,7 +304,7 @@ def test_services_search_indexers_batches_pass_only_batch_accesses( access = factories.UserDocumentAccessFactory(document=document) expected_user_subs[str(document.id)] = str(access.user.sub) - FindDocumentIndexer().index() + assert FindDocumentIndexer().index() == 5 # Should be 3 batches: 2 + 2 + 1 assert mock_push.call_count == 3 @@ -327,6 +327,34 @@ def test_services_search_indexers_batches_pass_only_batch_accesses( assert seen_doc_ids == {str(d.id) for d in documents} +@patch.object(FindDocumentIndexer, "push") +@pytest.mark.usefixtures("indexer_settings") +def test_services_search_indexers_ignore_empty_documents(mock_push): + """ + Documents with neither content nor title should be skipped by the indexer, + while documents with only a title or only content should still be indexed. 
+ """ + document = factories.DocumentFactory() + factories.DocumentFactory(content="", title="") + empty_title = factories.DocumentFactory(title="") + empty_content = factories.DocumentFactory(content="") + + assert FindDocumentIndexer().index() == 3 + + assert mock_push.call_count == 1 + + # Make sure only non-empty documents are indexed + results = {doc["id"] for doc in mock_push.call_args[0][0]} + assert results == { + str(d.id) + for d in ( + document, + empty_content, + empty_title, + ) + } + + @patch.object(FindDocumentIndexer, "push") @pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_ancestors_link_reach(mock_push): @@ -338,7 +366,7 @@ def test_services_search_indexers_ancestors_link_reach(mock_push): parent = factories.DocumentFactory(parent=grand_parent, link_reach="public") document = factories.DocumentFactory(parent=parent, link_reach="restricted") - FindDocumentIndexer().index() + assert FindDocumentIndexer().index() == 4 results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 4 @@ -358,7 +386,7 @@ def test_services_search_indexers_ancestors_users(mock_push): parent = factories.DocumentFactory(parent=grand_parent, users=[user_p]) document = factories.DocumentFactory(parent=parent, users=[user_d]) - FindDocumentIndexer().index() + assert FindDocumentIndexer().index() == 3 results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 3 @@ -379,7 +407,7 @@ def test_services_search_indexers_ancestors_teams(mock_push): parent = factories.DocumentFactory(parent=grand_parent, teams=["team_p"]) document = factories.DocumentFactory(parent=parent, teams=["team_d"]) - FindDocumentIndexer().index() + assert FindDocumentIndexer().index() == 3 results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 3