✨(backend) Index partially empty documents
Only documents with neither a title nor content are ignored by the indexer.
This commit is contained in:
committed by
Quentin BEY
parent
01c31ddd74
commit
331a94ad2f
@@ -21,8 +21,11 @@ class Command(BaseCommand):
|
|||||||
"""Launch and log search index generation."""
|
"""Launch and log search index generation."""
|
||||||
logger.info("Starting to regenerate Find index...")
|
logger.info("Starting to regenerate Find index...")
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
|
count = FindDocumentIndexer().index()
|
||||||
FindDocumentIndexer().index()
|
|
||||||
|
|
||||||
duration = time.perf_counter() - start
|
duration = time.perf_counter() - start
|
||||||
logger.info("Search index regenerated in %.2f seconds.", duration)
|
logger.info(
|
||||||
|
"Search index regenerated from %d document(s) in %.2f seconds.",
|
||||||
|
count,
|
||||||
|
duration,
|
||||||
|
)
|
||||||
|
|||||||
@@ -146,6 +146,8 @@ class BaseDocumentIndexer(ABC):
|
|||||||
Fetch documents in batches, serialize them, and push to the search backend.
|
Fetch documents in batches, serialize them, and push to the search backend.
|
||||||
"""
|
"""
|
||||||
last_id = 0
|
last_id = 0
|
||||||
|
count = 0
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
documents_batch = list(
|
documents_batch = list(
|
||||||
models.Document.objects.filter(
|
models.Document.objects.filter(
|
||||||
@@ -163,9 +165,13 @@ class BaseDocumentIndexer(ABC):
|
|||||||
serialized_batch = [
|
serialized_batch = [
|
||||||
self.serialize_document(document, accesses_by_document_path)
|
self.serialize_document(document, accesses_by_document_path)
|
||||||
for document in documents_batch
|
for document in documents_batch
|
||||||
if document.content
|
if document.content or document.title
|
||||||
]
|
]
|
||||||
|
|
||||||
self.push(serialized_batch)
|
self.push(serialized_batch)
|
||||||
|
count += len(serialized_batch)
|
||||||
|
|
||||||
|
return count
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def serialize_document(self, document, accesses):
|
def serialize_document(self, document, accesses):
|
||||||
|
|||||||
@@ -304,7 +304,7 @@ def test_services_search_indexers_batches_pass_only_batch_accesses(
|
|||||||
access = factories.UserDocumentAccessFactory(document=document)
|
access = factories.UserDocumentAccessFactory(document=document)
|
||||||
expected_user_subs[str(document.id)] = str(access.user.sub)
|
expected_user_subs[str(document.id)] = str(access.user.sub)
|
||||||
|
|
||||||
FindDocumentIndexer().index()
|
assert FindDocumentIndexer().index() == 5
|
||||||
|
|
||||||
# Should be 3 batches: 2 + 2 + 1
|
# Should be 3 batches: 2 + 2 + 1
|
||||||
assert mock_push.call_count == 3
|
assert mock_push.call_count == 3
|
||||||
@@ -327,6 +327,34 @@ def test_services_search_indexers_batches_pass_only_batch_accesses(
|
|||||||
assert seen_doc_ids == {str(d.id) for d in documents}
|
assert seen_doc_ids == {str(d.id) for d in documents}
|
||||||
|
|
||||||
|
|
||||||
|
@patch.object(FindDocumentIndexer, "push")
|
||||||
|
@pytest.mark.usefixtures("indexer_settings")
|
||||||
|
def test_services_search_indexers_ignore_empty_documents(mock_push):
|
||||||
|
"""
|
||||||
|
Documents with neither a title nor content should be ignored by the indexer,
|
||||||
|
while documents missing only a title or only content should still be indexed.
|
||||||
|
"""
|
||||||
|
document = factories.DocumentFactory()
|
||||||
|
factories.DocumentFactory(content="", title="")
|
||||||
|
empty_title = factories.DocumentFactory(title="")
|
||||||
|
empty_content = factories.DocumentFactory(content="")
|
||||||
|
|
||||||
|
assert FindDocumentIndexer().index() == 3
|
||||||
|
|
||||||
|
assert mock_push.call_count == 1
|
||||||
|
|
||||||
|
# Make sure only non-empty documents are indexed
|
||||||
|
results = {doc["id"] for doc in mock_push.call_args[0][0]}
|
||||||
|
assert results == {
|
||||||
|
str(d.id)
|
||||||
|
for d in (
|
||||||
|
document,
|
||||||
|
empty_content,
|
||||||
|
empty_title,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@patch.object(FindDocumentIndexer, "push")
|
@patch.object(FindDocumentIndexer, "push")
|
||||||
@pytest.mark.usefixtures("indexer_settings")
|
@pytest.mark.usefixtures("indexer_settings")
|
||||||
def test_services_search_indexers_ancestors_link_reach(mock_push):
|
def test_services_search_indexers_ancestors_link_reach(mock_push):
|
||||||
@@ -338,7 +366,7 @@ def test_services_search_indexers_ancestors_link_reach(mock_push):
|
|||||||
parent = factories.DocumentFactory(parent=grand_parent, link_reach="public")
|
parent = factories.DocumentFactory(parent=grand_parent, link_reach="public")
|
||||||
document = factories.DocumentFactory(parent=parent, link_reach="restricted")
|
document = factories.DocumentFactory(parent=parent, link_reach="restricted")
|
||||||
|
|
||||||
FindDocumentIndexer().index()
|
assert FindDocumentIndexer().index() == 4
|
||||||
|
|
||||||
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
|
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
|
||||||
assert len(results) == 4
|
assert len(results) == 4
|
||||||
@@ -358,7 +386,7 @@ def test_services_search_indexers_ancestors_users(mock_push):
|
|||||||
parent = factories.DocumentFactory(parent=grand_parent, users=[user_p])
|
parent = factories.DocumentFactory(parent=grand_parent, users=[user_p])
|
||||||
document = factories.DocumentFactory(parent=parent, users=[user_d])
|
document = factories.DocumentFactory(parent=parent, users=[user_d])
|
||||||
|
|
||||||
FindDocumentIndexer().index()
|
assert FindDocumentIndexer().index() == 3
|
||||||
|
|
||||||
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
|
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
|
||||||
assert len(results) == 3
|
assert len(results) == 3
|
||||||
@@ -379,7 +407,7 @@ def test_services_search_indexers_ancestors_teams(mock_push):
|
|||||||
parent = factories.DocumentFactory(parent=grand_parent, teams=["team_p"])
|
parent = factories.DocumentFactory(parent=grand_parent, teams=["team_p"])
|
||||||
document = factories.DocumentFactory(parent=parent, teams=["team_d"])
|
document = factories.DocumentFactory(parent=parent, teams=["team_d"])
|
||||||
|
|
||||||
FindDocumentIndexer().index()
|
assert FindDocumentIndexer().index() == 3
|
||||||
|
|
||||||
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
|
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
|
||||||
assert len(results) == 3
|
assert len(results) == 3
|
||||||
|
|||||||
Reference in New Issue
Block a user