Skip to content

Commit

Permalink
fix(mongodb): reorganize document size filter logic in aggregation
Browse files Browse the repository at this point in the history
Reorganize the logic for adding a document size filter in the MongoDB aggregation process. The changed aggregation order improves MongoDB scanning performance.
  • Loading branch information
KadeRyu committed Jan 22, 2025
1 parent af16d7e commit 6d70e90
Showing 1 changed file with 9 additions and 8 deletions.
17 changes: 9 additions & 8 deletions metadata-ingestion/src/datahub/ingestion/source/mongodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,21 +218,22 @@ def construct_schema_pymongo(
"""

aggregations: List[Dict] = []
if should_add_document_size_filter:
doc_size_field = "temporary_doc_size_field"
# create a temporary field to store the size of the document. filter on it and then remove it.
aggregations = [
{"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
{"$match": {doc_size_field: {"$lt": max_document_size}}},
{"$project": {doc_size_field: 0}},
]

if sample_size:
if use_random_sampling:
aggregations.append({"$sample": {"size": sample_size}})
else:
aggregations.append({"$limit": sample_size})

Check warning on line 226 in metadata-ingestion/src/datahub/ingestion/source/mongodb.py

View check run for this annotation

Codecov / codecov/patch

metadata-ingestion/src/datahub/ingestion/source/mongodb.py#L226

Added line #L226 was not covered by tests

if should_add_document_size_filter:
doc_size_field = "temporary_doc_size_field"
# create a temporary field to store the size of the document. filter on it and then remove it.
aggregations.extend([
{"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
{"$match": {doc_size_field: {"$lt": max_document_size}}},
{"$project": {doc_size_field: 0}},
])

documents = collection.aggregate(aggregations, allowDiskUse=True)

return construct_schema(list(documents), delimiter)
Expand Down

0 comments on commit 6d70e90

Please sign in to comment.