feat(ingest/snowflake): improve accuracy of computed sample-based profile
datahub-project · Jan 10, 2024 · 258edeb · 258edeb
1 parent 21075e6
commit 258edeb
Showing 1 changed file with 6 additions and 8 deletions.
diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -680,14 +680,12 @@ def generate_dataset_profile(  # noqa: C901 (complexity)
         assert profile.rowCount is not None
         row_count: int  # used for null counts calculation
         if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
-            # We can alternatively use `self._get_dataset_rows(profile)` to get
-            # exact count of rows in sample, as actual rows involved in sample
-            # may be slightly different (more or less) than configured `sample_size`.
-            # However not doing so to start with, as that adds another query overhead
-            # plus approximate metrics should work for sampling based profiling.
-            row_count = self.config.sample_size
-        else:
-            row_count = profile.rowCount
+            # Query the exact row count of the sample using `_get_dataset_rows`.
+            # `self.config.sample_size` is not used directly because the actual row count
+            # of the sample may differ slightly (more or less) from the configured value.
+            self._get_dataset_rows(profile)
+
+        row_count = profile.rowCount
 
         for column_spec in columns_profiling_queue:
             column = column_spec.column