Comparator generate delta even with column differences (DOI-DO#18)

agilesix · Dec 2, 2024 · 2b397ae · 2b397ae
1 parent 273a863
commit 2b397ae
Showing 1 changed file with 120 additions and 99 deletions.
diff --git a/data/data-pipeline/data_pipeline/comparator.py b/data/data-pipeline/data_pipeline/comparator.py
@@ -1,6 +1,5 @@
 import sys
 import click
-import difflib
 import pandas as pd
 from pathlib import Path
 
@@ -16,6 +15,26 @@
 pd.set_option("display.width", 10000)
 pd.set_option("display.colheader_justify", "left")
 
+result_text = []
+
+
+def _add_text(text: str):
+    """Add a line to the output result.
+
+    Args:
+        line (str): a line to add
+    """
+    result_text.append(text)
+
+
+def _get_result_doc() -> str:
+    """Gets the document with results.
+
+    Returns:
+        str: the results document as text
+    """
+    return "".join(result_text)
+
 
 def _read_from_file(file_path: Path):
     """Read a CSV file into a Dataframe."""
@@ -65,7 +84,9 @@ def cli():
     default=constants.DATA_SCORE_CSV_FULL_FILE_PATH,
     help="Compare to the specified score CSV file instead of downloading from production",
 )
-def compare_score(compare_to_version: str, compare_to_file: str, local_score_file: str):
+def compare_score(
+    compare_to_version: str, compare_to_file: str, local_score_file: str
+):
     """Compares the score in the production environment to the locally generated score. The
     algorithm is pretty simple:
 
@@ -81,10 +102,6 @@ def compare_score(compare_to_version: str, compare_to_file: str, local_score_fil
     FLOAT_ROUNDING_PLACES = 2
     WORKING_PATH = constants.TMP_PATH / "Comparator" / "Score"
 
-    summary = "# Score Comparison Summary\n"
-    summary += f"Hi! I'm the Score Comparator. I compared the score in production (version {compare_to_version}) to the"
-    summary += " locally calculated score. Here are the results.\n"
-
     log_title("Compare Score", "Compare production score to local score")
 
     if compare_to_file:
@@ -118,59 +135,73 @@ def compare_score(compare_to_version: str, compare_to_file: str, local_score_fil
     production_score_df_columns = sorted(
         production_score_df.columns.array.tolist()
     )
+    extra_cols_in_local = set(local_score_df_columns) - set(
+        production_score_df_columns
+    )
+    extra_cols_in_prod = set(production_score_df_columns) - set(
+        local_score_df_columns
+    )
 
-    log_info("Comparing columns (production vs local). Differences are: ")
-    summary += "\n## Columns\n"
-    summary += "I compared the columns. Here's what I found.\n"
-
-    col_diff = difflib.unified_diff(
-        production_score_df_columns, local_score_df_columns
+    _add_text("# Score Comparison Summary\n")
+    _add_text(
+        f"Hi! I'm the Score Comparator. I compared the score in production (version {compare_to_version}) to the"
+        " locally calculated score. Here are the results:\n\n"
     )
-    col_diff_res = ""
-    for d in col_diff:
-        col_diff_res += str(d) + "\n"
 
-    if len(col_diff_res) == 0:
-        log_info("None. Columns are the same")
-        summary += "* There are no differences in the column names.\n"
+    #####################
+    # Compare the columns
+    #####################
+    log_info("Comparing columns (production vs local)")
+    _add_text("## Columns\n")
+    if len(extra_cols_in_local) == 0 and len(extra_cols_in_prod) == 0:
+        _add_text("* There are no differences in the column names.\n")
     else:
-        log_info("There are differences. The diff is:")
-        log_info(col_diff_res)
-        summary += f"* There are differences in the column names. Here's a diff:\n{col_diff_res}\n"
+        _add_text(
+            f"* There are {len(extra_cols_in_local)} columns that were added as compared to the production score."
+        )
+        if len(extra_cols_in_local) > 0:
+            _add_text(f" Those colums are:\n{extra_cols_in_local}")
+        _add_text(
+            f"\n* There are {len(extra_cols_in_prod)} columns that were removed as compared to the production score."
+        )
+        if len(extra_cols_in_prod) > 0:
+            _add_text(f" Those colums are:\n{extra_cols_in_prod}")
 
+    ####################
+    # Compare the scores
+    ####################
     log_info("Comparing dataframe contents (production vs local)")
-    summary += "\n## Scores\n"
-    summary += "I compared the scores, too. Here's what I found.\n"
+    _add_text("\n\n## Scores\n")
 
     production_row_count = len(production_score_df.index)
     local_row_count = len(local_score_df.index)
 
-    summary += f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}."
-    summary += (
-        " They match!\n"
-        if production_row_count == local_row_count
-        else " They don't match.\n"
+    _add_text(
+        f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}."
     )
+    if production_row_count == local_row_count:
+        _add_text(" They match!\n")
+    else:
+        _add_text(" They don't match. The differences are:\n")
+        _add_text(
+            "  * New tracts added to the local score are:\n"
+            f"{local_score_df.index.difference(production_score_df.index).to_list()}"
+            "\n  * Tracts removed from the local score are:\n"
+            f"{production_score_df.index.difference(local_score_df.index).to_list()}"
+            "\n"
+        )
 
     production_total_population = production_score_df["Total population"].sum()
     local_total_population = local_score_df["Total population"].sum()
 
-    log_info(
-        f"The total population in all census tracts in production is {production_total_population:,}."
+    _add_text(
+        f"* The total population in all census tracts in the production score is {production_total_population:,}. "
+        f"The total population in all census tracts locally is {local_total_population:,}. "
     )
-    log_info(
-        f"The total population in all census tracts locally is {local_total_population:,}."
-    )
-    log_info(
-        f"The difference in population is {abs(production_total_population - local_total_population):,}."
-    )
-
-    summary += f"* The total population in all census tracts in the production score is {production_total_population:,}."
-    summary += f" The total population in all census tracts locally is {local_total_population:,}."
-    summary += (
-        " They match!\n"
+    _add_text(
+        "They match!\n"
         if production_total_population == local_total_population
-        else f"  The difference is {abs(production_total_population - local_total_population):,}.\n"
+        else f"The difference is {abs(production_total_population - local_total_population):,}.\n"
     )
 
     production_disadvantaged_tracts_df = production_score_df.query(
@@ -196,30 +227,18 @@ def compare_score(compare_to_version: str, compare_to_file: str, local_score_fil
         / local_total_population
     )
 
-    log_info(
-        f"There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score."
-    )
-    log_info(
-        f"This represents {production_pct_of_population_represented:.1%} of the total population."
-    )
-    log_info(
-        f"There are {len(local_disadvantaged_tracts_set):,} in the locally generated score."
+    _add_text(
+        f"* There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score representing"
+        f" {production_pct_of_population_represented:.1%} of the total population, and {len(local_disadvantaged_tracts_set):,}"
     )
-    log_info(
-        f"This represents {local_pct_of_population_represented:.1%} of the total population."
+    _add_text(
+        f" in the locally generated score representing {local_pct_of_population_represented:.1%} of the total population."
     )
-    log_info(
-        f"The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set)):,} tract(s)."
-    )
-
-    summary += f"* There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score representing"
-    summary += f" {production_pct_of_population_represented:.1%} of the total population, and {len(local_disadvantaged_tracts_set):,}"
-    summary += f" in the locally generated score representing {local_pct_of_population_represented:.1%} of the total population."
-    summary += (
-        " The number of tracts match!\n"
+    _add_text(
+        " The number of tracts match!\n "
         if len(production_disadvantaged_tracts_set)
         == len(local_disadvantaged_tracts_set)
-        else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n"
+        else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n "
     )
 
     removed_tracts = production_disadvantaged_tracts_set.difference(
@@ -228,58 +247,60 @@ def compare_score(compare_to_version: str, compare_to_file: str, local_score_fil
     added_tracts = local_disadvantaged_tracts_set.difference(
         production_disadvantaged_tracts_set
     )
-    removed_tracts_str = ", ".join(list(removed_tracts))
-    added_tracts_str = ", ".join(list(added_tracts))
-
-    log_info(
-        f"There are {len(removed_tracts):,} tract(s) marked as disadvantaged in the prod "
-        "score that are not disadvantaged in the local score. Those tracts are:"
-    )
-    log_info(removed_tracts_str)
-    log_info(
-        f"There are {len(added_tracts):,} tract(s) marked as disadvantaged in the local "
-        "score that are not disadvantaged in the prod score. Those tracts are:"
-    )
-    log_info(added_tracts_str)
-
-    summary += (
+    _add_text(
         f"* There are {len(removed_tracts):,} tract(s) marked as disadvantaged in the production score that are not disadvantaged in the locally"
-        f" generated score (i.e. disadvantaged tracts that were removed by the new score). Those tracts are:\n{removed_tracts_str}\n"
-        f" There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the"
-        f" production score (i.e. disadvantaged tracts that were added by the new score). Those tracts are:\n{added_tracts_str}\n\n"
+        f" generated score (i.e. disadvantaged tracts that were removed by the new score). "
     )
+    if len(removed_tracts) > 0:
+        _add_text(f"Those tracts are:\n{removed_tracts}\n")
 
+    _add_text(
+        f"* There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the"
+        f" production score (i.e. disadvantaged tracts that were added by the new score). "
+    )
+    if len(added_tracts) > 0:
+        _add_text(f"Those tracts are:\n{added_tracts}\n")
+
+    ################
+    # Create a delta
+    ################
+    _add_text("\n## Delta\n")
+    # First we make the columns on two dataframes to be the same to be able to compare
+    trimmed_prod_df = production_score_df.drop(extra_cols_in_prod, axis=1)
+    trimmed_local_df = local_score_df.drop(extra_cols_in_local, axis=1)
     try:
 
-        comparison_results_df = production_score_df.compare(
-            local_score_df, align_axis=1, keep_shape=False, keep_equal=False
+        comparison_results_df = trimmed_prod_df.compare(
+            trimmed_local_df, align_axis=1, keep_shape=False, keep_equal=False
         ).rename({"self": "Production", "other": "Local"}, axis=1, level=1)
 
-        summary += "* I compared all values across all census tracts."
-        summary += f" There are {len(comparison_results_df.index):,} tracts with at least one difference."
-        summary += " Please examine the logs or run the score comparison locally to view them all.\n"
-
-        log_info(
-            f"There are {len(comparison_results_df.index)} rows with any differences."
+        _add_text(
+            "* I compared all values across all census tracts. Note this ignores any columns that have been added or removed."
+            f" There are {len(comparison_results_df.index):,} tracts with at least one difference.\n"
         )
-        if len(comparison_results_df.index) > 0:
-            log_info("Those differences are:")
-            log_info("\n" + str(comparison_results_df))
-            comparison_path = WORKING_PATH / "deltas.csv"
-            comparison_results_df.to_csv(path_or_buf=comparison_path)
-            log_info(f"Wrote comparison results to {comparison_path}")
+
+        comparison_path = WORKING_PATH / "deltas.csv"
+        comparison_results_df.to_csv(path_or_buf=comparison_path)
+
+        _add_text(f"* Wrote comparison results to {comparison_path}")
 
     except ValueError as e:
-        summary += "* I could not run a full comparison. This is likely because there are column or index (census tract) differences."
-        summary += " Please examine the logs or run the score comparison locally to find out more.\n"
-        log_info(
-            f"Encountered an exception while performing the comparison: {repr(e)}"
+        _add_text(
+            "* I could not run a full comparison. This is likely because there are column or index (census tract) differences."
+            " Please examine the logs or run the score comparison locally to find out more.\n"
         )
+        _add_text(
+            f"Encountered an exception while performing the comparison: {repr(e)}\n"
+        )
+
+    result_doc = _get_result_doc()
+    print(result_doc)
 
+    # Write the report
     summary_path = WORKING_PATH / "comparison-summary.md"
 
     with open(summary_path, "w", encoding="utf-8") as f:
-        f.write(summary)
+        f.write(result_doc)
         log_info(f"Wrote comparison summary to {summary_path}")
 
     log_goodbye()