From 3294becf38d37a0e24155eeb0f182c43e675bb86 Mon Sep 17 00:00:00 2001 From: QuanMPhm Date: Thu, 9 May 2024 16:21:49 -0400 Subject: [PATCH] Validate combined dataframe with alias file Added an argument `--alias-file` which lets the user provide a PI alias file; if it is not given, the file is fetched from a hardcoded location in S3 storage ("PIs/alias.csv"). The alias file must be a csv in which the first value of each row is the PI's canonical name and every other value in that row is one of that PI's known aliases. An example: ``` PI1,PI1_1,PI1_2 PI2,PI2_1 ``` Given the alias file, `validate_pi_aliases` will iterate through all projects in the combined invoice and replace each encountered alias with its PI's canonical name. --- process_report/process_report.py | 48 ++++++++++++++++++++++++++++-- process_report/tests/unit_tests.py | 21 +++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/process_report/process_report.py b/process_report/process_report.py index 2c6b8d7..2e5585d 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -31,6 +31,9 @@ PI_S3_FILEPATH = "PIs/PI.csv" +ALIAS_S3_FILEPATH = "PIs/alias.csv" + + def get_institution_from_pi(institute_map, pi_uname): institution_key = pi_uname.split("@")[-1] institution_name = institute_map.get(institution_key, "") @@ -57,8 +60,7 @@ def load_old_pis(old_pi_file): pi, first_month = pi_info.strip().split(",") old_pi_dict[pi] = first_month except FileNotFoundError: - print("Applying credit 0002 failed. Old PI file does not exist") - sys.exit(1) + sys.exit("Applying credit 0002 failed. 
Old PI file does not exist") return old_pi_dict @@ -69,6 +71,21 @@ def dump_old_pis(old_pi_file, old_pi_dict: dict): f.write(f"{pi},{first_month}\n") +def load_alias(alias_file): + alias_dict = dict() + + try: + with open(alias_file) as f: + for line in f: + pi_alias_info = line.strip().split(",") + alias_dict[pi_alias_info[0]] = pi_alias_info[1:] + except FileNotFoundError: + print("Validating PI aliases failed. Alias file does not exist") + sys.exit(1) + + return alias_dict + + def is_old_pi(old_pi_dict, pi, invoice_month): first_invoice_month = old_pi_dict.get(pi, invoice_month) if compare_invoice_month(first_invoice_month, invoice_month): @@ -188,6 +205,11 @@ def main(): required=False, help="Name of csv file listing previously billed PIs. If not provided, defaults to fetching from S3", ) + parser.add_argument( + "--alias-file", + required=False, + help="Name of alias file listing PIs with aliases (and their aliases). If not provided, defaults to fetching from S3", + ) parser.add_argument( "--BU-subsidy-amount", required=True, @@ -202,11 +224,18 @@ def main(): csv_files = fetch_s3_invoices(invoice_month) else: csv_files = args.csv_files + if args.old_pi_file: old_pi_file = args.old_pi_file else: old_pi_file = fetch_s3_old_pi_file() + if args.alias_file: + alias_file = args.alias_file + else: + alias_file = fetch_s3_alias_file() + alias_dict = load_alias(alias_file) + merged_dataframe = merge_csv(csv_files) pi = [] @@ -224,6 +253,7 @@ def main(): projects = list(set(projects + timed_projects_list)) + merged_dataframe = validate_pi_aliases(merged_dataframe, alias_dict) merged_dataframe = add_institution(merged_dataframe) export_lenovo(merged_dataframe, args.Lenovo_file) remove_billables(merged_dataframe, pi, projects, args.nonbillable_file) @@ -338,6 +368,20 @@ def validate_pi_names(dataframe): return dataframe +def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict): + for pi, pi_aliases in alias_dict.items(): + 
dataframe.loc[dataframe[PI_FIELD].isin(pi_aliases), PI_FIELD] = pi + + return dataframe + + +def fetch_s3_alias_file(): + local_name = "alias.csv" + invoice_bucket = get_invoice_bucket() + invoice_bucket.download_file(ALIAS_S3_FILEPATH, local_name) + return local_name + + def apply_credits_new_pi(dataframe, old_pi_file): new_pi_credit_code = "0002" new_pi_credit_amount = 1000 diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 1ca797c..7eb7a65 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -258,6 +258,27 @@ def test_get_pi_institution(self): ) +class TestAlias(TestCase): + def setUp(self): + self.alias_dict = {"PI1": ["PI1_1", "PI1_2"], "PI2": ["PI2_1"]} + + self.data = pandas.DataFrame( + { + "Manager (PI)": ["PI1", "PI1_1", "PI1_2", "PI2_1", "PI2_1"], + } + ) + + self.answer = pandas.DataFrame( + { + "Manager (PI)": ["PI1", "PI1", "PI1", "PI2", "PI2"], + } + ) + + def test_validate_alias(self): + output = process_report.validate_pi_aliases(self.data, self.alias_dict) + self.assertTrue(self.answer.equals(output)) + + class TestCredit0002(TestCase): def setUp(self): data = {