From d9b838c1ad81ccac6b297509eaafb263abef0c54 Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Wed, 18 Sep 2024 11:10:41 -0400 Subject: [PATCH] Implemented `Processor` class and refactored some preliminary processing A `Processor` class has been added, subclassing from the `Invoice` class. This is the first step to refactor invoicing in order to seperate the processing and filtering/exporting functionalities of our current Invoice subclasses. Subclasses of `Processor` should only process invoices and manipulate its internal data, while subclasses of `Invoice` should only perform filtering and exporting, never changing any data itself. In addition to implementing `Processor`, two of its subclasses, `ValidatePIAliasProcessor` and `AddInstitutionProcessor` has been added to perform some preliminary processing steps. --- process_report/process_report.py | 81 +++++-------------- .../processors/add_institution_processor.py | 58 +++++++++++++ process_report/processors/processor.py | 8 ++ .../processors/validate_pi_alias_processor.py | 23 ++++++ process_report/tests/unit_tests.py | 27 ++++--- process_report/tests/util.py | 21 +++++ process_report/util.py | 18 ----- 7 files changed, 146 insertions(+), 90 deletions(-) create mode 100644 process_report/processors/add_institution_processor.py create mode 100644 process_report/processors/processor.py create mode 100644 process_report/processors/validate_pi_alias_processor.py diff --git a/process_report/process_report.py b/process_report/process_report.py index 4207152..33a80c3 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -2,7 +2,6 @@ import sys import datetime -import json import pandas import pyarrow @@ -15,7 +14,10 @@ bu_internal_invoice, pi_specific_invoice, ) - +from process_report.processors import ( + validate_pi_alias_processor, + add_institution_processor, +) ### PI file field names PI_PI_FIELD = "PI" @@ -51,26 +53,6 @@ ALIAS_S3_FILEPATH = "PIs/alias.csv" -def get_institution_from_pi(institute_map, pi_uname): - institution_domain = pi_uname.split("@")[-1] - for i in range(institution_domain.count(".") + 1): - if institution_name := institute_map.get(institution_domain, ""): - break - institution_domain = institution_domain[institution_domain.find(".") + 1 :] - - if institution_name == "": - print(f"Warning: PI name {pi_uname} does not match any institution!") - - return institution_name - - -def load_institute_map() -> dict: - with open("process_report/institute_map.json", "r") as f: - institute_map = json.load(f) - - return institute_map - - def load_alias(alias_file): alias_dict = dict() @@ -220,15 +202,27 @@ def main(): projects = list(set(projects + timed_projects_list)) - merged_dataframe = validate_pi_aliases(merged_dataframe, alias_dict) - merged_dataframe = add_institution(merged_dataframe) + ### Preliminary processing + + validate_pi_alias_proc = validate_pi_alias_processor.ValidatePIAliasProcessor( + "", invoice_month, merged_dataframe, alias_dict + ) + validate_pi_alias_proc.process() + + add_institute_proc = add_institution_processor.AddInstitutionProcessor( + "", invoice_month, validate_pi_alias_proc.data + ) + add_institute_proc.process() + + ### Finish preliminary processing + lenovo_inv = lenovo_invoice.LenovoInvoice( - name=args.Lenovo_file, invoice_month=invoice_month, data=merged_dataframe.copy() + name=args.Lenovo_file, invoice_month=invoice_month, data=add_institute_proc.data ) nonbillable_inv = nonbillable_invoice.NonbillableInvoice( name=args.nonbillable_file, invoice_month=invoice_month, - data=merged_dataframe.copy(), + data=add_institute_proc.data, nonbillable_pis=pi, nonbillable_projects=projects, ) @@ -239,7 +233,7 @@ def main(): billable_inv = billable_invoice.BillableInvoice( name=args.output_file, invoice_month=invoice_month, - data=merged_dataframe.copy(), + data=add_institute_proc.data.copy(), nonbillable_pis=pi, nonbillable_projects=projects, old_pi_filepath=old_pi_file, @@ -330,13 +324,6 @@ def timed_projects(timed_projects_file, invoice_date): return dataframe[mask]["Project"].to_list() -def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict): - for pi, pi_aliases in alias_dict.items(): - dataframe.loc[dataframe[PI_FIELD].isin(pi_aliases), PI_FIELD] = pi - - return dataframe - - def fetch_s3_alias_file(): local_name = "alias.csv" invoice_bucket = get_invoice_bucket() @@ -356,32 +343,6 @@ def backup_to_s3_old_pi_file(old_pi_file): invoice_bucket.upload_file(old_pi_file, f"PIs/Archive/PI {get_iso8601_time()}.csv") -def add_institution(dataframe: pandas.DataFrame): - """Determine every PI's institution name, logging any PI whose institution cannot be determined - This is performed by `get_institution_from_pi()`, which tries to match the PI's username to - a list of known institution email domains (i.e bu.edu), or to several edge cases (i.e rudolph) if - the username is not an email address. - - Exact matches are then mapped to the corresponding institution name. - - I.e "foo@bu.edu" would match with "bu.edu", which maps to the instition name "Boston University" - - The list of mappings are defined in `institute_map.json`. - """ - institute_map = load_institute_map() - dataframe = dataframe.astype({INSTITUTION_FIELD: "str"}) - for i, row in dataframe.iterrows(): - pi_name = row[PI_FIELD] - if pandas.isna(pi_name): - print(f"Project {row[PROJECT_FIELD]} has no PI") - else: - dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi( - institute_map, pi_name - ) - - return dataframe - - def export_billables(dataframe, output_file): dataframe.to_csv(output_file, index=False) diff --git a/process_report/processors/add_institution_processor.py b/process_report/processors/add_institution_processor.py new file mode 100644 index 0000000..0d7e94b --- /dev/null +++ b/process_report/processors/add_institution_processor.py @@ -0,0 +1,58 @@ +from dataclasses import dataclass +import json + +import pandas + +from process_report.invoices import invoice +from process_report.processors import processor + + +@dataclass +class AddInstitutionProcessor(processor.Processor): + @staticmethod + def _load_institute_map() -> dict: + with open("process_report/institute_map.json", "r") as f: + institute_map = json.load(f) + + return institute_map + + @staticmethod + def _get_institution_from_pi(institute_map, pi_uname): + institution_domain = pi_uname.split("@")[-1] + for i in range(institution_domain.count(".") + 1): + if institution_name := institute_map.get(institution_domain, ""): + break + institution_domain = institution_domain[institution_domain.find(".") + 1 :] + + if institution_name == "": + print(f"Warning: PI name {pi_uname} does not match any institution!") + + return institution_name + + def _add_institution(self, dataframe: pandas.DataFrame): + """Determine every PI's institution name, logging any PI whose institution cannot be determined + This is performed by `get_institution_from_pi()`, which tries to match the PI's username to + a list of known institution email domains (i.e bu.edu), or to several edge cases (i.e rudolph) if + the username is not an email address. + + Exact matches are then mapped to the corresponding institution name. + + I.e "foo@bu.edu" would match with "bu.edu", which maps to the instition name "Boston University" + + The list of mappings are defined in `institute_map.json`. + """ + institute_map = self._load_institute_map() + dataframe = dataframe.astype({invoice.INSTITUTION_FIELD: "str"}) + for i, row in dataframe.iterrows(): + pi_name = row[invoice.PI_FIELD] + if pandas.isna(pi_name): + print(f"Project {row[invoice.PROJECT_FIELD]} has no PI") + else: + dataframe.at[ + i, invoice.INSTITUTION_FIELD + ] = self._get_institution_from_pi(institute_map, pi_name) + + return dataframe + + def _process(self): + self.data = self._add_institution(self.data) diff --git a/process_report/processors/processor.py b/process_report/processors/processor.py new file mode 100644 index 0000000..5a88fd6 --- /dev/null +++ b/process_report/processors/processor.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass + +from process_report.invoices import invoice + + +@dataclass +class Processor(invoice.Invoice): + pass diff --git a/process_report/processors/validate_pi_alias_processor.py b/process_report/processors/validate_pi_alias_processor.py new file mode 100644 index 0000000..a702858 --- /dev/null +++ b/process_report/processors/validate_pi_alias_processor.py @@ -0,0 +1,23 @@ +from dataclasses import dataclass + +import pandas + +from process_report.invoices import invoice +from process_report.processors import processor + + +@dataclass +class ValidatePIAliasProcessor(processor.Processor): + alias_map: dict + + @staticmethod + def _validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict): + for pi, pi_aliases in alias_dict.items(): + dataframe.loc[ + dataframe[invoice.PI_FIELD].isin(pi_aliases), invoice.PI_FIELD + ] = pi + + return dataframe + + def _process(self): + self.data = self._validate_pi_aliases(self.data, self.alias_map) diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 5136ac9..68ad2ca 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -219,7 +219,7 @@ def test_export_pi(self): self.assertNotIn("ProjectC", pi_df["Project - Allocation"].tolist()) -class TestGetInstitute(TestCase): +class TestAddInstituteProcessor(TestCase): def test_get_pi_institution(self): institute_map = { "harvard.edu": "Harvard University", @@ -248,31 +248,34 @@ def test_get_pi_institution(self): "g@bidmc.harvard.edu": "Beth Israel Deaconess Medical Center", } + add_institute_proc = test_utils.new_add_institution_processor() + for pi_email, answer in answers.items(): self.assertEqual( - process_report.get_institution_from_pi(institute_map, pi_email), answer + add_institute_proc._get_institution_from_pi(institute_map, pi_email), + answer, ) -class TestAlias(TestCase): - def setUp(self): - self.alias_dict = {"PI1": ["PI1_1", "PI1_2"], "PI2": ["PI2_1"]} - - self.data = pandas.DataFrame( +class TestValidateAliasProcessor(TestCase): + def test_validate_alias(self): + alias_map = {"PI1": ["PI1_1", "PI1_2"], "PI2": ["PI2_1"]} + test_data = pandas.DataFrame( { "Manager (PI)": ["PI1", "PI1_1", "PI1_2", "PI2_1", "PI2_1"], } ) - - self.answer = pandas.DataFrame( + answer_data = pandas.DataFrame( { "Manager (PI)": ["PI1", "PI1", "PI1", "PI2", "PI2"], } ) - def test_validate_alias(self): - output = process_report.validate_pi_aliases(self.data, self.alias_dict) - self.assertTrue(self.answer.equals(output)) + validate_pi_alias_proc = test_utils.new_validate_pi_alias_processor( + data=test_data, alias_map=alias_map + ) + validate_pi_alias_proc.process() + self.assertTrue(answer_data.equals(validate_pi_alias_proc.data)) class TestMonthUtils(TestCase): diff --git a/process_report/tests/util.py b/process_report/tests/util.py index 95148d1..d7ddef4 100644 --- a/process_report/tests/util.py +++ b/process_report/tests/util.py @@ -7,6 +7,11 @@ pi_specific_invoice, ) +from process_report.processors import ( + add_institution_processor, + validate_pi_alias_processor, +) + def new_base_invoice( name="", @@ -52,3 +57,19 @@ def new_pi_specific_invoice( invoice_month, data, ) + + +def new_add_institution_processor( + name="", + invoice_month="0000-00", + data=pandas.DataFrame(), +): + return add_institution_processor.AddInstitutionProcessor(name, invoice_month, data) + + +def new_validate_pi_alias_processor( + name="", invoice_month="0000-00", data=pandas.DataFrame(), alias_map={} +): + return validate_pi_alias_processor.ValidatePIAliasProcessor( + name, invoice_month, data, alias_map + ) diff --git a/process_report/util.py b/process_report/util.py index 45b9ed4..5c4f190 100644 --- a/process_report/util.py +++ b/process_report/util.py @@ -1,6 +1,5 @@ import os import datetime -import json import logging import functools @@ -27,23 +26,6 @@ def get_invoice_bucket(): return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing")) -def get_institution_from_pi(institute_map, pi_uname): - institution_key = pi_uname.split("@")[-1] - institution_name = institute_map.get(institution_key, "") - - if institution_name == "": - logger.warn(f"PI name {pi_uname} does not match any institution!") - - return institution_name - - -def load_institute_map() -> dict: - with open("process_report/institute_map.json", "r") as f: - institute_map = json.load(f) - - return institute_map - - def get_iso8601_time(): return datetime.datetime.now().strftime("%Y%m%dT%H%M%SZ")