From c5ff060b3d0c567d0c22b4ed5b36bc1091c960cc Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Wed, 3 Apr 2024 11:52:47 -0400 Subject: [PATCH] Added processing to apply project credits, determine institution name for each PI, and exporting HU and BU invoices --- process_report/institute_map.json | 36 ++++++ process_report/process_report.py | 189 +++++++++++++++++++++++++++-- process_report/tests/unit_tests.py | 101 ++++++++++++++- 3 files changed, 314 insertions(+), 12 deletions(-) create mode 100644 process_report/institute_map.json diff --git a/process_report/institute_map.json b/process_report/institute_map.json new file mode 100644 index 0000000..cd2d60d --- /dev/null +++ b/process_report/institute_map.json @@ -0,0 +1,36 @@ +{ + "northeastern.edu" : "Northeastern University", + "bu.edu" : "Boston University", + "bentley.edu" : "Bentley", + "uri.edu" : "University of Rhode Island", + "redhat.com" : "Red Hat", + "childrens.harvard.edu" : "Boston Childrens Hospital", + "mclean.harvard.edu" : "McLean Hospital", + "meei.harvard.edu" : "Massachusetts Eye & Ear", + "dfci.harvard.edu" : "Dana-Farber Cancer Institute", + "bwh.harvard.edu" : "Brigham and Women's Hospital", + "bidmc.harvard.edu" : "Beth Israel Deaconess Medical Center", + "fas.harvard.edu" : "Harvard University", + "cga.harvard.edu" : "Harvard University", + "iq.harvard.edu" : "Harvard University", + "hks.harvard.edu" : "Harvard University", + "hsph.harvard.edu" : "Harvard University", + "seas.harvard.edu" : "Harvard University", + "gse.harvard.edu" : "Harvard University", + "gov.harvard.edu" : "Harvard University", + "oeb.harvard.edu" : "Harvard University", + "harvard.edu" : "Harvard University", + "wpi.edu" : "Worcester Polytechnic Institute", + "mit.edu" : "Massachusetts Institute of Technology", + "umass.edu" : "University of Massachusetts Amherst", + "uml.edu" : "University of Massachusetts Lowell", + "codeforboston.org" : "Code For Boston", + "yale.edu" : "Yale University", + "mmsh" : "Harvard 
University", + "gstuart" : "University of Massachusetts Amherst", + "rudolph" : "Boston Childrens Hospital", + "robbaron" : "Boston University", + "kmdalton" : "Harvard University", + "mzink" : "University of Massachusetts Amherst", + "francesco.pontiggia" : "Harvard University" +} diff --git a/process_report/process_report.py b/process_report/process_report.py index cfcdc70..846cccf 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -1,9 +1,67 @@ import argparse import os +import sys +import json import pandas +### Invoice field names +INVOICE_DATE_FIELD = 'Invoice Month' +PROJECT_FIELD = 'Project - Allocation' +PROJECT_ID_FIELD = 'Project - Allocation ID' +PI_FIELD = 'Manager (PI)' +INVOICE_EMAIL_FIELD = 'Invoice Email' +INVOICE_ADDRESS_FIELD = 'Invoice Address' +INSTITUTION_FIELD = 'Institution' +INSTITUTION_ID_FIELD = 'Institution - Specific Code' +SU_HOURS_FIELD = 'SU Hours (GBhr or SUhr)' +SU_TYPE_FIELD = 'SU Type' +COST_FIELD = 'Cost' +CREDIT_FIELD = 'Credit' +CREDIT_CODE_FIELD = 'Credit Code' +BALANCE_FIELD = 'Balance' +### + + +def get_institution_from_pi(institute_map, pi_uname): + institution_key = pi_uname.split('@')[-1] + institution_name = institute_map.get(institution_key, '') + + if institution_name == '': + print(f"Warning: PI name {pi_uname} does not match any institution!") + + return institution_name + + +def load_institute_map() -> dict: + with open('institute_map.json', 'r') as f: + institute_map = json.load(f) + + return institute_map + + +def load_old_pis(old_pi_file): + old_pi_dict = dict() + + try: + with open(old_pi_file) as f: + for pi_info in f: + pi, first_month = pi_info.strip().split(',') + old_pi_dict[pi] = first_month + except FileNotFoundError: + print('Applying credit 0002 failed. 
Old PI file does not exist') + sys.exit(1) + + return old_pi_dict + + +def is_old_pi(old_pi_dict, pi, invoice_month): + if pi in old_pi_dict and old_pi_dict[pi] != invoice_month: + return True + return False + + def main(): """Remove non-billable PIs and projects""" @@ -41,6 +99,23 @@ def main(): default="pi_invoices", help="Name of output folder containing pi-specific invoice csvs" ) + parser.add_argument( + "--HU-invoice-file", + required=False, + default="HU_only.csv", + help="Name of output csv for HU invoices" + ) + parser.add_argument( + "--HU-BU-invoice-file", + required=False, + default="HU_BU.csv", + help="Name of output csv for HU and BU invoices" + ) + parser.add_argument( + "--old-pi-file", + required=False, + help="Name of csv file listing previously billed PIs" + ) args = parser.parse_args() merged_dataframe = merge_csv(args.csv_files) @@ -60,9 +135,16 @@ def main(): projects = list(set(projects + timed_projects_list)) - billable_projects = remove_non_billables(merged_dataframe, pi, projects, args.output_file) + merged_dataframe = add_institution(merged_dataframe) remove_billables(merged_dataframe, pi, projects, "non_billable.csv") + + billable_projects = remove_non_billables(merged_dataframe, pi, projects) + billable_projects = validate_pi_names(billable_projects) + credited_projects = apply_credits_new_pi(billable_projects, args.old_pi_file) + export_billables(credited_projects, args.output_file) export_pi_billables(billable_projects, args.output_folder) + export_HU_only(billable_projects, args.HU_invoice_file) + export_HU_BU(billable_projects, args.HU_BU_invoice_file) def merge_csv(files): @@ -83,7 +165,7 @@ def get_invoice_date(dataframe): Note that it only checks the first entry because it should be the same for every row. 
""" - invoice_date_str = dataframe['Invoice Month'][0] + invoice_date_str = dataframe[INVOICE_DATE_FIELD][0] invoice_date = pandas.to_datetime(invoice_date_str, format='%Y-%m') return invoice_date @@ -100,10 +182,9 @@ def timed_projects(timed_projects_file, invoice_date): return dataframe[mask]['Project'].to_list() -def remove_non_billables(dataframe, pi, projects, output_file): +def remove_non_billables(dataframe, pi, projects): """Removes projects and PIs that should not be billed from the dataframe""" - filtered_dataframe = dataframe[~dataframe['Manager (PI)'].isin(pi) & ~dataframe['Project - Allocation'].isin(projects)] - filtered_dataframe.to_csv(output_file, index=False) + filtered_dataframe = dataframe[~dataframe[PI_FIELD].isin(pi) & ~dataframe[PROJECT_FIELD].isin(projects)] return filtered_dataframe @@ -112,21 +193,107 @@ def remove_billables(dataframe, pi, projects, output_file): So this *keeps* the projects/pis that should not be billed. """ - filtered_dataframe = dataframe[dataframe['Manager (PI)'].isin(pi) | dataframe['Project - Allocation'].isin(projects)] + filtered_dataframe = dataframe[dataframe[PI_FIELD].isin(pi) | dataframe[PROJECT_FIELD].isin(projects)] filtered_dataframe.to_csv(output_file, index=False) + +def validate_pi_names(dataframe): + invalid_pi_projects = dataframe[pandas.isna(dataframe[PI_FIELD])] + for i, row in invalid_pi_projects.iterrows(): + print(f'Warning: Project {row[PROJECT_FIELD]} has empty PI field') + dataframe = dataframe[~pandas.isna(dataframe[PI_FIELD])] + + return dataframe + + +def export_billables(dataframe, output_file): + dataframe.to_csv(output_file, index=False) + + def export_pi_billables(dataframe: pandas.DataFrame, output_folder): if not os.path.exists(output_folder): os.mkdir(output_folder) - invoice_month = dataframe['Invoice Month'].iat[0] - pi_list = dataframe['Manager (PI)'].unique() + invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0] + pi_list = dataframe[PI_FIELD].unique() for pi in pi_list: - 
pi_projects = dataframe[dataframe['Manager (PI)'] == pi] - pi_instituition = pi_projects['Institution'].iat[0] + if pandas.isna(pi): + continue + pi_projects = dataframe[dataframe[PI_FIELD] == pi] + pi_instituition = pi_projects[INSTITUTION_FIELD].iat[0] pi_projects.to_csv(output_folder + f"/{pi_instituition}_{pi}_{invoice_month}.csv") - + + +def apply_credits_new_pi(dataframe, old_pi_file): + new_pi_credit_code = "0002" + new_pi_credit_amount = 1000 + + dataframe[CREDIT_FIELD] = None + dataframe[CREDIT_CODE_FIELD] = None + dataframe[BALANCE_FIELD] = 0 + + old_pi_dict = load_old_pis(old_pi_file) + + current_pi_list = dataframe[PI_FIELD].unique() + invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0] + + for pi in current_pi_list: + pi_projects = dataframe[dataframe[PI_FIELD] == pi] + + if is_old_pi(old_pi_dict, pi, invoice_month): + for i, row in pi_projects.iterrows(): + dataframe.at[i, BALANCE_FIELD] = row[COST_FIELD] + else: + remaining_credit = new_pi_credit_amount + for i, row in pi_projects.iterrows(): + project_cost = row[COST_FIELD] + applied_credit = min(project_cost, remaining_credit) + + dataframe.at[i, CREDIT_FIELD] = applied_credit + dataframe.at[i, CREDIT_CODE_FIELD] = new_pi_credit_code + dataframe.at[i, BALANCE_FIELD] = row[COST_FIELD] - applied_credit + remaining_credit -= applied_credit + + if remaining_credit == 0: + break + + return dataframe + + +def add_institution(dataframe: pandas.DataFrame): + """Determine every PI's institution name, logging any PI whose institution cannot be determined + This is performed by `get_institution_from_pi()`, which tries to match the PI's username to + a list of known institution email domains (i.e bu.edu), or to several edge cases (i.e rudolph) if + the username is not an email address. + + Exact matches are then mapped to the corresponding institution name. 
+ + E.g. "foo@bu.edu" would match with "bu.edu", which maps to the institution name "Boston University" + + The list of mappings is defined in `institute_map.json`. + """ + institute_map = load_institute_map() + for i, row in dataframe.iterrows(): + pi_name = row[PI_FIELD] + if pandas.isna(pi_name): + print(f"Project {row[PROJECT_FIELD]} has no PI") + else: + dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi(institute_map, pi_name) + + return dataframe + + +def export_HU_only(dataframe, output_file): + HU_projects = dataframe[dataframe[INSTITUTION_FIELD] == 'Harvard University'] + HU_projects.to_csv(output_file) + + +def export_HU_BU(dataframe, output_file): + HU_BU_projects = dataframe[(dataframe[INSTITUTION_FIELD] == 'Harvard University') | + (dataframe[INSTITUTION_FIELD] == 'Boston University')] + HU_BU_projects.to_csv(output_file) + if __name__ == "__main__": main() diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index d63c7d4..7db5e89 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -1,8 +1,11 @@ from unittest import TestCase +from unittest import skipIf import tempfile import pandas import os +import math from textwrap import dedent + from process_report import process_report class TestGetInvoiceDate(TestCase): @@ -70,7 +73,8 @@ def tearDown(self): os.remove(self.output_file2.name) def test_remove_non_billables(self): - process_report.remove_non_billables(self.dataframe, self.pi_to_exclude, self.projects_to_exclude, self.output_file.name) + billables_df = process_report.remove_non_billables(self.dataframe, self.pi_to_exclude, self.projects_to_exclude) + process_report.export_billables(billables_df, self.output_file.name) result_df = pandas.read_csv(self.output_file.name) @@ -175,3 +179,98 @@ def test_export_pi(self): self.assertNotIn('ProjectA', pi_df['Project - Allocation'].tolist()) self.assertNotIn('ProjectB', pi_df['Project - Allocation'].tolist()) 
self.assertNotIn('ProjectC', pi_df['Project - Allocation'].tolist()) + + +class TestGetInstitute(TestCase): + def test_get_pi_institution(self): + + institute_map = { + "harvard.edu" : "Harvard University", + "bu.edu" : "Boston University", + "bentley.edu" : "Bentley", + "mclean.harvard.edu" : "McLean Hospital", + "meei.harvard.edu" : "Massachusetts Eye & Ear", + "dfci.harvard.edu" : "Dana-Farber Cancer Institute", + "northeastern.edu" : "Northeastern University", + } + + self.assertEqual( + process_report.get_institution_from_pi(institute_map, "quanmp@bu.edu"), "Boston University" + ) + self.assertEqual( + process_report.get_institution_from_pi(institute_map, "c@mclean.harvard.edu"), "McLean Hospital" + ) + self.assertEqual( + process_report.get_institution_from_pi(institute_map, "b@harvard.edu"), "Harvard University" + ) + self.assertEqual( + process_report.get_institution_from_pi(institute_map, "fake"), "" + ) + self.assertEqual( + process_report.get_institution_from_pi(institute_map, "pi@northeastern.edu"), "Northeastern University" + ) + + +class TestCredit0002(TestCase): + def setUp(self): + + data = { + 'Invoice Month': ['2024-03','2024-03','2024-03','2024-03','2024-03','2024-03'], + 'Manager (PI)': ['PI1', 'PI1', 'PI2', 'PI3', 'PI4', 'PI4'], + 'Project - Allocation': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE', 'ProjectF'], + 'Cost': [10, 100, 10000, 5000, 800, 1000] + } + self.dataframe = pandas.DataFrame(data) + old_pi = ['PI2,2023-09', 'PI3,2024-02', 'PI4,2024-03'] # Case with old and new pi in pi file + old_pi_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.csv') + for pi in old_pi: + old_pi_file.write(pi + "\n") + self.old_pi_file = old_pi_file.name + + def tearDown(self): + os.remove(self.old_pi_file) + + def test_apply_credit_0002(self): + dataframe = process_report.apply_credits_new_pi(self.dataframe, self.old_pi_file) + + self.assertTrue('Credit' in dataframe) + self.assertTrue('Credit Code' in dataframe) + 
self.assertTrue('Balance' in dataframe) + + non_credited_project = dataframe[pandas.isna(dataframe['Credit Code'])] + credited_projects = dataframe[dataframe['Credit Code'] == '0002'] + + self.assertEqual(2, len(non_credited_project)) + self.assertEqual(non_credited_project.loc[2, 'Cost'], non_credited_project.loc[2, 'Balance']) + self.assertEqual(non_credited_project.loc[3, 'Cost'], non_credited_project.loc[3, 'Balance']) + + + self.assertEqual(4, len(credited_projects.index)) + self.assertTrue('PI2' not in credited_projects['Manager (PI)'].unique()) + self.assertTrue('PI3' not in credited_projects['Manager (PI)'].unique()) + + self.assertEqual(10, credited_projects.loc[0, 'Credit']) + self.assertEqual(100, credited_projects.loc[1, 'Credit']) + self.assertEqual(800, credited_projects.loc[4, 'Credit']) + self.assertEqual(200, credited_projects.loc[5, 'Credit']) + + self.assertEqual(0, credited_projects.loc[0, 'Balance']) + self.assertEqual(0, credited_projects.loc[1, 'Balance']) + self.assertEqual(0, credited_projects.loc[4, 'Balance']) + self.assertEqual(800, credited_projects.loc[5, 'Balance']) + + +class TestValidateBillables(TestCase): + + def setUp(self): + + data = { + 'Manager (PI)': ['PI1', math.nan, 'PI1', 'PI2', 'PI2'], + 'Project - Allocation': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'], + } + self.dataframe = pandas.DataFrame(data) + + def test_validate_billables(self): + self.assertEqual(1, len(self.dataframe[pandas.isna(self.dataframe['Manager (PI)'])])) + validated_df = process_report.validate_pi_names(self.dataframe) + self.assertEqual(0, len(validated_df[pandas.isna(validated_df['Manager (PI)'])]))