Skip to content

Commit

Permalink
Implemented Processor class and refactored some preliminary processing
Browse files Browse the repository at this point in the history
A `Processor` class has been added, subclassing from the `Invoice` class.
This is the first step to refactor invoicing in order to seperate the
processing and filtering/exporting functionalities of our current
Invoice subclasses. Subclasses of `Processor` should only process invoices
and manipulate its internal data, while subclasses of `Invoice` should
only perform filtering and exporting, never changing any data itself.

In addition to implementing `Processor`, two of its subclasses,
`ValidatePIAliasProcessor` and `AddInstitutionProcessor` has been
added to perform some preliminary processing steps.
  • Loading branch information
QuanMPhm committed Sep 25, 2024
1 parent b5ca684 commit d9b838c
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 90 deletions.
81 changes: 21 additions & 60 deletions process_report/process_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import sys
import datetime

import json
import pandas
import pyarrow

Expand All @@ -15,7 +14,10 @@
bu_internal_invoice,
pi_specific_invoice,
)

from process_report.processors import (
validate_pi_alias_processor,
add_institution_processor,
)

### PI file field names
PI_PI_FIELD = "PI"
Expand Down Expand Up @@ -51,26 +53,6 @@
ALIAS_S3_FILEPATH = "PIs/alias.csv"


def get_institution_from_pi(institute_map, pi_uname):
institution_domain = pi_uname.split("@")[-1]
for i in range(institution_domain.count(".") + 1):
if institution_name := institute_map.get(institution_domain, ""):
break
institution_domain = institution_domain[institution_domain.find(".") + 1 :]

if institution_name == "":
print(f"Warning: PI name {pi_uname} does not match any institution!")

return institution_name


def load_institute_map() -> dict:
with open("process_report/institute_map.json", "r") as f:
institute_map = json.load(f)

return institute_map


def load_alias(alias_file):
alias_dict = dict()

Expand Down Expand Up @@ -220,15 +202,27 @@ def main():

projects = list(set(projects + timed_projects_list))

merged_dataframe = validate_pi_aliases(merged_dataframe, alias_dict)
merged_dataframe = add_institution(merged_dataframe)
### Preliminary processing

validate_pi_alias_proc = validate_pi_alias_processor.ValidatePIAliasProcessor(
"", invoice_month, merged_dataframe, alias_dict
)
validate_pi_alias_proc.process()

add_institute_proc = add_institution_processor.AddInstitutionProcessor(
"", invoice_month, validate_pi_alias_proc.data
)
add_institute_proc.process()

### Finish preliminary processing

lenovo_inv = lenovo_invoice.LenovoInvoice(
name=args.Lenovo_file, invoice_month=invoice_month, data=merged_dataframe.copy()
name=args.Lenovo_file, invoice_month=invoice_month, data=add_institute_proc.data
)
nonbillable_inv = nonbillable_invoice.NonbillableInvoice(
name=args.nonbillable_file,
invoice_month=invoice_month,
data=merged_dataframe.copy(),
data=add_institute_proc.data,
nonbillable_pis=pi,
nonbillable_projects=projects,
)
Expand All @@ -239,7 +233,7 @@ def main():
billable_inv = billable_invoice.BillableInvoice(
name=args.output_file,
invoice_month=invoice_month,
data=merged_dataframe.copy(),
data=add_institute_proc.data.copy(),
nonbillable_pis=pi,
nonbillable_projects=projects,
old_pi_filepath=old_pi_file,
Expand Down Expand Up @@ -330,13 +324,6 @@ def timed_projects(timed_projects_file, invoice_date):
return dataframe[mask]["Project"].to_list()


def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict):
for pi, pi_aliases in alias_dict.items():
dataframe.loc[dataframe[PI_FIELD].isin(pi_aliases), PI_FIELD] = pi

return dataframe


def fetch_s3_alias_file():
local_name = "alias.csv"
invoice_bucket = get_invoice_bucket()
Expand All @@ -356,32 +343,6 @@ def backup_to_s3_old_pi_file(old_pi_file):
invoice_bucket.upload_file(old_pi_file, f"PIs/Archive/PI {get_iso8601_time()}.csv")


def add_institution(dataframe: pandas.DataFrame):
"""Determine every PI's institution name, logging any PI whose institution cannot be determined
This is performed by `get_institution_from_pi()`, which tries to match the PI's username to
a list of known institution email domains (i.e bu.edu), or to several edge cases (i.e rudolph) if
the username is not an email address.
Exact matches are then mapped to the corresponding institution name.
I.e "foo@bu.edu" would match with "bu.edu", which maps to the instition name "Boston University"
The list of mappings are defined in `institute_map.json`.
"""
institute_map = load_institute_map()
dataframe = dataframe.astype({INSTITUTION_FIELD: "str"})
for i, row in dataframe.iterrows():
pi_name = row[PI_FIELD]
if pandas.isna(pi_name):
print(f"Project {row[PROJECT_FIELD]} has no PI")
else:
dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi(
institute_map, pi_name
)

return dataframe


def export_billables(dataframe, output_file):
dataframe.to_csv(output_file, index=False)

Expand Down
58 changes: 58 additions & 0 deletions process_report/processors/add_institution_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from dataclasses import dataclass
import json

import pandas

from process_report.invoices import invoice
from process_report.processors import processor


@dataclass
class AddInstitutionProcessor(processor.Processor):
@staticmethod
def _load_institute_map() -> dict:
with open("process_report/institute_map.json", "r") as f:
institute_map = json.load(f)

return institute_map

@staticmethod
def _get_institution_from_pi(institute_map, pi_uname):
institution_domain = pi_uname.split("@")[-1]
for i in range(institution_domain.count(".") + 1):
if institution_name := institute_map.get(institution_domain, ""):
break
institution_domain = institution_domain[institution_domain.find(".") + 1 :]

if institution_name == "":
print(f"Warning: PI name {pi_uname} does not match any institution!")

return institution_name

def _add_institution(self, dataframe: pandas.DataFrame):
"""Determine every PI's institution name, logging any PI whose institution cannot be determined
This is performed by `get_institution_from_pi()`, which tries to match the PI's username to
a list of known institution email domains (i.e bu.edu), or to several edge cases (i.e rudolph) if
the username is not an email address.
Exact matches are then mapped to the corresponding institution name.
I.e "foo@bu.edu" would match with "bu.edu", which maps to the instition name "Boston University"
The list of mappings are defined in `institute_map.json`.
"""
institute_map = self._load_institute_map()
dataframe = dataframe.astype({invoice.INSTITUTION_FIELD: "str"})
for i, row in dataframe.iterrows():
pi_name = row[invoice.PI_FIELD]
if pandas.isna(pi_name):
print(f"Project {row[invoice.PROJECT_FIELD]} has no PI")
else:
dataframe.at[
i, invoice.INSTITUTION_FIELD
] = self._get_institution_from_pi(institute_map, pi_name)

return dataframe

def _process(self):
self.data = self._add_institution(self.data)
8 changes: 8 additions & 0 deletions process_report/processors/processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from dataclasses import dataclass

from process_report.invoices import invoice


@dataclass
class Processor(invoice.Invoice):
pass
23 changes: 23 additions & 0 deletions process_report/processors/validate_pi_alias_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from dataclasses import dataclass

import pandas

from process_report.invoices import invoice
from process_report.processors import processor


@dataclass
class ValidatePIAliasProcessor(processor.Processor):
alias_map: dict

@staticmethod
def _validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict):
for pi, pi_aliases in alias_dict.items():
dataframe.loc[
dataframe[invoice.PI_FIELD].isin(pi_aliases), invoice.PI_FIELD
] = pi

return dataframe

def _process(self):
self.data = self._validate_pi_aliases(self.data, self.alias_map)
27 changes: 15 additions & 12 deletions process_report/tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def test_export_pi(self):
self.assertNotIn("ProjectC", pi_df["Project - Allocation"].tolist())


class TestGetInstitute(TestCase):
class TestAddInstituteProcessor(TestCase):
def test_get_pi_institution(self):
institute_map = {
"harvard.edu": "Harvard University",
Expand Down Expand Up @@ -248,31 +248,34 @@ def test_get_pi_institution(self):
"g@bidmc.harvard.edu": "Beth Israel Deaconess Medical Center",
}

add_institute_proc = test_utils.new_add_institution_processor()

for pi_email, answer in answers.items():
self.assertEqual(
process_report.get_institution_from_pi(institute_map, pi_email), answer
add_institute_proc._get_institution_from_pi(institute_map, pi_email),
answer,
)


class TestAlias(TestCase):
def setUp(self):
self.alias_dict = {"PI1": ["PI1_1", "PI1_2"], "PI2": ["PI2_1"]}

self.data = pandas.DataFrame(
class TestValidateAliasProcessor(TestCase):
def test_validate_alias(self):
alias_map = {"PI1": ["PI1_1", "PI1_2"], "PI2": ["PI2_1"]}
test_data = pandas.DataFrame(
{
"Manager (PI)": ["PI1", "PI1_1", "PI1_2", "PI2_1", "PI2_1"],
}
)

self.answer = pandas.DataFrame(
answer_data = pandas.DataFrame(
{
"Manager (PI)": ["PI1", "PI1", "PI1", "PI2", "PI2"],
}
)

def test_validate_alias(self):
output = process_report.validate_pi_aliases(self.data, self.alias_dict)
self.assertTrue(self.answer.equals(output))
validate_pi_alias_proc = test_utils.new_validate_pi_alias_processor(
data=test_data, alias_map=alias_map
)
validate_pi_alias_proc.process()
self.assertTrue(answer_data.equals(validate_pi_alias_proc.data))


class TestMonthUtils(TestCase):
Expand Down
21 changes: 21 additions & 0 deletions process_report/tests/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
pi_specific_invoice,
)

from process_report.processors import (
add_institution_processor,
validate_pi_alias_processor,
)


def new_base_invoice(
name="",
Expand Down Expand Up @@ -52,3 +57,19 @@ def new_pi_specific_invoice(
invoice_month,
data,
)


def new_add_institution_processor(
name="",
invoice_month="0000-00",
data=pandas.DataFrame(),
):
return add_institution_processor.AddInstitutionProcessor(name, invoice_month, data)


def new_validate_pi_alias_processor(
name="", invoice_month="0000-00", data=pandas.DataFrame(), alias_map={}
):
return validate_pi_alias_processor.ValidatePIAliasProcessor(
name, invoice_month, data, alias_map
)
18 changes: 0 additions & 18 deletions process_report/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os
import datetime
import json
import logging
import functools

Expand All @@ -27,23 +26,6 @@ def get_invoice_bucket():
return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing"))


def get_institution_from_pi(institute_map, pi_uname):
institution_key = pi_uname.split("@")[-1]
institution_name = institute_map.get(institution_key, "")

if institution_name == "":
logger.warn(f"PI name {pi_uname} does not match any institution!")

return institution_name


def load_institute_map() -> dict:
with open("process_report/institute_map.json", "r") as f:
institute_map = json.load(f)

return institute_map


def get_iso8601_time():
return datetime.datetime.now().strftime("%Y%m%dT%H%M%SZ")

Expand Down

0 comments on commit d9b838c

Please sign in to comment.