Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Normalize column names in all dataframes #93

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 39 additions & 4 deletions times_reader/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,41 @@
# ============================================================================


class Col(str):
    """A subclass of string to be used as column names for our DataFrames.

    This class normalizes case and whitespace of the supplied column names:
    a plain string is stripped of surrounding whitespace and upper-cased when
    the ``Col`` is constructed, so ``Col(" abc ") == Col("ABC")``.

    Comparing a ``Col`` with a plain ``str`` normalizes the plain string
    first, so ``Col("abc") == "abc"`` is true.  Note that the hash is the
    hash of the normalized text, so a ``Col`` and an un-normalized ``str``
    that compare equal may still hash differently; use ``Col`` consistently
    for dict keys and set members.
    """

    def __new__(cls, content):
        """Build a normalized column name from a ``Col`` or a ``str``.

        Raises:
            TypeError: if ``content`` is neither a ``Col`` nor a ``str``.
                (Falling through here would otherwise return ``None``
                silently instead of a ``Col`` instance.)
        """
        if isinstance(content, cls):
            # Already normalized: copy the text unchanged.
            return super().__new__(cls, str(content))
        if isinstance(content, str):
            return super().__new__(cls, str(content).strip().upper())
        raise TypeError(
            f"Col expects a str or Col, got {type(content).__name__}: {content!r}"
        )

    def __eq__(self, other):
        """Compare with another column name, normalizing plain strings first."""
        if other is None:
            return False
        if isinstance(other, Col):
            return super().__eq__(other)
        if isinstance(other, str):
            # Plain strings are normalized before the comparison.
            return super().__eq__(Col(other))
        # Not a string at all: defer to str, which returns NotImplemented.
        return super().__eq__(other)

    def __ne__(self, other):
        """Return the negation of ``__eq__``.

        ``str`` defines its own ``__ne__``, which would otherwise be inherited
        and compare the raw text, making ``Col("abc") != "abc"`` true even
        though ``Col("abc") == "abc"`` is also true.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Overriding __eq__ would otherwise disable the inherited hash.
        return super().__hash__()

    def __repr__(self) -> str:
        return f"Col({super().__str__()})"

    def __str__(self) -> str:
        return super().__str__()


def Cols(col_names: List[str]) -> List[Col]:
    """Convert every name in ``col_names`` to a ``Col``, preserving order."""
    return list(map(Col, col_names))


@dataclass
class EmbeddedXlTable:
"""This class defines a table object as a pandas dataframe wrapped with some metadata.
Expand Down Expand Up @@ -67,11 +102,11 @@ class TimesXlMap:
"""

times_name: str
times_cols: List[str]
times_cols: List[Col]
xl_name: str
xl_cols: List[str]
col_map: Dict[str, str]
filter_rows: Dict[str, str]
xl_cols: List[Col]
col_map: Dict[Col, Col]
filter_rows: Dict[Col, str]


class Tag(str, Enum):
Expand Down
7 changes: 5 additions & 2 deletions times_reader/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import re
from . import datatypes
from . import utils
from .datatypes import Col


def extract_tables(filename: str) -> List[datatypes.EmbeddedXlTable]:
Expand Down Expand Up @@ -96,7 +97,7 @@ def extract_table(
max_row=tag_row + 1,
)
)
table_df = DataFrame(columns=["VALUE"])
table_df = DataFrame(columns=[Col("VALUE")])
table_df.loc[0] = [df.iloc[tag_row, tag_col + 1]]
uc_sets = {}
else:
Expand Down Expand Up @@ -129,12 +130,14 @@ def extract_table(
if end_row - header_row == 1 and end_col - start_col == 1:
# Interpret single cell tables as a single data item with a column name VALUE
table_df = DataFrame(df.iloc[header_row, start_col:end_col])
table_df.columns = ["VALUE"]
table_df.columns = [Col("VALUE")]
else:
table_df = df.iloc[header_row + 1 : end_row, start_col:end_col]
# Make all column names strings, as some are integers (e.g. years)
table_df.columns = [str(x) for x in df.iloc[header_row, start_col:end_col]]

# Normalize column names
table_df.rename(lambda s: datatypes.Col(s), axis="columns", inplace=True)
table_df.reset_index(drop=True, inplace=True)

# Don't use applymap because it can convert ints to floats
Expand Down
37 changes: 25 additions & 12 deletions times_reader/main.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
from pandas.core.frame import DataFrame
import pandas as pd
from dataclasses import replace
from typing import Dict, List
from itertools import groupby
import re
import os
from concurrent.futures import ProcessPoolExecutor
import time
from functools import reduce
import pickle
from . import datatypes
from . import excel
from . import transforms
from .datatypes import Col


def read_mappings(filename: str) -> List[datatypes.TimesXlMap]:
Expand Down Expand Up @@ -44,14 +42,14 @@ def read_mappings(filename: str) -> List[datatypes.TimesXlMap]:
(times, xl) = line.split(" = ")
(times_name, times_cols_str) = list(filter(None, re.split("\[|\]", times)))
(xl_name, xl_cols_str) = list(filter(None, re.split("\(|\)", xl)))
times_cols = times_cols_str.split(",")
times_cols = list(map(Col, times_cols_str.split(",")))
xl_cols = xl_cols_str.split(",")
filter_rows = {}
for i, s in enumerate(xl_cols):
for s in xl_cols:
if ":" in s:
[col_name, col_val] = s.split(":")
filter_rows[col_name.strip()] = col_val.strip()
xl_cols = [s for s in xl_cols if ":" not in s]
filter_rows[Col(col_name)] = col_val.strip()
xl_cols = [Col(s) for s in xl_cols if ":" not in s]

# TODO remove: Filter out mappings that are not yet finished
if xl_name != "~TODO" and not any(c.startswith("TODO") for c in xl_cols):
Expand Down Expand Up @@ -92,7 +90,7 @@ def convert_xl_to_times(
else:
raw_tables = []

use_pool = True
use_pool = False
if use_pool:
with ProcessPoolExecutor() as executor:
for result in executor.map(excel.extract_tables, input_files):
Expand Down Expand Up @@ -160,6 +158,20 @@ def convert_xl_to_times(
f"transform {transform.__code__.co_name} took {end_time-start_time:.2f} seconds"
)
input = output
# TODO flag
# Assert that all dataframes use only the Col type for column names:
if isinstance(output, list):
for e in output:
for c in e.dataframe.columns:
if not isinstance(c, datatypes.Col):
raise ValueError(f"Error: {e} uses non-Col column name {c}")
elif isinstance(output, dict):
for tag, df in output.items():
for c in df.columns:
if not isinstance(c, datatypes.Col):
raise ValueError(f"Error: {tag} uses non-Col column name {c}")
else:
raise ValueError("Unexpected output type after transform")

print(
f"Conversion complete, {len(output)} tables produced,"
Expand Down Expand Up @@ -287,12 +299,13 @@ def produce_times_tables(
f" table {mapping.xl_name} does not contain column {filter_col}"
)
# TODO break this loop and continue outer loop?
filter = set(x.lower() for x in {filter_val})
i = df[filter_col].str.lower().isin(filter)
# filter = set(x.upper() for x in {filter_val})
# i = df[filter_col].str.upper().isin(filter)
i = df[filter_col].str.upper() == filter_val.upper()
df = df.loc[i, :]
# TODO find the correct tech group
if "TechGroup" in mapping.xl_cols:
df["TechGroup"] = df["TechName"]
if Col("TechGroup") in mapping.xl_cols:
df[Col("TechGroup")] = df[Col("TechName")]
if not all(c in df.columns for c in mapping.xl_cols):
missing = set(mapping.xl_cols) - set(df.columns)
print(
Expand Down
Loading