-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelperfunctions.py
100 lines (83 loc) · 3.1 KB
/
helperfunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from pandas import *
import scipy.signal as signal
# Helper methods for the data collection
def fileReader(pathtofile, dateheading, dtformat='%m/%d/%Y %H:%M', offset=0):
"""
reads files in Bdx format and returns a list of data frames with parsed time
:param pathtofile: type str; the folder or path from which we read individual .csv or .excel files
:param dateheading: type str; the column name for date varies, so it is required
:param dtformat: format string for datetime parsing
:param offset: {str} -- hours to offset incase of zone adjustment
:return: list of dataframes
"""
# Read the files
dlist = []
if pathtofile.endswith('.csv'):
dfr = read_csv(pathtofile)
if pathtofile.endswith('.xlsx'):
dfr = read_excel(pathtofile)
else:
dfr = read_pickle(pathtofile)
# Parsing the Date column
dfr.insert(loc=0, column='Dates',
value=to_datetime(dfr[dateheading],
format=dtformat) + DateOffset(hours=offset))
dfr.drop(dateheading, axis=1, inplace=True) # Drop original Time column
# Add df to the dlist
dlist.append(dfr)
return dlist
def merge_df_rows(dlist):
"""
Merge rows of dataframes sharing same columns but different time points
Always Call merge_df_rows before calling merge_df_columns as time has not been set as
index yet
:param dlist: list of dataframes to be added along rows
:return: dataframe
"""
# Create Dataframe from the dlist files
dframe = concat(dlist, axis=0, join='outer', sort=False)
# Sort the df based on the datetime index
dframe.sort_values(by='Dates', inplace=True)
# Setting Dates as the dataframe index
dframe.set_index(['Dates'], drop=True, inplace=True)
# Dropiing duplicated time points that may exist in the data
dframe = dframe[~dframe.index.duplicated()]
return dframe
def merge_df_columns(dlist):
    """
    Join dataframes that share rows but contribute different columns.

    :param dlist: list of dataframes to be placed side by side (column axis)
    :return: the outer-joined dataframe after passing it through droprows,
        i.e. without the rows that contain a missing value
    """
    side_by_side = concat(dlist, axis=1, join='outer', sort=False)
    return droprows(side_by_side)
def droprows(df):
return df.dropna(axis=0, how='any')
def df_sample(df, period=12):
"""
resamples dataframe at "period" 5 minute time points
:param df:
:param period: number of 5 min time points
:return: sampled dataframe
"""
timegap = period * 5
return df[df.index.minute % timegap == 0]
def butterworthsmoothing(df, column_names: list = None, Wn = 0.015):
"""
Smoothes the dataframe columns
:param df: the input datafrme
:param column_names: list of column names to be smoothed
:return: smoothed data frame
"""
if column_names is None:
return df
else:
# First, design the Buterworth filter
N = 2 # Filter order
Wn = Wn # Cutoff frequency
B, A = signal.butter(N, Wn, output='ba')
for i in column_names:
df[i] = signal.filtfilt(B, A, df[i])
df.dropna(axis=0, how='any', inplace=True)
return df