-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
271 lines (208 loc) · 9.2 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#Audrey Houghton
#Created on: 1/16/2025
#Last Updated on: 1/17/2025
# Import the Python packages used by this script
import argparse
import json
import os
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
from lxml import etree
"""
This script processes and visualizes people data from an XML file.
It reads the file, formats data, generates a report, and optionally creates a bar graph of average age by city.
"""
def read_xml(filename):
    '''
    Read an XML file of people records and return a cleaned DataFrame.

    The steps are: parse the XML into a DataFrame, convert the 'dob' column to
    datetime, fill in a missing country from the ZIP code, and drop rows that
    still contain missing values.

    Args:
        filename (str): Path to the input XML file.

    Returns:
        pandas.DataFrame: The cleaned and processed data from the XML file.

    Raises:
        ValueError: If filename does not end with '.xml' (case-insensitive).
    '''
    # Fail here with a clear message: without this guard a non-XML path gave
    # the caller nothing usable and the error only appeared further downstream.
    if not is_xml_file(filename):
        raise ValueError(f"Expected a file with an .xml extension, got: {filename!r}")

    parsed_df = generate_df(filename)
    dated_df = dt_format(parsed_df, 'dob')
    with_country_df = infer_country(dated_df)
    return drop_invalid_data(with_country_df)
def is_xml_file(filename):
    '''
    Report whether a path ends with the '.xml' extension, ignoring case.

    Args:
        filename (str): The file path to inspect.

    Returns:
        bool: True when the lowercased path ends with '.xml', otherwise False.
    '''
    lowered = filename.lower()
    has_xml_suffix = lowered.endswith('.xml')
    return has_xml_suffix
def generate_df(filename):
    '''
    Parse the provided XML file and generate a DataFrame containing people data.

    Every /people/person element becomes one row. A field whose element is
    missing is stored as an empty string.

    Args:
        filename (str): The path to the XML file to be parsed.

    Returns:
        pandas.DataFrame: One row per person with the columns name, id, dob,
        street, city, state, zipcode and country.
    '''
    # Output column name -> path (relative to <person>) of the element holding it.
    field_paths = {
        "name": "name",
        "id": "id",
        "dob": "dob",
        "street": "address/street",
        "city": "address/city",
        "state": "address/state",
        "zipcode": "address/zipcode",
        "country": "address/country",
    }

    tree = etree.parse(filename)
    records = [
        {column: person.findtext(path, default="") for column, path in field_paths.items()}
        for person in tree.xpath("/people/person")
    ]

    # Passing the columns explicitly keeps the schema stable even when the file
    # has no <person> elements; an empty frame built without them has no
    # columns, so later lookups such as df['dob'] would raise KeyError.
    return pd.DataFrame(records, columns=list(field_paths))
def dt_format(df, key):
    '''
    Convert one column of the DataFrame to datetime, in place.

    Values that cannot be parsed become NaT because parsing errors are coerced.

    Args:
        df (pandas.DataFrame): The DataFrame holding the column.
        key (str): Name of the column to convert.

    Returns:
        pandas.DataFrame: The same DataFrame object, with the column converted.
    '''
    parsed_column = pd.to_datetime(df[key], errors='coerce')
    df[key] = parsed_column
    return df
def infer_country(df):
    '''
    Fill in a missing country from the zipcode.

    A row whose country is the empty string and whose zipcode consists of
    exactly five digits gets the country "USA". Every other row keeps its
    country unchanged.

    Args:
        df (pandas.DataFrame): Data with 'country' and 'zipcode' columns.

    Returns:
        pandas.DataFrame: The same DataFrame with the 'country' column updated.
    '''
    def _country_for(row):
        # Rows that already have a country are left alone.
        current = row['country']
        if current != "":
            return current
        zipcode = row['zipcode']
        if zipcode.isdigit() and len(zipcode) == 5:
            return "USA"
        return current

    df['country'] = df.apply(_country_for, axis=1)
    return df
def drop_invalid_data(df):
    '''
    Return a copy of the DataFrame without rows that have missing values.

    Empty strings are treated as missing: they are replaced by pd.NA first, and
    then every row containing a missing value is dropped. The input DataFrame
    is not modified.

    Args:
        df (pandas.DataFrame): The DataFrame to clean.

    Returns:
        pandas.DataFrame: The rows of df that have no missing or empty values.
    '''
    return df.replace(to_replace="", value=pd.NA).dropna(axis=0, how='any')
def get_todays_date():
    '''
    Return the current local date as a compact string.

    Returns:
        str: Today's date formatted as YYYYMMDD.
    '''
    now = datetime.today()
    return f"{now:%Y%m%d}"
def child_or_adult(df, today=None):
    '''
    Categorize individuals as "Adult" or "Child" (18 years or older is an adult).

    Age is the number of completed calendar years between the date of birth and
    the reference date, so a person turns 18 exactly on their 18th birthday.
    Rows whose 'dob' is missing (NaT) get a NaN age and the category "Child".
    The columns are added to the DataFrame that is passed in.

    Args:
        df (pandas.DataFrame): Data with a datetime 'dob' (date of birth) column.
        today (datetime, optional): Reference date for the age calculation.
            Defaults to the current date.

    Returns:
        pandas.DataFrame: The same DataFrame with two new columns:
            - 'age': The age of each individual in whole years (as floats).
            - 'category': 'Adult' or 'Child', based on the age.
    '''
    if today is None:
        today = datetime.today()

    dob = df['dob']
    # Dividing elapsed days by 365 ignores leap days and ages people a few days
    # before their birthday, so compare calendar month/day instead.
    birthday_passed = (dob.dt.month < today.month) | (
        (dob.dt.month == today.month) & (dob.dt.day <= today.day)
    )
    years_elapsed = today.year - dob.dt.year
    # Ages stay floats, matching the dtype the column has always had.
    df['age'] = (years_elapsed - (~birthday_passed).astype(int)).astype(float)

    # A NaN age compares False against 18 and therefore maps to "Child".
    df['category'] = (df['age'] >= 18).map({True: "Adult", False: "Child"})
    return df
def generate_report(df, path):
    '''
    Summarize the number of adults and children by city and save it as JSON.

    Individuals are categorized as "Adult" or "Child", grouped by city and
    counted. The summary is printed to the console and written to
    'age_categorized_by_city_<YYYYMMDD>.json' inside the given directory.

    Args:
        df (pandas.DataFrame): Individual data with 'dob' (date of birth) and
            'city' columns.
        path (str): Directory in which the JSON file will be saved.

    Returns:
        pandas.DataFrame: The data with the added 'age' and 'category' columns.
    '''
    categorized_df = child_or_adult(df)
    counts_by_city = categorized_df.groupby('city')['category'].value_counts().unstack(fill_value=0)
    counts_summary = counts_by_city.to_dict(orient='index')

    for city, counts in counts_summary.items():
        print(f"City: {city}, Adults: {counts.get('Adult', 0)}, Children: {counts.get('Child', 0)}")

    # Join instead of concatenating so the file lands inside `path` whether or
    # not the caller included a trailing separator.
    report_name = 'age_categorized_by_city_' + get_todays_date() + '.json'
    report_file = os.path.join(path, report_name)
    with open(report_file, 'w', encoding='utf-8') as json_file:
        json.dump(counts_summary, json_file, indent=4)

    return categorized_df
def average_age_by_city(df):
    '''
    Compute the mean age per city, rounded to zero decimal places.

    Args:
        df (pandas.DataFrame): Individual data with 'age' and 'city' columns.

    Returns:
        pandas.Series: The rounded average age, indexed by city name.
    '''
    ages_grouped_by_city = df.groupby('city')['age']
    mean_ages = ages_grouped_by_city.mean()
    return mean_ages.round()
def create_bar_graph(series, path):
    '''
    Create and save a bar graph showing the average age by city.

    Cities are on the x-axis and the average age is on the y-axis. The bars are
    green with black edges. The graph is saved as
    'average_age_by_city_<YYYYMMDD>.png' inside the given directory.

    Args:
        series (pandas.Series): The average age by city.
        path (str): Directory in which the graph will be saved.

    Returns:
        None
    '''
    # Draw on a dedicated figure and close it afterwards, so repeated calls do
    # not reuse an earlier chart or leave figures open.
    fig, ax = plt.subplots()
    try:
        series.plot(kind='bar', color='green', edgecolor='black', ax=ax)
        ax.set_title('Average Age by City')
        ax.set_xlabel('City')
        ax.set_ylabel('Average Age (Years)')
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
        fig.tight_layout()
        # Join instead of concatenating so the file lands inside `path` whether
        # or not the caller included a trailing separator.
        graph_name = "average_age_by_city_" + get_todays_date() + ".png"
        fig.savefig(os.path.join(path, graph_name))
    finally:
        plt.close(fig)
def main():
    '''
    Parse arguments, read the input data, generate the report, and optionally
    create a bar graph.

    The input XML file is read and cleaned, a report with counts of adults and
    children by city is generated, and, when --output_graph is given, a bar
    graph of the average age by city is saved as well.

    Args:
        None

    Returns:
        None
    '''
    parser = argparse.ArgumentParser(description='Process and visualize people data from an XML file.')
    parser.add_argument('input_file', type=str, help='Input XML file')
    parser.add_argument('output_path', type=str, help='Output directory for the JSON report and the optional graph')
    parser.add_argument('--output_graph', action='store_true', help='Flag to output an average age by city graph image file')
    args = parser.parse_args()

    valid_df = read_xml(args.input_file)
    categorized_df = generate_report(valid_df, args.output_path)

    if args.output_graph:
        # Use the frame returned by generate_report: it is the one known to
        # carry the 'age' column that the average is computed from.
        avg_age_by_city = average_age_by_city(categorized_df)
        create_bar_graph(avg_age_by_city, args.output_path)

    print("reports generated")
# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()