refine_tags.py
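"""Refine model-assigned accident tags: explode and clean the tagged accidents,
merge in tag IDs and accident details, consolidate tags via a custom tag map,
plot tag frequency distributions, and save the final dataset restricted to the
ten most common tags."""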
import ast
import json
import os

import matplotlib.pyplot as plt
import pandas as pd
# ----------------------------------------------------------------------------------- #
# Define constants #
# ----------------------------------------------------------------------------------- #
TAGGED_ACC_FILEPATH = os.path.join('data', 'output', 'tagged_accidents.csv')
TAGGED_ACC_FIN_FILEPATH = os.path.join('data', 'output', 'tagged_accidents_final.csv')
ACCIDENT_FILEPATH = os.path.join('data', 'output', 'accident_reports.csv')
TAGS_FILEPATH = os.path.join('data', 'input', 'tags.csv')
TAG_MAP_FILEPATH = os.path.join('data', 'input', 'tag_map.json')
TAG_COUNT_FILEPATH = os.path.join('data', 'output', 'tag_count.csv')
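# NOTE: the relative paths above assume the script is run from the project root.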
# ----------------------------------------------------------------------------------- #
# Configurations #
# ----------------------------------------------------------------------------------- #
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# ----------------------------------------------------------------------------------- #
# Prepare Tagged Accident Information #
# ----------------------------------------------------------------------------------- #
# Load accident tags
tagged_df = pd.read_csv(TAGGED_ACC_FILEPATH)
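# Each 'tags' entry is expected to be a string-serialised list of (tag, probability)
# pairs, e.g. "[('rockfall', 0.92), ...]" (illustrative values only).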
# Explode tags
tagged_df['tags'] = tagged_df['tags'].apply(ast.literal_eval)
tagged_df = tagged_df.explode('tags', ignore_index=True)
# Split each (tag, prob) pair into separate 'tag' and 'prob' columns
tagged_df[['tag', 'prob']] = tagged_df['tags'].apply(pd.Series)
tagged_df.drop(columns=['tags'], inplace=True)
# Drop NAs
tagged_df.dropna(subset='tag', inplace=True, ignore_index=True)
# Add tag_id column
tag_df = pd.read_csv(TAGS_FILEPATH)
tagged_df = tagged_df.merge(tag_df, how='left', on='tag')
# Add accident information by acc_id
acc_df = pd.read_csv(ACCIDENT_FILEPATH)
tagged_df = tagged_df.merge(acc_df, how='left', on='acc_id')
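# The left merge leaves NaN tag_id for any tag missing from tags.csv;
# the cast below assumes every tag has a match.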
tagged_df['tag_id'] = tagged_df['tag_id'].astype(int)
# ----------------------------------------------------------------------------------- #
# Custom Map & Remove Tags #
# ----------------------------------------------------------------------------------- #
# Remove "extreme cold" tag, and cases where accident description was "Nothing"
tagged_df = tagged_df.query('tag != "extreme cold"')
tagged_df = tagged_df.query('accidents != "Nothing"')
tagged_df.reset_index(inplace=True, drop=True)
# Look at value count distributions for high-probability tags
plt.figure(figsize=(10, 6))
tagged_df['tag'].value_counts().plot(kind='bar', color='#ca0203')
plt.tight_layout()
plt.savefig(os.path.join('data', 'output', 'tag_value_counts.svg'), format='svg')
plt.show()
# Apply custom maps
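# tag_map.json is expected to map raw tag strings to consolidated tag names;
# tags not present in the map are kept unchanged.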
with open(TAG_MAP_FILEPATH, 'r') as json_file:
    tag_map = json.load(json_file)
tagged_df['new_tag'] = tagged_df['tag'].apply(lambda x: tag_map.get(x, x))
filtered_df = tagged_df[['new_tag', 'acc_id']].drop_duplicates(ignore_index=True)
# Look at value count distributions
plt.figure(figsize=(10, 6))
filtered_df['new_tag'].value_counts().plot(kind='bar', color='#f2b202')
plt.tight_layout()
plt.savefig(os.path.join('data', 'output', 'updated_tag_value_counts.svg'), format='svg')
plt.show()
# Drop tags that are excluded from the final tag set
filtered_df = filtered_df.query('new_tag not in ["inadequate preparation", "steep rock"]')
tagged_df = tagged_df.query('new_tag not in ["inadequate preparation", "steep rock"]')
# Look at value count distributions
plt.figure(figsize=(10, 6))
filtered_df['new_tag'].value_counts().plot(kind='bar', color='#01a0e6')
plt.tight_layout()
plt.savefig(os.path.join('data', 'output', 'final_tag_value_counts.svg'), format='svg')
plt.show()
# Save exploded dataframe for top 10 tags
top10_tags = filtered_df['new_tag'].value_counts().head(10).index
top10_df = pd.DataFrame({'new_tag': top10_tags})
save_df = top10_df.merge(tagged_df, how='left', on='new_tag')
save_df.drop(columns=['tag', 'prob', 'tag_id'], inplace=True)
save_df.drop_duplicates(inplace=True)
save_df.to_csv(TAGGED_ACC_FIN_FILEPATH, index=False)