-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
107 lines (84 loc) · 3.26 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Importing the Libraries
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
import plotly.express as px
# Load the raw dataset from disk.
df = pd.read_csv("SFO/train.csv")

# Discard the columns that play no part in the clustering further down.
unused_columns = [
    'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address',
]
df = df.drop(columns=unused_columns)

# Preview what is left after the drop.
print("First five rows after dropping irrelevant columns: \n")
print(df.head())
print()

# Report how many missing values each remaining column contains.
null_counts = df.isnull().sum()
print("Checking for null data in the dataset: \n")
print(null_counts)
print()
# Year filtering operation
# Reduce "Dates" to the year: keep the text before the first whitespace
# (dropping the time part), then the text before the first '-'.
# Vectorised string methods replace the two row-wise `df.apply` passes, which
# invoked a Python lambda once per row.
df["Dates"] = df["Dates"].str.split().str[0].str.split("-").str[0]

# NOTE: This dataset contains data from 2003 to 2014.
# Input from user asking for the year; surrounding whitespace is ignored so
# that an answer such as "2014 " still matches.
user_year_input = input("Select a year (2003-2014): ").strip()
print()

# Categorize dataset by year ("Dates" holds strings, so compare as strings).
df_filtered_by_year = df[df["Dates"] == user_year_input].copy()

# Stop with a clear message when nothing matches: the scaling and clustering
# steps further down cannot be fitted on an empty frame.
if df_filtered_by_year.empty:
    available_years = ", ".join(sorted(df["Dates"].dropna().unique()))
    raise SystemExit(
        f"No data found for year {user_year_input!r}. "
        f"Available years: {available_years}"
    )

print(f"First five rows of the dataset for the year {user_year_input}: \n")
print(df_filtered_by_year.head())
print()
# Scale the data for accurate results
# Y is latitude and X is longitude...
# MinMaxScaler rescales every feature (column) independently, so a single
# fit on both columns yields the same values as fitting X and Y one by one.
scaler = MinMaxScaler()
scaled_coordinates = scaler.fit_transform(df_filtered_by_year[['X', 'Y']])
df_filtered_by_year['X_scaled'] = scaled_coordinates[:, 0]
df_filtered_by_year['Y_scaled'] = scaled_coordinates[:, 1]

# No cluster column exists at this point (it is added after the final
# K-Means fit), so the heading only mentions the scaled columns.
print("First five rows of the scaled dataset: \n")
print(df_filtered_by_year.head())
print()
# Check for number of clusters using Elbow Method
k_range = range(1, 15)

# The feature columns are the same for every K, so select them once instead
# of re-slicing the frame on each iteration.
elbow_features = df_filtered_by_year[['X_scaled', 'Y_scaled']]

# Inertia of a fitted model for each K in k_range, in the same order.
# A fixed random_state makes the centroid initialisation, and therefore the
# inertia curve, repeatable from run to run.
inertia_values = [
    KMeans(n_clusters=k, random_state=42).fit(elbow_features).inertia_
    for k in k_range
]
# Plotting the graph
def plot_k():
    """Show the elbow curve: inertia plotted against the number of clusters.

    Reads the module-level ``k_range`` and ``inertia_values`` and displays
    the figure with ``plt.show()``.
    """
    plt.plot(k_range, inertia_values)
    plt.ylabel('Inertia')
    plt.xlabel('K')
    plt.show()
# Ask the user if they want to see the plot
user_plot_response = input("Do you want to see the plot? (yes/no): ")

# Ignore case and surrounding whitespace, and accept the short form "y";
# anything else is treated as "no".
if user_plot_response.strip().lower() in ('yes', 'y'):
    plot_k()  # Call the plot function if the user wants to see the plot
else:
    print("Plot not displayed.")

# Tabulate the inertia recorded for each K so the elbow can also be read
# from the numbers.
df_inertia = pd.DataFrame({'K': k_range, 'Inertia': inertia_values})
print("Inertia values for different K values: \n")
print(df_inertia)
print()
# NOTE: Using Elbow Method, K = 5 (2014)
# NOTE(review): K was chosen from the 2014 curve; confirm it also suits the
# other selectable years.
# K-Means model for K = 5
# A fixed random_state keeps the cluster labels the same from run to run.
kmeans_final_model = KMeans(n_clusters=5, random_state=42)
predicted_clusters = kmeans_final_model.fit_predict(
    df_filtered_by_year[['X_scaled', 'Y_scaled']]
)

# Adding this new data as a column
df_filtered_by_year['cluster'] = predicted_clusters
print(df_filtered_by_year.head())