-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBike_Sharing_Algorithm.R
422 lines (286 loc) · 17.5 KB
/
Bike_Sharing_Algorithm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
# Step 1. Hypothesis Generation
# Hypotheses that could influence the demand for bikes:
# 1. Hourly trend: high demand during office timings; early morning and late
#    evening can show a different trend (cyclists), low demand 10:00 pm-4:00 am.
# 2. Daily trend: registered users demand more bikes on weekdays than on
#    weekends or holidays.
# 3. Rain: demand should be lower on a rainy day than a sunny day; similarly,
#    higher humidity should lower demand, and vice versa.
# 4. Temperature: in India temperature correlates negatively with bike demand,
#    but given Washington's temperature profile it may correlate positively here.
# 5. Pollution: soaring pollution levels may push people towards bikes
#    (driven by government/company policy or increased awareness).
# 6. Time: registered users should contribute a growing share of total demand,
#    since the registered user base grows over time.
# 7. Traffic: plausibly positively correlated with bike demand -- heavy traffic
#    may push people from cars/taxis to bikes.
# Step 2. Understanding the Data Set
# Step 3. Importing Data Set, Libraries and Basic Data Exploration

# NOTE(review): removed `rm(list = ls())` -- wiping the user's workspace from
# inside a script is an anti-pattern; run the script in a fresh R session instead.

# Load the required libraries.
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(randomForest)
library(caret)

# Project directory containing train.csv / test.csv.
setwd("C:/Bike Sharing Algorithm/")
# 1. Import the train and test data sets.
train <- read.csv("train.csv")
test <- read.csv("test.csv")
str(train)
str(test)

# 2. Combine train and test to study the distribution of the independent
#    variables together. The test set lacks the outcome columns, so add them
#    as zero-filled placeholders before stacking (the alternative would be
#    dropping those columns from train).
for (outcome in c("registered", "casual", "count")) {
  test[[outcome]] <- 0
}
data <- rbind(train, test)
str(data)
summary(data)

# Inspect the distinct values of the candidate categorical variables.
unique(data$atemp)
unique(data$season)
unique(data$weather)
unique(data$holiday)
unique(data$workingday)

# 3. Check for missing values, column by column.
colSums(is.na(data))
# 4. Understand the distribution of the numerical variables by plotting a
#    histogram for each one and examining its shape.
#
# par(mfrow = c(4, 2))
# par(mar = rep(2, 4))
hist(data$season)
hist(data$weather)
hist(data$humidity)
hist(data$holiday)
hist(data$workingday)
hist(data$temp)
hist(data$atemp)
hist(data$windspeed)

# Inferences from the histograms:
#   - season has four categories of roughly equal size;
#   - weather 1 (mostly clear weather) has the highest contribution.
prop.table(table(data$weather))
# As expected, mostly working days; `holiday` tells a similar story. A weekday
# flag could be derived from holiday + workingday.
# temp, atemp, humidity and windspeed look naturally distributed.

# 5. Convert the discrete variables into factors
#    (season, weather, holiday, workingday).
for (categorical in c("season", "weather", "holiday", "workingday")) {
  data[[categorical]] <- as.factor(data[[categorical]])
}
str(data)
# Step 4. Hypothesis Testing (using multivariate analysis)
# Now that we have a fair understanding of the data set, test the hypotheses
# generated earlier, plus some additional ones, one by one.
# 4.1 Hourly trend: there is no 'hour' variable yet, but it can be extracted
#     from the datetime column.
unique(data$datetime)
# datetime is formatted "YYYY-MM-DD HH:MM:SS", so the hour is characters 12-13.
# Fix: the original used substr(datetime, 12, 16), which grabbed "HH:MM"; the
# minutes are constant "00" and only added noise to the hour labels.
data$hour <- substr(data$datetime, 12, 13)
unique(data$hour)
data$hour <- as.factor(data$hour)
# 4.2 Plot the hourly trend of count. Separate train and test from the
#     combined set: days 1-19 of each month are train, day 20 onwards test,
#     so split on the day-of-month characters (9-10) of datetime.
train <- data[as.integer(substr(data$datetime, 9, 10)) < 20, ]
test <- data[as.integer(substr(data$datetime, 9, 10)) > 19, ]
# Boxplots of rental counts by hour.
boxplot(train$count ~ train$hour, xlab = "hour", ylab = "count of users")
# The bike demand over hours segregates into three rough categories:
#   High    : 7-9 and 17-19 hours
#   Average : 10-16 hours
#   Low     : 0-6 and 20-24 hours
# Look at registered and casual users separately as well.
boxplot(train$casual ~ train$hour, xlab = "hour", ylab = "casual users")
boxplot(train$registered ~ train$hour, xlab = "hour", ylab = "registered users")
# Registered users follow the same trend as count, while casual users behave
# differently -- so 'hour' is a significant variable and the hypothesis holds.
# The many extreme values are natural outliers (e.g. groups of unregistered
# people taking up cycling), not errors, so tame them with a log transform:
boxplot(log(train$count) ~ train$hour, xlab = "hour", ylab = "log(count)")
# 4.3 Daily trend: derive the day of week from the date part of datetime,
#     then plot demand against it.
dates <- substr(data$datetime, 1, 10)
data$day <- weekdays(as.Date(dates))
train <- data[as.integer(substr(data$datetime, 9, 10)) < 20, ]
test <- data[as.integer(substr(data$datetime, 9, 10)) > 19, ]
# Registered and casual users' demand over the days of the week.
boxplot(data$casual ~ data$day, xlab = "casual", ylab = "count of casual users")
boxplot(data$registered ~ data$day, xlab = "registered", ylab = "count of registered users")
# The plots suggest casual demand increases over the weekend.
# 4.4 Rain: there is no 'rain' variable, but 'weather' is a sufficient proxy:
#     weather 3 represents light rain and weather 4 heavy rain.
boxplot(data$casual ~ data$weather, xlab = "casual", ylab = "count of casual users")
boxplot(data$registered ~ data$weather, xlab = "registered", ylab = "count of registered users")
# 4.5 Temperature
boxplot(train$registered ~ train$temp, xlab = "temp", ylab = "registered users")
boxplot(train$casual ~ train$temp, xlab = "temp", ylab = "casual users")
# This clearly supports the temperature hypothesis.
# 4.6 Temperature, windspeed and humidity are continuous variables, so look
#     at the correlation matrix to validate the hypotheses.
sub <- data.frame(train$registered, train$casual, train$count, train$temp,
                  train$humidity, train$atemp, train$windspeed)
cor(sub)  # correlation matrix
# Inferences:
#   - temp is positively correlated with the dependent variables
#     (more so for casual than registered);
#   - atemp is highly correlated with temp;
#   - windspeed has lower correlation than temp and humidity.
# 4.7 Time: extract the year of each observation from datetime and look at
#     the trend of bike demand over the years.
data$year <- substr(data$datetime, 1, 4)
data$year <- as.factor(data$year)
train <- data[as.integer(substr(data$datetime, 9, 10)) < 20, ]
test <- data[as.integer(substr(data$datetime, 9, 10)) > 19, ]
# Boxplots of the dependent variables against several independent variables.
boxplot(train$registered ~ train$year, xlab = "year", ylab = "registered users")
boxplot(train$casual ~ train$year, xlab = "year", ylab = "casual users")
# Fix: the next two plots carried a copy-pasted wrong axis label
# (xlab said "year" while plotting against windspeed).
boxplot(train$registered ~ train$windspeed, xlab = "windspeed", ylab = "registered users")
boxplot(train$casual ~ train$windspeed, xlab = "windspeed", ylab = "casual users")
boxplot(train$registered ~ train$humidity, xlab = "humidity", ylab = "registered users")
boxplot(train$casual ~ train$humidity, xlab = "humidity", ylab = "casual users")
boxplot(train$count ~ train$year, xlab = "year", ylab = "count")
# 2012 shows higher bike demand than 2011.
# NOTE(review): `hour` is a factor here, so as.integer() returns the level
# codes (1..24), NOT the actual hours (0..23). The downstream hour buckets
# (dp_reg / dp_cas) appear to have been written against these codes, so the
# conversion is deliberately left unchanged -- confirm before "fixing" it to
# as.integer(as.character(data$hour)).
data$hour=as.integer(data$hour)
# 4.7 Monthly trend: the month is "MM" at characters 6-7 of datetime.
data$month <- as.integer(substr(data$datetime, 6, 7))
unique(data$month)
train <- data[as.integer(substr(data$datetime, 9, 10)) < 20, ]
test <- data[as.integer(substr(data$datetime, 9, 10)) > 19, ]
# Fix: these three plots had copy-pasted wrong x-axis labels
# ("humidity"/"year" instead of "month").
boxplot(train$registered ~ train$month, xlab = "month", ylab = "registered users")
boxplot(train$casual ~ train$month, xlab = "month", ylab = "casual users")
boxplot(train$count ~ train$month, xlab = "month", ylab = "count")
# 4.8 Pollution & traffic: no related variables exist in the data set, so
#     these hypotheses cannot be tested.
# Placeholder meant to divide the day into parts; ultimately unused.
data$day_part <- 0
train <- data[as.integer(substr(data$datetime, 9, 10)) < 20, ]
test <- data[as.integer(substr(data$datetime, 9, 10)) > 19, ]
data <- rbind(train, test)
# Step 5. Feature Engineering
# We already generated hour, month, day and year; now create more variables.
#
# Hour bins: the hour was broadly categorized into three buckets earlier.
# Here decision trees (rpart) pick data-driven hour bins, separately for
# registered and casual users; the tree's split points are then transcribed
# by hand into the bucket assignments below.
# Using decision trees for binning was a key feature-engineering step.
d=rpart(registered~hour,data=train)
fancyRpartPlot(d)
d1=rpart(casual~hour,data=train)
fancyRpartPlot(d1)
data=rbind(train,test)
str(data)
# Hour buckets for registered users, read off the tree nodes plotted above.
# NOTE(review): at this point `hour` holds factor level codes 1..24 (the
# factor was integer-coerced earlier), not hours 0..23; the thresholds below
# appear to be expressed on that code scale -- confirm before changing them.
# Later assignments overwrite earlier ones, so statement order matters
# (e.g. hour==8 lands in bucket 4 even though hour<8 ran first).
data$dp_reg=0
data$dp_reg[data$hour<8]=1
data$dp_reg[data$hour>=22]=2
data$dp_reg[data$hour>9 & data$hour<18]=3
data$dp_reg[data$hour==8]=4
data$dp_reg[data$hour==9]=5
data$dp_reg[data$hour==20 | data$hour==21]=6
data$dp_reg[data$hour==19 | data$hour==18]=7
unique(data$dp_reg)
# Hour buckets for casual users, from the casual-user tree.
data$dp_cas=0
data$dp_cas[data$hour<=8]=1
data$dp_cas[data$hour==9]=2
data$dp_cas[data$hour>=10 & data$hour<=19]=3
data$dp_cas[data$hour>19]=4
# Temp bins: analogous to the hour bins, grow decision trees on temperature
# for registered and casual users and transcribe their split points.
f <- rpart(registered ~ temp, data = train)
fancyRpartPlot(f)
f1 <- rpart(casual ~ temp, data = train)
fancyRpartPlot(f1)
# Bucket temperature from the tree cut points (temp_reg and temp_cas):
#   temp_reg: (<13, 13-23, 23-30, >=30) -> 1, 2, 3, 4
#   temp_cas: (<15, 15-23, 23-30, >=30) -> 1, 2, 3, 4
# findInterval(x, breaks) counts how many breaks x has reached, so +1 maps
# each temperature onto exactly the bucket numbers above.
data$temp_reg <- findInterval(data$temp, c(13, 23, 30)) + 1
data$temp_cas <- findInterval(data$temp, c(15, 23, 30)) + 1
# Year bins: demand grows over time (shown above), so bin the two years into
# 8 quarterly buckets: Jan-Mar 2011 = 1 ... Oct-Dec 2012 = 8.
# Fix: initialize the column before filling it (the original called
# unique(data$year_part) on a column that did not exist yet). Within each
# year, later assignments deliberately overwrite earlier ones.
data$year_part <- 0
data$year_part[data$year == '2011'] <- 1
data$year_part[data$year == '2011' & data$month > 3] <- 2
data$year_part[data$year == '2011' & data$month > 6] <- 3
data$year_part[data$year == '2011' & data$month > 9] <- 4
data$year_part[data$year == '2012'] <- 5
data$year_part[data$year == '2012' & data$month > 3] <- 6
data$year_part[data$year == '2012' & data$month > 6] <- 7
data$year_part[data$year == '2012' & data$month > 9] <- 8
table(data$year_part)
# Day type: categorize each day as "weekend", "holiday" or "working day" --
# weekdays vs weekends matter in deciding rentals.
data$day_type <- ""
data$day_type[data$holiday == 0 & data$workingday == 0] <- "weekend"
data$day_type[data$holiday == 1] <- "holiday"
data$day_type[data$holiday == 0 & data$workingday == 1] <- "working day"
# Weekend: separate 0/1 indicator.
data$weekend <- 0
data$weekend[data$day == "Sunday" | data$day == "Saturday"] <- 1
train <- data[as.integer(substr(data$datetime, 9, 10)) < 20, ]
test <- data[as.integer(substr(data$datetime, 9, 10)) > 19, ]
plot(train$temp, train$count)
data <- rbind(train, test)
str(data)
unique(data$windspeed)
sum(is.na(data$windspeed))  # check for missing values
table(data$windspeed == 0)
# windspeed == 0 is treated as "missing": split the data on that condition and
# predict the zero values from the rows with observed windspeed. Imputing via
# a random forest model, rather than a simple mean/median fill.
zero_wind <- data$windspeed == 0
wind_0 <- subset(data, zero_wind)    # rows to impute
wind_1 <- subset(data, !zero_wind)   # rows used for training
str(data)
set.seed(415)
# Fix: fit the imputation model here instead of loading a pre-fitted model
# from a hard-coded absolute path outside the project directory (the original
# fitting code was commented out, making the script non-reproducible).
wind_fit <- randomForest(windspeed ~ season + weather + humidity + month +
                           temp + year + atemp,
                         data = wind_1, importance = TRUE, ntree = 250)
saveRDS(wind_fit, "RandomForest_Wind.rds")  # persist the model for reuse
wind_0$windspeed <- predict(wind_fit, wind_0)
data <- rbind(wind_0, wind_1)
# Recompute the weekend flag on the reassembled frame.
data$weekend <- 0
data$weekend[data$day == "Sunday" | data$day == "Saturday"] <- 1
str(data)
# Convert all relevant categorical variables into factors to feed the random
# forest models.
data$season <- as.factor(data$season)
data$holiday <- as.factor(data$holiday)
data$workingday <- as.factor(data$workingday)
data$weather <- as.factor(data$weather)
data$hour <- as.factor(data$hour)
data$month <- as.factor(data$month)
# Fix: the original did data$day_part = as.factor(data$dp_cas) and
# data$day_type = as.factor(data$dp_reg), clobbering day_type (the
# weekend/holiday/working-day feature the final models use) with the hour
# buckets. Factor each variable in place instead.
data$dp_cas <- as.factor(data$dp_cas)
data$dp_reg <- as.factor(data$dp_reg)
data$day_type <- as.factor(data$day_type)
data$day <- as.factor(data$day)
data$temp_cas <- as.factor(data$temp_cas)
data$temp_reg <- as.factor(data$temp_reg)
# Fix: re-split train/test AFTER the windspeed imputation and the factor
# conversions above, so the modelling sets actually carry the engineered
# features (the previous split predates both).
train <- data[as.integer(substr(data$datetime, 9, 10)) < 20, ]
test <- data[as.integer(substr(data$datetime, 9, 10)) > 19, ]
# The dependent variables have natural outliers (skewed distributions), so
# model the logs: y1 = log(casual + 1), y2 = log(registered + 1); the +1
# handles zero counts.
hist(train$registered)
hist(train$casual)
hist(train$count)
train$reg1 <- train$registered + 1
train$cas1 <- train$casual + 1
train$logcas <- log(train$cas1)
train$logreg <- log(train$reg1)
test$logreg <- 0
test$logcas <- 0
boxplot(train$logreg ~ train$weather, xlab = "weather", ylab = "registered users")
boxplot(train$logreg ~ train$season, xlab = "season", ylab = "registered users")
# Step 6. Model Building
# After trying decision trees, conditional inference trees and random forests,
# random forest performed best. Two separate models are built -- one for
# registered and one for casual users -- which gave the best results after
# experimentation. (Regression, boosted regression or neural nets are other
# options worth exploring.)
set.seed(415)
fit1 <- randomForest(logreg ~ hour + workingday + day + holiday + day_type +
                       temp_reg + humidity + atemp + windspeed + season +
                       weather + dp_reg + weekend + year + year_part,
                     data = train, importance = TRUE, ntree = 250)
saveRDS(fit1, "RandomForest_1.rds")  # persist the registered-users model
# Fix: predict straight from the fitted object instead of re-loading it via
# readRDS() from a hard-coded absolute path (which broke portability).
test$logreg <- predict(fit1, test)

set.seed(415)
fit2 <- randomForest(logcas ~ hour + day_type + day + humidity + atemp +
                       temp_cas + windspeed + season + weather + holiday +
                       workingday + dp_cas + weekend + year + year_part,
                     data = train, importance = TRUE, ntree = 250)
saveRDS(fit2, "RandomForest_2.rds")  # persist the casual-users model
test$logcas <- predict(fit2, test)

# Create the final submission file: invert the log transform (exp(y) - 1)
# and sum the two user-type predictions into the total count.
test$registered <- exp(test$logreg) - 1
test$casual <- exp(test$logcas) - 1
test$count <- test$casual + test$registered
submission <- data.frame(datetime = test$datetime, count = test$count)
write.csv(submission, file = "submit.csv", row.names = FALSE)