# ver2.py — GA customer-revenue LightGBM baseline.
# (GitHub page chrome and the 1..148 line-number gutter from the scrape removed.)
#!/usr/bin/env python3
#coding=utf-8
import time
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from sklearn import preprocessing
from sklearn import metrics
import lightgbm as lgb
import datetime
def load_df(csv_path='train_v2.csv', nrows=None):
    """Load a Google-Analytics customer-revenue CSV and flatten its JSON columns.

    Each of the four JSON-encoded columns is parsed with ``json.loads`` at read
    time, expanded into one column per sub-field (named ``"<col>.<subcol>"``),
    and merged back onto the frame in place of the original column.

    Parameters
    ----------
    csv_path : str
        Path to the CSV file to load (defaults to the training file).
    nrows : int or None
        Optional row limit passed straight through to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The flattened frame; ``fullVisitorId`` is kept as a string so long IDs
        do not lose precision as integers.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'},  # Important!! IDs overflow as ints
                     nrows=nrows)
    for column in JSON_COLUMNS:
        # pd.json_normalize replaces the deprecated pandas.io.json.json_normalize
        # (removed in modern pandas); identical output for these flat dicts.
        column_as_df = pd.json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        # Merge on the index so flattened sub-columns line up row-for-row.
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df
# custom function to run light gbm model
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    """Train a LightGBM regressor with early stopping and predict.

    Parameters
    ----------
    train_X, train_y : training features and (log-revenue) targets.
    val_X, val_y : validation features/targets used for early stopping.
    test_X : features to score with the best iteration.

    Returns
    -------
    tuple
        ``(pred_test_y, model, pred_val_y)`` — test predictions, the fitted
        booster, and validation predictions, all at ``model.best_iteration``.
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "num_leaves": 30,
        "min_child_samples": 100,
        "learning_rate": 0.1,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_seed": 2018,
        "verbosity": -1
    }
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    # LightGBM >= 4.0 removed the early_stopping_rounds / verbose_eval kwargs
    # from lgb.train; the callback equivalents below behave identically and
    # also work on 3.x.
    model = lgb.train(params, lgtrain, num_boost_round=1000,
                      valid_sets=[lgval],
                      callbacks=[lgb.early_stopping(stopping_rounds=100),
                                 lgb.log_evaluation(period=100)])
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    pred_val_y = model.predict(val_X, num_iteration=model.best_iteration)
    return pred_test_y, model, pred_val_y
if __name__ == "__main__":
    train_df = load_df()
    test_df = load_df("test_v2.csv")
    print(train_df.head())

    # Target prep:
    # - totals.transactionRevenue is cast to float (it arrives as strings/NaN)
    # - gdf: total revenue per user
    # - nzi: number of train rows with a non-null revenue value
    # - nzr: number of unique customers with positive total revenue
    train_df["totals.transactionRevenue"] = train_df["totals.transactionRevenue"].astype('float')
    test_df["totals.transactionRevenue"] = test_df["totals.transactionRevenue"].astype('float')
    gdf = train_df.groupby("fullVisitorId")["totals.transactionRevenue"].sum().reset_index()
    nzi = pd.notnull(train_df["totals.transactionRevenue"]).sum()
    nzr = (gdf["totals.transactionRevenue"] > 0).sum()

    # Drop constant columns, and the one column present in train but not test.
    const_cols = [c for c in train_df.columns if train_df[c].nunique(dropna=False) == 1]
    print(const_cols)
    print("Variables not in test but in train : ", set(train_df.columns).difference(set(test_df.columns)))
    train_df = train_df.drop(const_cols + ["trafficSource.campaignCode"], axis=1)
    test_df = test_df.drop(const_cols, axis=1)

    # Impute 0 for missing target values (no transaction == zero revenue).
    train_df["totals.transactionRevenue"].fillna(0, inplace=True)
    test_df["totals.transactionRevenue"].fillna(0, inplace=True)
    train_y = train_df["totals.transactionRevenue"].values
    train_id = train_df["fullVisitorId"].values
    test_id = test_df["fullVisitorId"].values

    # Label-encode the categorical variables; encoders are fit on the union of
    # train and test values so unseen-in-train categories still transform.
    cat_cols = ["channelGrouping", "device.browser",
                "device.deviceCategory", "device.operatingSystem",
                "geoNetwork.city", "geoNetwork.continent",
                "geoNetwork.country", "geoNetwork.metro",
                "geoNetwork.networkDomain", "geoNetwork.region",
                "geoNetwork.subContinent", "trafficSource.adContent",
                "trafficSource.adwordsClickInfo.adNetworkType",
                "trafficSource.adwordsClickInfo.gclId",
                "trafficSource.adwordsClickInfo.page",
                "trafficSource.adwordsClickInfo.slot", "trafficSource.campaign",
                "trafficSource.keyword", "trafficSource.medium",
                "trafficSource.referralPath", "trafficSource.source",
                'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.isTrueDirect']
    for col in cat_cols:
        print(col)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
        train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
        test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))

    # Convert the numerical variables to float.
    num_cols = ["totals.hits", "totals.pageviews", "visitNumber", "visitStartTime",
                'totals.bounces', 'totals.newVisits']
    for col in num_cols:
        train_df[col] = train_df[col].astype(float)
        test_df[col] = test_df[col].astype(float)

    # Time-based split: dev <= 2017-05-31 < val (dates are YYYYMMDD ints).
    split_date = int(datetime.date(2017, 5, 31).strftime("%Y%m%d"))
    dev_df = train_df[train_df['date'] <= split_date]
    val_df = train_df[train_df['date'] > split_date]
    dev_y = np.log1p(dev_df["totals.transactionRevenue"].values)
    val_y = np.log1p(val_df["totals.transactionRevenue"].values)
    dev_X = dev_df[cat_cols + num_cols]
    val_X = val_df[cat_cols + num_cols]
    test_X = test_df[cat_cols + num_cols]

    # Training the model
    pred_test, model, pred_val = run_lgb(dev_X, dev_y, val_X, val_y, test_X)

    # Validation RMSE on per-user log1p(total revenue).  Negative raw
    # predictions are clipped to 0 before expm1.
    pred_val[pred_val < 0] = 0
    val_pred_df = pd.DataFrame({"fullVisitorId": val_df["fullVisitorId"].values})
    print(val_pred_df.head())
    val_pred_df["transactionRevenue"] = val_df["totals.transactionRevenue"].values
    print(val_pred_df.head())
    val_pred_df["PredictedRevenue"] = np.expm1(pred_val)
    print(val_pred_df.head())
    # NOTE: column selection must be a list; the old ["a", "b"] tuple form was
    # removed from pandas groupby indexing.
    val_pred_df = val_pred_df.groupby("fullVisitorId")[["transactionRevenue", "PredictedRevenue"]].sum().reset_index()
    print(val_pred_df.head())
    print(np.sqrt(metrics.mean_squared_error(np.log1p(val_pred_df["transactionRevenue"].values),
                                             np.log1p(val_pred_df["PredictedRevenue"].values))))

    # Build the submission: aggregate predicted revenue per user, then take
    # log1p for the PredictedLogRevenue column; also report test RMSE.
    sub_df = pd.DataFrame({"fullVisitorId": test_id})
    print(sub_df.head())
    pred_test[pred_test < 0] = 0
    sub_df["transactionRevenue"] = test_df["totals.transactionRevenue"].values
    print(sub_df.head())
    sub_df["PredictedLogRevenue"] = np.expm1(pred_test)
    print(sub_df.head())
    sub_df = sub_df.groupby("fullVisitorId")[["transactionRevenue", "PredictedLogRevenue"]].sum().reset_index()
    print(sub_df.head())
    sub_df.columns = ["fullVisitorId", "transactionRevenue", "PredictedLogRevenue"]
    sub_df["PredictedLogRevenue"] = np.log1p(sub_df["PredictedLogRevenue"])
    sub_df.to_csv("baseline_lgb.csv", index=False)
    result = np.sqrt(metrics.mean_squared_error(np.log1p(sub_df["transactionRevenue"].values),
                                                sub_df["PredictedLogRevenue"].values))
    # BUG FIX: the original `open(output.txt, "w")` referenced an undefined
    # name `output`; the filename must be the string "output.txt".
    with open("output.txt", "w") as text_file:
        print(result, file=text_file)
    print(result)