-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmultivariate.py
604 lines (470 loc) · 30.9 KB
/
multivariate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import preprocessing
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMAResults
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import statsmodels.api as sm
from scipy.stats import ttest_ind
##### CORRELATION #####
def compute_covariance(cov_df, index_df, variables):
covariances = []
start_date = index_df.index[0].split("-")
year_df_first = start_date[0]
month_df_first = start_date[1]
end_date = index_df.index[-1].split("-")
year_df_end = end_date[0]
month_df_end = end_date[1]
for variable in variables:
last_date = variable.index[-1].split("-")
year_var_end = last_date[0]
month_var_end = last_date[1]
first_date = variable.index[0].split("-")
year_var_first = first_date[0]
month_var_first = first_date[1]
if (len(variable) > len(index_df)): # If the variable has more data than the index and ends later
if int(year_var_end) < int(year_df_end): # If the variable ends before the index
last_row = variable.loc[year_var_end+"-"+month_var_end:].index[0]
variable = variable.loc[year_df_first+"-"+month_df_first:last_row]
last_row = index_df.loc[year_var_end+"-"+month_var_end:].index[0]
index_df_cov = index_df.loc[year_df_first+"-"+month_df_first:last_row]
cov = np.cov(index_df_cov, variable)[0][1]
else:
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
variable = variable.loc[year_df_first+"-"+month_df_first:last_row]
cov = np.cov(index_df, variable)[0][1]
if len(variable) < len(index_df): # If the variable has less data than the index
if int(year_df_end) <= int(year_var_end) and int(month_df_end) < int(month_var_end): # If the variable ends after the index
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
variable = variable.loc[year_var_first+"-"+month_var_first:last_row]
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
index_df_cov = index_df.loc[year_var_first+"-"+month_var_first:last_row]
if index_df_cov.index[-1].split("-")[1] != variable.index[-1].split("-")[1]: # If slice ends in different month
index_df_cov = index_df_cov[:-1]
cov = np.cov(index_df_cov, variable)[0][1]
elif int(year_df_end) < int(year_var_end): # If the variable after the index
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
variable = variable.loc[year_var_first+"-"+month_var_first:last_row]
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
index_df_cov = index_df.loc[year_var_first+"-"+month_var_first:last_row]
cov = np.cov(index_df_cov, variable)[0][1]
else: # If the variable ends before the index
last_row = index_df.loc[year_var_end+"-"+month_var_end:].index[0]
index_df_cov = index_df.loc[year_var_first+"-"+month_var_first:last_row]
cov = np.cov(index_df_cov, variable)[0][1]
covariances.append(cov)
cov_df.loc[len(cov_df)] = [covariances[0], covariances[1], covariances[2], covariances[3], covariances[4], covariances[5], covariances[6]]
return cov_df
def compute_pearson_coeff(pearson_df, index_df, variables):
pearson_coeffs = []
start_date = index_df.index[0].split("-")
year_df_first = start_date[0]
month_df_first = start_date[1]
end_date = index_df.index[-1].split("-")
year_df_end = end_date[0]
month_df_end = end_date[1]
for variable in variables:
last_date = variable.index[-1].split("-")
year_var_end = last_date[0]
month_var_end = last_date[1]
first_date = variable.index[0].split("-")
year_var_first = first_date[0]
month_var_first = first_date[1]
if (len(variable) > len(index_df)): # If the variable has more data than the index and ends later
if int(year_var_end) < int(year_df_end): # If the variable ends before the index
last_row = variable.loc[year_var_end+"-"+month_var_end:].index[0]
variable = variable.loc[year_df_first+"-"+month_df_first:last_row]
last_row = index_df.loc[year_var_end+"-"+month_var_end:].index[0]
index_df_coef = index_df.loc[year_df_first+"-"+month_df_first:last_row]
# Substract the sample mean
index_df_coef = index_df_coef - index_df_coef.mean()
variable = variable - variable.mean()
coef = np.corrcoef(index_df_coef, variable)[0][1]
else:
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
variable = variable.loc[year_df_first+"-"+month_df_first:last_row]
# Substract the sample mean
index_df_coef = index_df - index_df.mean()
variable = variable - variable.mean()
coef = np.corrcoef(index_df_coef, variable)[0][1]
if len(variable) < len(index_df): # If the variable has less data than the index
if int(year_df_end) <= int(year_var_end) and int(month_df_end) < int(month_var_end): # If the variable ends after the index
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
variable = variable.loc[year_var_first+"-"+month_var_first:last_row]
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
index_df_cov = index_df.loc[year_var_first+"-"+month_var_first:last_row]
if index_df_cov.index[-1].split("-")[1] != variable.index[-1].split("-")[1]: # If slice ends in different month
index_df_cov = index_df_cov[:-1]
# Substract the sample mean
index_df_cov = index_df_cov - index_df_cov.mean()
variable = variable - variable.mean()
coef = np.corrcoef(index_df_cov, variable)[0][1]
elif int(year_df_end) < int(year_var_end): # If the variable ends after the index
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
variable = variable.loc[year_var_first+"-"+month_var_first:last_row]
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
index_df_cov = index_df.loc[year_var_first+"-"+month_var_first:last_row]
# Substract the sample mean
index_df_cov = index_df_cov - index_df_cov.mean()
variable = variable - variable.mean()
coef = np.corrcoef(index_df_cov, variable)[0][1]
else: # If the variable ends before the index
last_row = index_df.loc[year_var_end+"-"+month_var_end:].index[0]
index_df_cov = index_df.loc[year_var_first+"-"+month_var_first:last_row]
# Substract the sample mean
index_df_cov = index_df_cov - index_df_cov.mean()
variable = variable - variable.mean()
coef = np.corrcoef(index_df_cov, variable)[0][1]
pearson_coeffs.append(coef)
pearson_df.loc[len(pearson_df)] = [pearson_coeffs[0], pearson_coeffs[1], pearson_coeffs[2], pearson_coeffs[3], pearson_coeffs[4], pearson_coeffs[5], pearson_coeffs[6]]
return pearson_df
def compute_t_test(index_df, variable, significance_level=0.05):
t_stat, p_val = ttest_ind(index_df, variable)
print(f"t-statistic: {t_stat}")
print(f"P-value: {p_val}")
if p_val < significance_level:
# Reject the null hypothesis
print("The means differ significantly, correlation is significant and tells us the direction")
else:
# Accept the null hypothesis
print("There is no significant difference between the means, correlation is not significant")
return t_stat, p_val
##### MODELS #####
def create_model(train, order, exogenous_var, seasonal_order=None, index='wine'):
if seasonal_order == None: # ARIMAX Model
model = ARIMA(train, trend='n', order=order,
exog=exogenous_var,
enforce_stationarity=True,
enforce_invertibility=True)
fit_results = model.fit()
fit_results.save(f'models\{index}_arimax.pkl') # Comment this when evaluating multiple models
else: # SARIMAX Model
model = SARIMAX(train, trend='n', order=order,
exog=exogenous_var,
seasonal_order=seasonal_order,
enforce_stationarity=True,
enforce_invertibility=True)
model.initialize_approximate_diffuse() # Avoid LU Decomposition error when searching for optimal parameters
fit_results = model.fit()
fit_results.save(f'models\{index}_sarimax.pkl') # Comment this when evaluating multiple models
training_residuals = fit_results.resid
return fit_results, training_residuals
def test_model(test, test_exog, model=None, seasonal=False, index='wine'): # Testing data
if model == None and seasonal == False: # ARIMA Model
model = ARIMAResults.load(f'models\{index}_arimax.pkl')
elif model == None and seasonal == True: # SARIMA Model
model = ARIMAResults.load(f'models\{index}_sarimax.pkl')
# Testing Forecast make sure test_exog has same length as test.shape[0]
forecast_steps = test.shape[0]
forecast = model.get_forecast(steps=forecast_steps, exog=test_exog)
forecast_ci = forecast.conf_int()
yhat_test = forecast.predicted_mean.values # Apply the exp transformation if you used log transform before to invert scales back
y_test = test
baseline = np.full(len(y_test), y_test[0])
baseline_mean = np.full(len(y_test), y_test.mean())
# Evaluate the model
mae = mean_absolute_error(y_test, yhat_test)
mse = mean_squared_error(y_test, yhat_test)
mae_baseline = mean_absolute_error(y_test, baseline)
mse_baseline = mean_squared_error(y_test, baseline)
mae_baseline_mean = mean_absolute_error(y_test, baseline_mean)
mse_baseline_mean = mean_squared_error(y_test, baseline_mean)
rmse = np.sqrt(mse)
rmse_baseline = np.sqrt(mse_baseline)
rmse_baseline_mean = np.sqrt(mse_baseline_mean)
mape = np.mean(np.abs((y_test - yhat_test) / y_test)) * 100
mape_baseline = np.mean(np.abs((y_test - baseline) / y_test)) * 100
mape_baseline_mean = np.mean(np.abs((y_test - baseline_mean) / y_test)) * 100
# Plot the results
plt.plot(yhat_test, color="green", label="predicted") # Comment this when evaluating multiple models
plt.plot(y_test, color="blue", label="observed") # Comment this when evaluating multiple models
plt.plot(baseline, color="red", label="baseline") # Comment this when evaluating multiple models
plt.plot(baseline_mean, color="purple", label="mean") # Comment this when evaluating multiple models
plt.legend(loc='best') # Comment this when evaluating multiple models
plt.title(f'Compare forecasted and observed {index} index values for test set') # Comment this when evaluating multiple models
plt.xticks([0, len(y_test)/2, len(y_test)-1]) # Comment this when evaluating multiple models
plt.xlabel('Time') # Comment this when evaluating multiple models
plt.ylabel('Index value') # Comment this when evaluating multiple models
plt.show() # Comment this when evaluating multiple models
return yhat_test, mae, mse, mae_baseline, mse_baseline, mae_baseline_mean, mse_baseline_mean, rmse, rmse_baseline, rmse_baseline_mean, mape, mape_baseline, mape_baseline_mean
def split_cross_validation(data, order, exog, index='wine', seasonal_order=None, seasonal=False):
# Not using blocked cross-validation because there is not enough data for sufficient blocks
# Using split cross validation instead with an 80/20 ratio at each split
mae_l = []
mse_l = []
rmse_l = []
mape_l = []
aic_l = []
bic_l = []
mae_l_bas = []
mse_l_bas = []
rmse_l_bas = []
mape_l_bas = []
mae_l_mean = []
mse_l_mean = []
rmse_l_mean = []
mape_l_mean = []
splits = [0.5, 0.65, 0.85, 1.0]
for split in splits:
split_data = data[:int(split*len(data))]
train = split_data[:int(0.8*len(split_data))]
test = split_data[int(0.8*len(split_data)):]
split_exog = exog[:int(split*len(exog))]
exog_train = split_exog[:int(0.8*len(split_exog))]
exog_test = split_exog[int(0.8*len(split_exog)):]
fit_results, _ = create_model(train, order, exog_train, seasonal_order, index)
_, mae, mse, mae_baseline, mse_baseline, mae_baseline_mean, mse_baseline_mean, rmse, rmse_baseline, rmse_baseline_mean, mape, mape_baseline, mape_baseline_mean = test_model(test, exog_test, fit_results, seasonal, index)
# Model Evaluation Metrics
mae_l.append(mae)
mse_l.append(mse)
rmse_l.append(rmse)
mape_l.append(mape)
aic_l.append(fit_results.aic)
bic_l.append(fit_results.bic)
# Baseline Evaluation Metrics
mae_l_bas.append(mae_baseline)
mse_l_bas.append(mse_baseline)
rmse_l_bas.append(rmse_baseline)
mape_l_bas.append(mape_baseline)
# Mean Evaluation Metrics
mae_l_mean.append(mae_baseline_mean)
mse_l_mean.append(mse_baseline_mean)
rmse_l_mean.append(rmse_baseline_mean)
mape_l_mean.append(mape_baseline_mean)
# Return all eval metrics
return np.mean(aic_l), np.mean(bic_l), np.mean(mae_l), np.mean(mse_l), np.mean(rmse_l), np.mean(mape_l), np.mean(mae_l_bas), np.mean(mse_l_bas), np.mean(rmse_l_bas), np.mean(mape_l_bas), np.mean(mae_l_mean), np.mean(mse_l_mean), np.mean(rmse_l_mean), np.mean(mape_l_mean)
def evaluate_model_with_Plots(data, candidates, eval_df, exog, seasonal=False, index='wine', arima_order=None):
# Take the model with the lowest eval metrics and errors
for candidate in candidates:
if seasonal == False:
# Split cross validation
aic, bic, mae, mse, rmse, mape, mae_bas, mse_bas, rmse_bas, mape_bas, mae_mean, mse_mean, rmse_mean, mape_mean = split_cross_validation(data, candidate, exog, index, None, seasonal)
# Store evaluation information (those are already avg calculated in the split cross validation function)
eval_df.loc[len(eval_df)] = [candidate, None, aic, bic, mae, mse, rmse, mape]
else:
# Split cross validation
aic, bic, mae, mse, rmse, mape, mae_bas, mse_bas, rmse_bas, mape_bas, mae_mean, mse_mean, rmse_mean, mape_mean = split_cross_validation(data, order=arima_order, exog=exog, index=index, seasonal_order=candidate, seasonal=seasonal)
# Store evaluation information (those are already avg calculated in the split cross validation function)
eval_df.loc[len(eval_df)] = [arima_order, candidate, aic, bic, mae, mse, rmse, mape]
print("MAE Baseline:", mae_bas)
print("MSE Baseline:", mse_bas)
print("RMSE Baseline:", rmse_bas)
print("MAPE % Baseline:", mape_bas)
print("MAE Mean:", mae_mean)
print("MSE Mean:", mse_mean)
print("RMSE Mean:", rmse_mean)
print("MAPE % Mean:", mape_mean)
return eval_df
def align_data(index_df, exog):
start_date = index_df.index[0].split("-")
year_df_first = start_date[0]
month_df_first = start_date[1]
end_date = index_df.index[-1].split("-")
year_df_end = end_date[0]
month_df_end = end_date[1]
last_date = exog.index[-1].split("-")
year_var_end = last_date[0]
month_var_end = last_date[1]
first_date = exog.index[0].split("-")
year_var_first = first_date[0]
month_var_first = first_date[1]
if (len(exog) > len(index_df)): # If the variable has more data than the index and ends later
if int(year_var_end) < int(year_df_end): # If the variable ends before the index
last_row = exog.loc[year_var_end+"-"+month_var_end:].index[0]
exog = exog.loc[year_df_first+"-"+month_df_first:last_row]
last_row = index_df.loc[year_var_end+"-"+month_var_end:].index[0]
index_df = index_df.loc[year_df_first+"-"+month_df_first:last_row]
else:
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
exog = exog.loc[year_df_first+"-"+month_df_first:last_row]
if len(exog) < len(index_df): # If the variable has less data than the index
if int(year_df_end) <= int(year_var_end) and int(month_df_end) < int(month_var_end): # If the variable ends after the index
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
exog = exog.loc[year_var_first+"-"+month_var_first:last_row]
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
index_df = index_df.loc[year_var_first+"-"+month_var_first:last_row]
if index_df.index[-1].split("-")[1] != exog.index[-1].split("-")[1]: # If slice ends in different month
index_df = index_df[:-1]
elif int(year_df_end) < int(year_var_end): # If the variable ends after the index
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
exog = exog.loc[year_var_first+"-"+month_var_first:last_row]
last_row = index_df.loc[year_df_end+"-"+month_df_end:].index[0]
index_df = index_df.loc[year_var_first+"-"+month_var_first:last_row]
else: # If the variable ends before the index
last_row = index_df.loc[year_var_end+"-"+month_var_end:].index[0]
index_df = index_df.loc[year_var_first+"-"+month_var_first:last_row]
# They now have the same length, but not exactly the same dates day for day
exog.index = index_df.index
return index_df, exog
def forecast_model(data, test, exog_data, forecast_steps, length, end_date, model=None, seasonal=False, index='wine'):
if model == None and seasonal == False: # ARIMAX Model
model = ARIMAResults.load(f'models\{index}_arimax.pkl')
elif model == None and seasonal == True: # SARIMAX Model
model = ARIMAResults.load(f'models\{index}_sarimax.pkl')
forecast = model.get_forecast(steps=forecast_steps, exog=exog_data)
forecast_ci = forecast.conf_int()
yhat = forecast.predicted_mean.values # Apply the exp transformation if you used log transform during fit before to invert scales back
if index=='wine' or index=='wine_residuals':
x_axis = pd.date_range(start=data.index[0], end=data.index[-1], freq = 'M')
x_axis_forecast = pd.date_range(start=test.index[0], end = end_date, freq = 'M')
elif index=='watch' or index=='watch_residuals':
x_axis = pd.date_range(start=data.index[0], end=data.index[-1], freq = 'MS')
x_axis_forecast = pd.date_range(start=test.index[0], end = end_date, freq = 'MS')
else: # index=='art' or index=='art_residuals'
x_axis = pd.date_range(start=data.index[0], end=data.index[-1], freq = 'MS')
x_axis_forecast = pd.date_range(start=test.index[0], end = end_date, freq = 'MS')
plt.plot(x_axis, data.values, color="blue", label="observed data")
plt.plot(x_axis_forecast, yhat, color="red", label="forecast", linestyle="--")
plt.legend(loc='best')
plt.title(f'{length} term forecast of {index} index values using exogenous variable')
plt.xlabel('Time')
plt.ylabel('Index value')
plt.show()
return yhat
def forecast_exog(train_data, test_data, forecast_length, method="rolling_window", window_size=1):
# Window size must be > 1.
forecast = [x for x in test_data]
history = [x for x in train_data]
history = np.append(history,forecast)
if method == "rolling_window": # Rolling_window mean method
for i in range(forecast_length):
rolling_window = history[len(history) - window_size:len(history)]
mu = np.mean(rolling_window)
# Update the training data
forecast = np.append(forecast, mu)
history = np.append(history, forecast)
elif method == "mean":
window_mean = history[len(history) - window_size:len(history)]
forecast_mean = np.full((forecast_length-len(test_data)), np.mean(window_mean))
forecast = np.append(forecast, forecast_mean)
if len(forecast) > forecast_length:
forecast = forecast[:forecast_length]
return forecast
##### MAIN #####
## Load the data from pre-processing ##
wine_df, watch_df, art_df, gold_df, sp500_df, cpi_df, bond_yield_df, crude_oil_df, spRealEstate_df, usdx_df = preprocessing.main(univariate=False)
# Compute covariance matrix between each pair of correlated variables and index
# Log transform the data to stabilize variance and get more accurate covariance
wine_cov = pd.DataFrame(columns = ["Gold", "SP500", "CPI", "Bond Yield", "Crude Oil", "S&P US Real Estate", "USDX"])
wine_cov = compute_covariance(wine_cov, np.log(wine_df.observed), [np.log(gold_df.observed), np.log(sp500_df.observed), np.log(cpi_df.observed), bond_yield_df.observed, np.log(crude_oil_df.observed), np.log(spRealEstate_df.observed), np.log(usdx_df.observed)])
watch_cov = pd.DataFrame(columns = ["Gold", "SP500", "CPI", "Bond Yield", "Crude Oil", "S&P US Real Estate", "USDX"])
watch_cov = compute_covariance(watch_cov, np.log(watch_df.observed), [np.log(gold_df.observed), np.log(sp500_df.observed), np.log(cpi_df.observed), bond_yield_df.observed, np.log(crude_oil_df.observed), np.log(spRealEstate_df.observed), np.log(usdx_df.observed)])
art_cov = pd.DataFrame(columns = ["Gold", "SP500", "CPI", "Bond Yield", "Crude Oil", "S&P US Real Estate", "USDX"])
art_cov = compute_covariance(art_cov, np.log(art_df.observed), [ np.log(gold_df.observed), np.log(sp500_df.observed), np.log(cpi_df.observed), bond_yield_df.observed, np.log(crude_oil_df.observed), np.log(spRealEstate_df.observed), np.log(usdx_df.observed)])
# print(wine_cov)
# print(watch_cov)
# print(art_cov)
# Big covariance between Gold and Wine, SP500, SP US Real Estate and Watch, SP500 and Art
# Bond Yield has a negative covariance with all indexes, and is a bit biased, because I cannot log transform it since it has negative values
# Compute Pearson correlation coefficient between each pair of correlated variables and index
# Log transform the data to stabilize variance and get more accurate coefficient
wine_pearson_coeff = pd.DataFrame(columns = ["Gold", "SP500", "CPI", "Bond Yield", "Crude Oil", "S&P US Real Estate", "USDX"])
wine_pearson_coeff = compute_pearson_coeff(wine_pearson_coeff, np.log(wine_df.observed), [np.log(gold_df.observed), np.log(sp500_df.observed), np.log(cpi_df.observed), bond_yield_df.observed, np.log(crude_oil_df.observed), np.log(spRealEstate_df.observed), np.log(usdx_df.observed)])
watch_pearson_coeff = pd.DataFrame(columns = ["Gold", "SP500", "CPI", "Bond Yield", "Crude Oil", "S&P US Real Estate", "USDX"])
watch_pearson_coeff = compute_pearson_coeff(watch_pearson_coeff, np.log(watch_df.observed), [np.log(gold_df.observed), np.log(sp500_df.observed), np.log(cpi_df.observed), bond_yield_df.observed, np.log(crude_oil_df.observed), np.log(spRealEstate_df.observed), np.log(usdx_df.observed)])
art_pearson_coeff = pd.DataFrame(columns = ["Gold", "SP500", "CPI", "Bond Yield", "Crude Oil", "S&P US Real Estate", "USDX"])
art_pearson_coeff = compute_pearson_coeff(art_pearson_coeff, np.log(art_df.observed), [np.log(gold_df.observed), np.log(sp500_df.observed), np.log(cpi_df.observed), bond_yield_df.observed, np.log(crude_oil_df.observed), np.log(spRealEstate_df.observed), np.log(usdx_df.observed)])
# print(wine_pearson_coeff)
# print(watch_pearson_coeff)
# print(art_pearson_coeff)
# Big correlation between Gold,USDX and Wine, watch and S&P US Real Estate, SP500+CPI+RealEstate+USDX and Art
# Bond Yield has a negative correlation with all indexes, and is a bit biased, because I cannot log transform it since it has negative values
# Test the significance of the correlation coefficient with a t-test (two sample), alternative: Mann-Whitney U test (which is non-parametric)
# compute_t_test(np.log(wine_df.observed), np.log(gold_df.observed)) # Significant correlation
# compute_t_test(np.log(wine_df.observed), np.log(usdx_df.observed)) # Significant correlation
# compute_t_test(np.log(watch_df.observed), np.log(cpi_df.observed)) # significant correlation
# compute_t_test(np.log(watch_df.observed), np.log(spRealEstate_df.observed)) # significant correlation
# compute_t_test(np.log(art_df.observed), np.log(spRealEstate_df.observed)) # significant correlation
# compute_t_test(np.log(art_df.observed), np.log(usdx_df.observed)) # significant correlation
# compute_t_test(np.log(art_df.observed), np.log(sp500_df.observed)) # Significant correlation
# compute_t_test(np.log(art_df.observed), np.log(cpi_df.observed)) # Significant correlation
# NB forecasting using exogenous variables requires future exog data to be known in advance which is not the case.
# Thus I need to forecast the exogenous variables as well using ARIMA, and use that forecast as input for the main forecast of our index.
# WINE
arima_wine = (3,1,3)
wine_adjusted, exog_wine = align_data(wine_df.observed, usdx_df.observed)
# Evaluate the model
# eval_df = pd.DataFrame(columns=["ARIMA", "SEASONAL", "AIC", "BIC", "MAE", "MSE", "RMSE", "MAPE %"]) # To store the most important evaluation metrics
# eval_df = evaluate_model_with_Plots(wine_adjusted, [arima_wine], eval_df, exog_wine, seasonal=False, index='wine')
# print(eval_df)
sarima_wine = [(3,1,3),(3,0,6,12)]
# wine_adjusted, exog_wine = align_data(wine_df.observed, usdx_df.observed)
# eval_df = pd.DataFrame(columns=["ARIMA", "SEASONAL", "AIC", "BIC", "MAE", "MSE", "RMSE", "MAPE %"]) # To store the most important evaluation metrics
# eval_df = evaluate_model_with_Plots(wine_adjusted, [sarima_wine[1]], eval_df, exog_wine, seasonal=True, index='wine', arima_order=sarima_wine[0])
# print(eval_df)
wine_train = wine_adjusted[:int(0.8*len(wine_adjusted))]
wine_test = wine_adjusted[int(0.8*len(wine_adjusted)):]
wine_train_exog = exog_wine[:int(0.8*len(exog_wine))]
wine_test_exog = exog_wine[int(0.8*len(exog_wine)):]
# Save optimal model
# wine_model_exog = create_model(wine_adjusted, arima_wine, exog_wine, index='wine')
# seasonal_wine_model_exog = create_model(wine_adjusted, sarima_wine[0], exog_wine, seasonal_order=sarima_wine[1], index='wine')
# Forecast
# Now that the optimal has been found, use it to forecast
long_term = wine_train.shape[0] # Full training set can go beyond that but it would be extrapolation, so less reliable
# Long term forecasts
ref_start = wine_adjusted.index[-1] # "2022-07-31"
end_long = "2035-02-28"
# Exog needs to be forecasted to obtain the forecasted values for index, you cannot take past exog values for forecast you need current values
# future_exog = forecast_exog(wine_train_exog, wine_test_exog, long_term, method="rolling_window", window_size=5)
# forecast_model(wine_adjusted, wine_test, future_exog, long_term, "Long", end_date=end_long, model=None, seasonal=False, index='wine')
# WATCH
arima_watch = (2,1,3)
watch_adjusted, exog_watch = align_data(watch_df.observed, spRealEstate_df.observed)
# Evaluate the model
# eval_df = pd.DataFrame(columns=["ARIMA", "SEASONAL", "AIC", "BIC", "MAE", "MSE", "RMSE", "MAPE %"]) # To store the most important evaluation metrics
# eval_df = evaluate_model_with_Plots(watch_adjusted, [arima_watch], eval_df, exog_watch, seasonal=False, index='watch')
# print(eval_df)
# sarima_watch = [(2,1,3),(1,0,3,12)]
# watch_adjusted, exog_watch = align_data(watch_df.observed, spRealEstate_df.observed)
# eval_df = pd.DataFrame(columns=["ARIMA", "SEASONAL", "AIC", "BIC", "MAE", "MSE", "RMSE", "MAPE %"]) # To store the most important evaluation metrics
# eval_df = evaluate_model_with_Plots(watch_adjusted, [sarima_watch[1]], eval_df, exog_watch, seasonal=True, index='watch', arima_order=sarima_watch[0])
# print(eval_df)
watch_train = watch_adjusted[:int(0.8*len(watch_adjusted))]
watch_test = watch_adjusted[int(0.8*len(watch_adjusted)):]
watch_train_exog = exog_watch[:int(0.8*len(exog_watch))]
watch_test_exog = exog_watch[int(0.8*len(exog_watch)):]
# Save optimal model
# watch_model_exog = create_model(watch_adjusted, arima_watch, exog_watch, index='watch')
# seasonal_watch_model_exog = create_model(watch_adjusted, sarima_watch[0], exog_watch, seasonal_order=sarima_watch[1], index='watch')
# Forecast
# Now that the optimal has been found, use it to forecast
long_term = watch_train.shape[0] # Full training set can go beyond that but it would be extrapolation, so less reliable
# Long term forecasts
ref_start = watch_adjusted.index[-1] # "2023-12-01"
end_long = "2034-02-01"
# Exog needs to be forecasted to obtain the forecasted values for index, you cannot take past exog values for forecast you need current values
# future_exog = forecast_exog(watch_train_exog, watch_test_exog, long_term, method="rolling_window", window_size=5)
# forecast_model(watch_adjusted, watch_test, future_exog, long_term, "Long", end_date=end_long, model=None, seasonal=False, index='watch')
# ART
arima_art = (13,1,6)
art_adjusted, exog_art = align_data(art_df.observed, cpi_df.observed)
# Evaluate the model
# eval_df = pd.DataFrame(columns=["ARIMA", "SEASONAL", "AIC", "BIC", "MAE", "MSE", "RMSE", "MAPE %"]) # To store the most important evaluation metrics
# eval_df = evaluate_model_with_Plots(art_adjusted, [arima_art], eval_df, exog_art, seasonal=False, index='art')
# print(eval_df)
sarima_art = [(4,1,2),(5,0,6,6)]
# art_adjusted, exog_art = align_data(art_df.observed, cpi_df.observed)
# eval_df = pd.DataFrame(columns=["ARIMA", "SEASONAL", "AIC", "BIC", "MAE", "MSE", "RMSE", "MAPE %"]) # To store the most important evaluation metrics
# eval_df = evaluate_model_with_Plots(art_adjusted, [sarima_art[1]], eval_df, exog_art, seasonal=True, index='art', arima_order=sarima_art[0])
# print(eval_df)
art_train = art_adjusted[:int(0.8*len(art_adjusted))]
art_test = art_adjusted[int(0.8*len(art_adjusted)):]
art_train_exog = exog_art[:int(0.8*len(exog_art))]
art_test_exog = exog_art[int(0.8*len(exog_art)):]
# Save optimal model
# art_model_exog = create_model(art_adjusted, arima_art, exog_art, index='art')
# seasonal_art_model_exog = create_model(art_adjusted, sarima_art[0], exog_art, seasonal_order=sarima_art[1], index='art')
# Forecast
# Now that the optimal has been found, use it to forecast
long_term = art_train.shape[0] # Full training set can go beyond that but it would be extrapolation, so less reliable
# Long term forecasts
ref_start = art_adjusted.index[-1] # "2023-09-01"
end_long = "2051-02-01"
# Exog needs to be forecasted to obtain the forecasted values for index, you cannot take past exog values for forecast you need current values
# future_exog = forecast_exog(art_train_exog, art_test_exog, long_term, method="rolling_window", window_size=5)
# forecast_model(art_adjusted, art_test, future_exog, long_term, "Long", end_date=end_long, model=None, seasonal=False, index='art')