diff --git a/predtekmovanje.py b/tekmovanje.py similarity index 52% rename from predtekmovanje.py rename to tekmovanje.py index c7f4668..98b0968 100644 --- a/predtekmovanje.py +++ b/tekmovanje.py @@ -2,9 +2,12 @@ from datetime import datetime, time import numpy as np import pandas as pd -import linear +import xgboost as xgb import lpputils +route_dict = {} +direction_dict = {} + def read_data(filename, sep='\t'): """ Reads the data from the given file. @@ -39,7 +42,7 @@ def add_duration(data): data['Duration'] = data.apply(lambda x: lpputils.tsdiff(x['Arrival time'], x['Departure time']), axis=1) return data -def add_departure_info(data): +def add_structured_time(data): """ Adds structured departure time to the data. """ @@ -49,21 +52,56 @@ def add_departure_info(data): data['DP month'] = pd.to_datetime(data['Departure time']).dt.month return data +def get_direction_from_row(row): + direction = row['Route description'] + route = row['Route'] + if route in direction_dict: + if direction == direction_dict[route]: + return 0 + else: + return 1 + + direction_dict[route] = direction + return 0 + +def add_direction(data): + """ + Adds information about in which direction the bus is driving to the data. + """ + data['Direction'] = data.apply(get_direction_from_row, axis=1) + return data + +def split_by_route(data): + """ + Creates seperate dataset for each route. + """ + datasets = {} + for route in get_routes(data): + if route not in datasets: + datasets[route] = [] + datasets[route].append(data.copy()[(data['Route'] == route) & (data['Direction'] == 0)]) + datasets[route].append(data.copy()[(data['Route'] == route) & (data['Direction'] == 1)]) + return datasets + +def get_routes(data): + return list(data['Route'].unique()) + def pre_process_data(data, train=True): """ Pre-processes the data. """ - data = add_departure_info(data) data = add_day_of_week(data) data = add_holiday_info(data) + # data = add_route(data) + data = add_direction(data) + data = add_structured_time(data) # not really needed since they are the same everywhere data = data.drop('Route description', axis=1) data = data.drop('Route Direction', axis=1) data = data.drop('First station' , axis=1) data = data.drop('Last station', axis=1) - data = data.drop('Route', axis=1) data = data.drop('Registration', axis=1) data = data.drop('Driver ID', axis=1) @@ -75,28 +113,44 @@ def pre_process_data(data, train=True): departures = data['Departure time'] data = data.drop('Departure time', axis=1) - # print(data) + datasets = None + if train: + datasets = split_by_route(data) + for set in datasets.values(): + set[0].drop(['Route', 'Direction'], axis=1, inplace=True) + set[1].drop(['Route', 'Direction'], axis=1, inplace=True) + data = data.drop('Route', axis=1) - return data, departures + return data, departures, datasets -def train_lr(data, lamb=1.0, label='Duration'): +def train_model(data, label='Duration'): """ - Trains the linear regression model. + Trains the model. """ - X = data.drop(label, axis=1).to_numpy() - y = data[label].to_numpy() + models = {} + for route, dataset in data.items(): + X0 = dataset[0].drop(label, axis=1).to_numpy() + y0 = dataset[0][label].to_numpy() + + X1 = dataset[1].drop(label, axis=1).to_numpy() + y1 = dataset[1][label].to_numpy() - lr = linear.LinearLearner(lambda_=lamb) - return lr(X,y) + model0 = xgb.XGBRegressor(eval_metric='mae', verbosity=0, n_threads=4) + model1 = xgb.XGBRegressor(eval_metric='mae', verbosity=0, n_threads=4) + model0.fit(X0, y0) + model1.fit(X1, y1) + models[route] = {0: model0, 1: model1} + return models -def predict_lr(model, data): +def predict(models, data: pd.DataFrame): """ Predicts the arrival time for the given data. Data should be pre-processed. """ - rows = data.to_numpy() results = [] - for row in rows: - results.append(model(row)) + for _, row in data.iterrows(): + model = models[row['Route']][row['Direction']] + pred_data = [row.drop(['Route', 'Direction']).to_numpy()] + results.append(model.predict(pred_data)) data['Duration'] = results return data @@ -106,7 +160,7 @@ def post_process(data, departures): Post-processes the data. """ data['Departure time'] = departures - data['Arrival time'] = data.apply(lambda x: lpputils.tsadd(x['Departure time'], x['Duration']), axis=1) + data['Arrival time'] = data.apply(lambda x: lpputils.tsadd(x['Departure time'], int(x['Duration'][0])), axis=1) return data def create_output(data, departures, filename='out.txt'): @@ -117,11 +171,11 @@ def create_output(data, departures, filename='out.txt'): data['Arrival time'].to_csv(filename, sep='\n', index=False, header=False) if __name__ == '__main__': - train_data = read_data('train_pred.csv') - test_data = read_data('test_pred.csv') - train_data, departures_train = pre_process_data(train_data) - test_data, departures_test = pre_process_data(test_data, train=False) - model = train_lr(train_data) - pred = predict_lr(model, test_data) + train_data = read_data('train.csv') + test_data = read_data('test.csv') + train_data, departures_train, train_datasets = pre_process_data(train_data) + test_data, departures_test, _ = pre_process_data(test_data, train=False) + models = train_model(train_datasets) + pred = predict(models, test_data) create_output(pred, departures_test) \ No newline at end of file