init commit

dPreininger · May 3, 2022 · 7768530 · 7768530
commit 7768530
Show file tree

Hide file tree

Showing 6 changed files with 488,783 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+.vscode/
+out.txt
+__pycache__/
diff --git a/lpputils.py b/lpputils.py
@@ -0,0 +1,26 @@
+import datetime
+
+# strptime/strftime pattern shared by every helper in this module:
+# "YYYY-MM-DD HH:MM:SS.ffffff" (a fractional-seconds part is expected).
+FORMAT = "%Y-%m-%d %H:%M:%S.%f"
+
+def parsedate(x):
+    """Return ``x`` as a ``datetime.datetime``.
+
+    A value that already is a ``datetime.datetime`` is handed back unchanged;
+    anything else is parsed with ``strptime`` using ``FORMAT``.
+    """
+    if isinstance(x, datetime.datetime):
+        return x
+    return datetime.datetime.strptime(x, FORMAT)
+
+def tsdiff(x, y):
+    """Return ``x - y`` in seconds as a float.
+
+    Each argument may be a ``datetime.datetime`` or a string in ``FORMAT``.
+    """
+    left = parsedate(x)
+    right = parsedate(y)
+    delta = left - right
+    return delta.total_seconds()
+
+def tsadd(x, seconds):
+    """Shift timestamp ``x`` by ``seconds`` and return it as a ``FORMAT`` string.
+
+    ``x`` may be a ``datetime.datetime`` or a string in ``FORMAT``; a negative
+    ``seconds`` moves the timestamp backwards.
+    """
+    offset = datetime.timedelta(seconds=seconds)
+    return (parsedate(x) + offset).strftime(FORMAT)
+
+if __name__ == "__main__":
+    # Ad-hoc smoke run of the helpers above; nothing is printed or asserted.
+    testd1 = "2012-01-01 23:32:38.000"
+    testd2 = "2012-12-01 03:33:38.000"
+
+    # testd1 becomes a datetime while testd2 stays a string, so both input
+    # kinds accepted by parsedate() are exercised.
+    testd1 = datetime.datetime.strptime(testd1, FORMAT)
+
+    # Repeat the calls many times; the results (a, b) are discarded.
+    for i in range(23000):
+        a = tsdiff(testd1, testd2)
+        b = tsadd(testd1, -122)
diff --git a/prazniki.csv b/prazniki.csv
@@ -0,0 +1,20 @@
+DATUM;IME_PRAZNIKA;DAN_V_TEDNU;DELA_PROST_DAN;DAN;MESEC;LETO
+1.01.2012;novo leto;nedelja;da;1;1;2012
+2.01.2012;novo leto;ponedeljek;da;2;1;2012
+8.02.2012;Prešernov dan, slovenski kulturni praznik;sreda;da;8;2;2012
+8.04.2012;velika noč;nedelja;da;8;4;2012
+9.04.2012;velikonočni ponedeljek;ponedeljek;da;9;4;2012
+27.04.2012;dan boja proti okupatorju;petek;da;27;4;2012
+1.05.2012;praznik dela;torek;da;1;5;2012
+2.05.2012;praznik dela;sreda;da;2;5;2012
+27.05.2012;binkoštna nedelja;nedelja;da;27;5;2012
+8.06.2012;dan Primoža Trubarja;petek;ne;8;6;2012
+25.06.2012;dan državnosti;ponedeljek;da;25;6;2012
+15.08.2012;Marijino vnebovzetje;sreda;da;15;8;2012
+17.08.2012;združitev prekmurskih Slovencev z matičnim narodom;petek;ne;17;8;2012
+15.09.2012;vrnitev Primorske k matični domovini;sobota;ne;15;9;2012
+31.10.2012;dan reformacije;sreda;da;31;10;2012
+1.11.2012;dan spomina na mrtve;četrtek;da;1;11;2012
+23.11.2012;dan Rudolfa Maistra;petek;ne;23;11;2012
+25.12.2012;božič;torek;da;25;12;2012
+26.12.2012;dan samostojnosti in enotnosti;sreda;da;26;12;2012
diff --git a/predtekmovanje.py b/predtekmovanje.py
@@ -0,0 +1,127 @@
+from calendar import weekday
+from datetime import datetime, time
+import numpy as np
+import pandas as pd
+import linear
+import lpputils
+
+def read_data(filename, sep='\t'):
+    """
+    Loads a delimited text file into a pandas DataFrame.
+
+    filename: path of the file to read.
+    sep: field separator; a tab unless overridden.
+    """
+    frame = pd.read_csv(filename, sep=sep)
+    return frame
+
+def add_day_of_week(data):
+    """
+    Appends seven one-hot weekday columns ('Monday' .. 'Sunday') to the data.
+
+    The weekday is derived from the 'Departure time' column. In every row
+    exactly one of the seven columns is 1.0 and the others are 0.0.
+    Returns a new DataFrame; the input frame is not modified.
+    """
+    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
+    # Monday == 0 ... Sunday == 6, matching the order of day_names.
+    weekdays = pd.to_datetime(data['Departure time']).dt.weekday.to_numpy()
+    days = np.zeros((len(data), len(day_names)))
+    days[np.arange(len(data)), weekdays] = 1
+    # Reuse the caller's index: pd.concat(axis=1) aligns on index labels, so a
+    # fresh RangeIndex would misalign rows (and add NaN rows) whenever
+    # ``data`` does not carry the default 0..n-1 index.
+    days = pd.DataFrame(days, columns=day_names, index=data.index)
+    return pd.concat([data, days], axis=1)
+
+def add_holiday_info(data):
+    """
+    Adds a 'Holiday' column: 1 if the departure date is listed in
+    prazniki.csv, 0 otherwise.
+
+    Every date in the file's 'DATUM' column counts; the other columns of the
+    file are not consulted. The column is added to ``data`` in place and
+    ``data`` is returned.
+    """
+    holidays = read_data('prazniki.csv', ';')
+    # A set gives constant-time membership tests instead of a list scan per row.
+    dates = set(holidays['DATUM'])
+
+    def is_holiday(timestamp):
+        departure = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S.%f')
+        # DATUM is written as D.MM.YYYY (day without zero padding). Build the
+        # key by hand: the '%-d' strftime flag is a platform extension and is
+        # not available everywhere (e.g. on Windows).
+        key = f'{departure.day}.{departure.month:02d}.{departure.year}'
+        return 1 if key in dates else 0
+
+    data['Holiday'] = data['Departure time'].apply(is_holiday)
+    return data
+
+def add_duration(data):
+    """
+    Computes the 'Duration' column: arrival minus departure, in seconds.
+
+    The column is written into ``data`` in place and ``data`` is returned.
+    """
+    def trip_seconds(row):
+        return lpputils.tsdiff(row['Arrival time'], row['Departure time'])
+
+    data['Duration'] = data.apply(trip_seconds, axis=1)
+    return data
+
+def add_departure_info(data):
+    """
+    Adds structured departure time to the data.
+
+    Creates the columns 'DP hour', 'DP min', 'DP day' and 'DP month' from the
+    'Departure time' column. The columns are added to ``data`` in place and
+    ``data`` is returned.
+    """
+    # Parse the timestamps once instead of once per derived column.
+    departure = pd.to_datetime(data['Departure time']).dt
+    data['DP hour'] = departure.hour
+    data['DP min'] = departure.minute
+    data['DP day'] = departure.day
+    data['DP month'] = departure.month
+    return data
+
+def pre_process_data(data, train=True):
+    """
+    Pre-processes the data.
+
+    Adds the departure-time, weekday and holiday feature columns, adds the
+    'Duration' column when ``train`` is true, and drops the columns that are
+    not used as features.
+
+    Returns a tuple ``(data, departures)`` where ``departures`` is the
+    original 'Departure time' column, which is removed from ``data``.
+    """
+    data = add_departure_info(data)
+    data = add_day_of_week(data)
+    data = add_holiday_info(data)
+
+    if train:
+        # Needs 'Arrival time', so it has to run before that column is dropped.
+        # Use the returned frame like the other add_* steps instead of relying
+        # on add_duration mutating its argument.
+        data = add_duration(data)
+
+    departures = data['Departure time']
+
+    dropped = [
+        # not really needed since they are the same everywhere
+        'Route description',
+        'Route Direction',
+        'First station',
+        'Last station',
+        'Route',
+        'Registration',
+        'Driver ID',
+        # raw timestamps; 'Departure time' is returned separately
+        'Arrival time',
+        'Departure time',
+    ]
+    # One drop call instead of nine avoids building an intermediate frame each time.
+    data = data.drop(dropped, axis=1)
+
+    return data, departures
+
+def train_lr(data, lamb=1.0, label='Duration'):
+    """
+    Fits a linear.LinearLearner and returns whatever the learner returns.
+
+    The ``label`` column is the target; all remaining columns are the
+    features. ``lamb`` is forwarded to the learner as ``lambda_``.
+    """
+    features = data.drop(label, axis=1).to_numpy()
+    target = data[label].to_numpy()
+
+    learner = linear.LinearLearner(lambda_=lamb)
+    model = learner(features, target)
+    return model
+
+def predict_lr(model, data):
+    """
+    Predicts the trip duration for the given data. Data should be pre-processed.
+
+    ``model`` is called once per row with that row's values as a numpy array.
+    Returns a copy of ``data`` with the predictions in a 'Duration' column.
+    """
+    predictions = [model(row) for row in data.to_numpy()]
+
+    # Write into a copy: adding 'Duration' to the caller's frame would turn the
+    # prediction into an extra feature if the same frame were predicted again.
+    result = data.copy()
+    result['Duration'] = predictions
+    return result
+
+def post_process(data, departures):
+    """
+    Restores 'Departure time' and derives 'Arrival time' from it.
+
+    'Arrival time' is the departure shifted by the row's 'Duration'
+    (via lpputils.tsadd). Both columns are written into ``data`` in place
+    and ``data`` is returned.
+    """
+    def arrival(row):
+        return lpputils.tsadd(row['Departure time'], row['Duration'])
+
+    data['Departure time'] = departures
+    data['Arrival time'] = data.apply(arrival, axis=1)
+    return data
+
+def create_output(data, departures, filename='out.txt'):
+    """
+    Creates the output file.
+
+    Writes the predicted 'Arrival time' values to ``filename``, one per line,
+    without index or header.
+    """
+    data = post_process(data, departures)
+    # Only one column is written, so the field separator is never emitted and
+    # the default is fine. A newline separator clashes with the line
+    # terminator and may be rejected by the underlying csv writer.
+    data['Arrival time'].to_csv(filename, index=False, header=False)
+
+if __name__ == '__main__':
+    # End-to-end run: read both files, build features, fit on the training
+    # set, predict on the test set and write the arrival times to out.txt.
+    train_data = read_data('train_pred.csv')
+    test_data = read_data('test_pred.csv')
+    train_data, departures_train = pre_process_data(train_data)
+    # train=False: no 'Duration' column is computed for the test set.
+    test_data, departures_test = pre_process_data(test_data, train=False)
+    model = train_lr(train_data)
+    pred = predict_lr(model, test_data)
+    create_output(pred, departures_test)
+