diff --git a/day5.ipynb b/day5.ipynb new file mode 100644 index 0000000..e6b7c0a --- /dev/null +++ b/day5.ipynb @@ -0,0 +1,463 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "day5.ipynb", + "provenance": [], + "collapsed_sections": [], + "mount_file_id": "1EnuY-zir8UL_kahtEvy7MDA-rce_5O5Y", + "authorship_tag": "ABX9TyMbvLQBXG5h9iD7qR5h2mRy", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "mMqiCuBpcidR", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 666 + }, + "outputId": "8f439a48-47cd-47ff-b7ae-b7d3705125e3" + }, + "source": [ + "!pip install --upgrade tables\n", + "!pip install eli5\n", + "!pip install xgboost\n", + "!pip install hyperopt" + ], + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Collecting tables\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ed/c3/8fd9e3bb21872f9d69eb93b3014c86479864cca94e625fd03713ccacec80/tables-3.6.1-cp36-cp36m-manylinux1_x86_64.whl (4.3MB)\n", + "\u001b[K |████████████████████████████████| 4.3MB 5.0MB/s \n", + "\u001b[?25hRequirement already satisfied, skipping upgrade: numexpr>=2.6.2 in /usr/local/lib/python3.6/dist-packages (from tables) (2.7.1)\n", + "Requirement already satisfied, skipping upgrade: numpy>=1.9.3 in /usr/local/lib/python3.6/dist-packages (from tables) (1.17.5)\n", + "Installing collected packages: tables\n", + " Found existing installation: tables 3.4.4\n", + " Uninstalling tables-3.4.4:\n", + " Successfully uninstalled tables-3.4.4\n", + "Successfully installed tables-3.6.1\n", + "Collecting eli5\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)\n", + "\u001b[K |████████████████████████████████| 112kB 4.8MB/s \n", + "\u001b[?25hRequirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from eli5) (1.4.1)\n", + "Requirement already satisfied: numpy>=1.9.0 in /usr/local/lib/python3.6/dist-packages (from eli5) (1.17.5)\n", + "Requirement already satisfied: attrs>16.0.0 in /usr/local/lib/python3.6/dist-packages (from eli5) (19.3.0)\n", + "Requirement already satisfied: graphviz in /usr/local/lib/python3.6/dist-packages (from eli5) (0.10.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from eli5) (1.12.0)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.6/dist-packages (from eli5) (2.11.1)\n", + "Requirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.6/dist-packages (from eli5) (0.22.1)\n", + "Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.6/dist-packages (from eli5) (0.8.6)\n", + "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2->eli5) (1.1.1)\n", + "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.18->eli5) (0.14.1)\n", + "Installing collected packages: eli5\n", + "Successfully installed eli5-0.10.1\n", + "Requirement already satisfied: xgboost in /usr/local/lib/python3.6/dist-packages (0.90)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from xgboost) (1.17.5)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from xgboost) (1.4.1)\n", + "Requirement already satisfied: hyperopt in /usr/local/lib/python3.6/dist-packages (0.1.2)\n", + "Requirement already satisfied: pymongo in /usr/local/lib/python3.6/dist-packages (from hyperopt) (3.10.1)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.6/dist-packages (from hyperopt) (2.4)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from hyperopt) (1.4.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from hyperopt) (1.17.5)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from hyperopt) (1.12.0)\n", + "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from hyperopt) (0.16.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from hyperopt) (4.28.1)\n", + "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from networkx->hyperopt) (4.4.1)\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "9y0_eFZOd-ow", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 185 + }, + "outputId": "0aae5db8-3bd9-48d4-ea35-3a169981220b" + }, + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import xgboost as xgb\n", + "\n", + "from sklearn.metrics import mean_absolute_error as mae \n", + "from sklearn.model_selection import cross_val_score, KFold\n", + "\n", + "from hyperopt import hp, fmin, tpe, STATUS_OK\n", + "\n", + "import eli5\n", + "from eli5.sklearn import PermutationImportance" + ], + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.metrics.scorer module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.\n", + " warnings.warn(message, FutureWarning)\n", + "/usr/local/lib/python3.6/dist-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.feature_selection.base module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.feature_selection. Anything that cannot be imported from sklearn.feature_selection is now part of the private API.\n", + " warnings.warn(message, FutureWarning)\n", + "Using TensorFlow backend.\n" + ], + "name": "stderr" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "

\n", + "The default version of TensorFlow in Colab will soon switch to TensorFlow 2.x.
\n", + "We recommend you upgrade now \n", + "or ensure your notebook will continue to use TensorFlow 1.x via the %tensorflow_version 1.x magic:\n", + "more info.

\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "K7cWHmW2egS2", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 54 + }, + "outputId": "13192911-b7a3-465d-ee3f-d36f4132c4c1" + }, + "source": [ + "cd \"/content/drive/My Drive/Colab Notebooks/dataworkshop_matrix/matrix2/dataworkshop_matrix2\"" + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "text": [ + "/content/drive/My Drive/Colab Notebooks/dataworkshop_matrix/matrix2/dataworkshop_matrix2\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "gTqZh-K1eqbU", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "f2d6995a-dfef-47c5-e3bd-05d679612d11" + }, + "source": [ + "df = pd.read_hdf('data/car.h5')\n", + "df.shape" + ], + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(106494, 155)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 4 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SOfx2xwjeyf7", + "colab_type": "text" + }, + "source": [ + "## Feature Engineering" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bsA2-HyoezSv", + "colab_type": "code", + "colab": {} + }, + "source": [ + "SUFFIX_CAT = '__cat'\n", + "for feat in df.columns:\n", + " if isinstance(df[feat][0], list): continue\n", + "\n", + " factorized_values = df[feat].factorize()[0]\n", + " if SUFFIX_CAT in feat:\n", + " df[feat] = factorized_values\n", + " else:\n", + " df[feat + SUFFIX_CAT] = factorized_values" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "sMiJxC0VfNbv", + "colab_type": "code", + "colab": {} + }, + "source": [ + "df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(lambda x: -1 if str(x) == 'None' else int(str(x).split('cm')[0].replace(' ', '')))\n", + "\n", + "df['param_moc'] = df['param_moc'].map(lambda x: -1 if str(x) == 'None' else int(str(x).split(' ')[0]))\n", + "\n", + "df['param_rok-produkcji'] = df['param_rok-produkcji'].map(lambda x: -1 if str(x) == 'None' else int(x))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "UnLn2X3Zfayk", + "colab_type": "code", + "colab": {} + }, + "source": [ + "def run_model(model, feats): \n", + " X = df[feats].values\n", + " y = df['price_value'].values\n", + " \n", + " scores = cross_val_score(model, X, y, cv=3, scoring = 'neg_mean_absolute_error')\n", + " return np.mean(scores), np.std(scores)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "6jPIshMjfmek", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 105 + }, + "outputId": "10acbf9a-ec0f-4b70-bdfd-0217b8d5a39c" + }, + "source": [ + "feats = ['param_napęd__cat',\n", + "'param_rok-produkcji',\n", + "'param_stan__cat',\n", + "'param_skrzynia-biegów__cat',\n", + "'param_faktura-vat__cat',\n", + "'param_moc',\n", + "'param_marka-pojazdu__cat',\n", + "'feature_kamera-cofania__cat',\n", + "'param_typ__cat',\n", + "'param_pojemność-skokowa',\n", + "'seller_name__cat',\n", + "'feature_wspomaganie-kierownicy__cat',\n", + "'param_model-pojazdu__cat',\n", + "'param_wersja__cat',\n", + "'param_kod-silnika__cat',\n", + "'feature_system-start-stop__cat',\n", + "'feature_asystent-pasa-ruchu__cat',\n", + "'feature_czujniki-parkowania-przednie__cat',\n", + "'feature_łopatki-zmiany-biegów__cat',\n", + "'feature_regulowane-zawieszenie__cat']\n", + "\n", + "xgb_params = {\n", + " 'max_depth':5,\n", + " 'n_estimators':50,\n", + " 'learning_rate':0.1,\n", + " 'seed':0\n", + "}\n", + "model = xgb.XGBRegressor(**xgb_params)\n", + "run_model(model, feats)" + ], + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[20:18:10] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n", + "[20:18:14] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n", + "[20:18:18] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(-9569.227198767323, 72.83561801421891)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 8 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UszcalxAhUGz", + "colab_type": "text" + }, + "source": [ + "##Hyperopt" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "3zgC3suCf8qy", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 972 + }, + "outputId": "0535e498-1556-4bf0-ac57-f0b748887b62" + }, + "source": [ + "def obj_func(params):\n", + " print(\"Training with params: \")\n", + " print(params)\n", + "\n", + " mean_mae, score_std = run_model(xgb.XGBRegressor(**params), feats)\n", + "\n", + " return {'loss': np.abs(mean_mae), 'status': STATUS_OK}\n", + "\n", + "#space\n", + "xgb_reg_params = {\n", + " 'learning_rate': hp.choice('learning_rate', np.arange(0.05, 0.31, 0.05)),\n", + " 'max_depth': hp.choice('max_depth', np.arange(5, 16, 1, dtype=int)),\n", + " 'subsample': hp.quniform('subsample', 0.5, 1, 0.05),\n", + " 'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),\n", + " 'objective': 'reg:squarederror',\n", + " 'n_estimators': 100,\n", + " 'seed': 0,\n", + "}\n", + "\n", + "##run\n", + "best = fmin(obj_func, xgb_reg_params, algo=tpe.suggest, max_evals=25)\n", + "\n", + "best" + ], + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Training with params: \n", + "{'colsample_bytree': 0.75, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.65}\n", + "Training with params: \n", + "{'colsample_bytree': 0.55, 'learning_rate': 0.1, 'max_depth': 14, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}\n", + "Training with params: \n", + "{'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.15000000000000002, 'max_depth': 9, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8500000000000001}\n", + "Training with params: \n", + "{'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.2, 'max_depth': 15, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.75}\n", + "Training with params: \n", + "{'colsample_bytree': 0.75, 'learning_rate': 0.25, 'max_depth': 6, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}\n", + "Training with params: \n", + "{'colsample_bytree': 0.9, 'learning_rate': 0.2, 'max_depth': 15, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8}\n", + "Training with params: \n", + "{'colsample_bytree': 0.9, 'learning_rate': 0.2, 'max_depth': 15, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.7000000000000001}\n", + "Training with params: \n", + "{'colsample_bytree': 0.65, 'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9}\n", + "Training with params: \n", + "{'colsample_bytree': 0.9500000000000001, 'learning_rate': 0.15000000000000002, 'max_depth': 6, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.65}\n", + "Training with params: \n", + "{'colsample_bytree': 0.9500000000000001, 'learning_rate': 0.2, 'max_depth': 11, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8500000000000001}\n", + "Training with params: \n", + "{'colsample_bytree': 0.65, 'learning_rate': 0.05, 'max_depth': 14, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}\n", + "Training with params: \n", + "{'colsample_bytree': 0.6000000000000001, 'learning_rate': 0.25, 'max_depth': 6, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}\n", + "Training with params: \n", + "{'colsample_bytree': 0.65, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.6000000000000001}\n", + "Training with params: \n", + "{'colsample_bytree': 0.55, 'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.55}\n", + "Training with params: \n", + "{'colsample_bytree': 0.75, 'learning_rate': 0.25, 'max_depth': 8, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}\n", + "Training with params: \n", + "{'colsample_bytree': 0.55, 'learning_rate': 0.25, 'max_depth': 12, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.55}\n", + "Training with params: \n", + "{'colsample_bytree': 0.5, 'learning_rate': 0.15000000000000002, 'max_depth': 15, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.65}\n", + "Training with params: \n", + "{'colsample_bytree': 0.75, 'learning_rate': 0.05, 'max_depth': 15, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}\n", + "Training with params: \n", + "{'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.15000000000000002, 'max_depth': 15, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.75}\n", + "Training with params: \n", + "{'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 1.0}\n", + "Training with params: \n", + "{'colsample_bytree': 1.0, 'learning_rate': 0.3, 'max_depth': 11, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8500000000000001}\n", + "Training with params: \n", + "{'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 13, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8500000000000001}\n", + "Training with params: \n", + "{'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8}\n", + "Training with params: \n", + "{'colsample_bytree': 0.9500000000000001, 'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9}\n", + "Training with params: \n", + "{'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 13, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 1.0}\n", + "100%|██████████| 25/25 [25:19<00:00, 66.04s/it, best loss: 7522.162667868127]\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'colsample_bytree': 0.75,\n", + " 'learning_rate': 0,\n", + " 'max_depth': 10,\n", + " 'subsample': 0.9500000000000001}" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 10 + } + ] + } + ] +} \ No newline at end of file