From 22d88c345d008081aeccc33babfe0e3c5bed288b Mon Sep 17 00:00:00 2001
From: Paulina <55359571+paulinakaszuba94@users.noreply.github.com>
Date: Fri, 6 Mar 2020 22:11:21 +0100
Subject: [PATCH] add hyperopt
---
day5.ipynb | 463 +++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 463 insertions(+)
create mode 100644 day5.ipynb
diff --git a/day5.ipynb b/day5.ipynb
new file mode 100644
index 0000000..e6b7c0a
--- /dev/null
+++ b/day5.ipynb
@@ -0,0 +1,463 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "day5.ipynb",
+ "provenance": [],
+ "collapsed_sections": [],
+ "mount_file_id": "1EnuY-zir8UL_kahtEvy7MDA-rce_5O5Y",
+ "authorship_tag": "ABX9TyMbvLQBXG5h9iD7qR5h2mRy",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "mMqiCuBpcidR",
+ "colab_type": "code",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 666
+ },
+ "outputId": "8f439a48-47cd-47ff-b7ae-b7d3705125e3"
+ },
+ "source": [
+ "!pip install --upgrade tables\n",
+ "!pip install eli5\n",
+ "!pip install xgboost\n",
+ "!pip install hyperopt"
+ ],
+ "execution_count": 1,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "Collecting tables\n",
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ed/c3/8fd9e3bb21872f9d69eb93b3014c86479864cca94e625fd03713ccacec80/tables-3.6.1-cp36-cp36m-manylinux1_x86_64.whl (4.3MB)\n",
+ "\u001b[K |████████████████████████████████| 4.3MB 5.0MB/s \n",
+ "\u001b[?25hRequirement already satisfied, skipping upgrade: numexpr>=2.6.2 in /usr/local/lib/python3.6/dist-packages (from tables) (2.7.1)\n",
+ "Requirement already satisfied, skipping upgrade: numpy>=1.9.3 in /usr/local/lib/python3.6/dist-packages (from tables) (1.17.5)\n",
+ "Installing collected packages: tables\n",
+ " Found existing installation: tables 3.4.4\n",
+ " Uninstalling tables-3.4.4:\n",
+ " Successfully uninstalled tables-3.4.4\n",
+ "Successfully installed tables-3.6.1\n",
+ "Collecting eli5\n",
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)\n",
+ "\u001b[K |████████████████████████████████| 112kB 4.8MB/s \n",
+ "\u001b[?25hRequirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from eli5) (1.4.1)\n",
+ "Requirement already satisfied: numpy>=1.9.0 in /usr/local/lib/python3.6/dist-packages (from eli5) (1.17.5)\n",
+ "Requirement already satisfied: attrs>16.0.0 in /usr/local/lib/python3.6/dist-packages (from eli5) (19.3.0)\n",
+ "Requirement already satisfied: graphviz in /usr/local/lib/python3.6/dist-packages (from eli5) (0.10.1)\n",
+ "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from eli5) (1.12.0)\n",
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.6/dist-packages (from eli5) (2.11.1)\n",
+ "Requirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.6/dist-packages (from eli5) (0.22.1)\n",
+ "Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.6/dist-packages (from eli5) (0.8.6)\n",
+ "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2->eli5) (1.1.1)\n",
+ "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.18->eli5) (0.14.1)\n",
+ "Installing collected packages: eli5\n",
+ "Successfully installed eli5-0.10.1\n",
+ "Requirement already satisfied: xgboost in /usr/local/lib/python3.6/dist-packages (0.90)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from xgboost) (1.17.5)\n",
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from xgboost) (1.4.1)\n",
+ "Requirement already satisfied: hyperopt in /usr/local/lib/python3.6/dist-packages (0.1.2)\n",
+ "Requirement already satisfied: pymongo in /usr/local/lib/python3.6/dist-packages (from hyperopt) (3.10.1)\n",
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.6/dist-packages (from hyperopt) (2.4)\n",
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from hyperopt) (1.4.1)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from hyperopt) (1.17.5)\n",
+ "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from hyperopt) (1.12.0)\n",
+ "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from hyperopt) (0.16.0)\n",
+ "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from hyperopt) (4.28.1)\n",
+ "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from networkx->hyperopt) (4.4.1)\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "9y0_eFZOd-ow",
+ "colab_type": "code",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 185
+ },
+ "outputId": "0aae5db8-3bd9-48d4-ea35-3a169981220b"
+ },
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "import xgboost as xgb\n",
+ "\n",
+ "from sklearn.metrics import mean_absolute_error as mae \n",
+ "from sklearn.model_selection import cross_val_score, KFold\n",
+ "\n",
+ "from hyperopt import hp, fmin, tpe, space_eval, STATUS_OK\n",
+ "\n",
+ "import eli5\n",
+ "from eli5.sklearn import PermutationImportance"
+ ],
+ "execution_count": 2,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "/usr/local/lib/python3.6/dist-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.metrics.scorer module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "/usr/local/lib/python3.6/dist-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.feature_selection.base module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.feature_selection. Anything that cannot be imported from sklearn.feature_selection is now part of the private API.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "Using TensorFlow backend.\n"
+ ],
+ "name": "stderr"
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "
\n",
+ "The default version of TensorFlow in Colab will soon switch to TensorFlow 2.x.
\n",
+ "We recommend you upgrade now \n",
+ "or ensure your notebook will continue to use TensorFlow 1.x via the %tensorflow_version 1.x
magic:\n",
+ "more info.
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "tags": []
+ }
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "K7cWHmW2egS2",
+ "colab_type": "code",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 54
+ },
+ "outputId": "13192911-b7a3-465d-ee3f-d36f4132c4c1"
+ },
+ "source": [
+ "%cd \"/content/drive/My Drive/Colab Notebooks/dataworkshop_matrix/matrix2/dataworkshop_matrix2\""
+ ],
+ "execution_count": 3,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "/content/drive/My Drive/Colab Notebooks/dataworkshop_matrix/matrix2/dataworkshop_matrix2\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "gTqZh-K1eqbU",
+ "colab_type": "code",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 34
+ },
+ "outputId": "f2d6995a-dfef-47c5-e3bd-05d679612d11"
+ },
+ "source": [
+ "df = pd.read_hdf('data/car.h5')\n",
+ "df.shape"
+ ],
+ "execution_count": 4,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(106494, 155)"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 4
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "SOfx2xwjeyf7",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Feature Engineering"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "bsA2-HyoezSv",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "SUFFIX_CAT = '__cat'\n",
+ "\n",
+ "# Label-encode every column with pandas' factorize. Columns whose name already\n",
+ "# contains the suffix are overwritten in place; all others get a new\n",
+ "# '<name>__cat' column next to the original.\n",
+ "for feat in df.columns:\n",
+ "    column = df[feat]\n",
+ "    if isinstance(column[0], list):\n",
+ "        continue  # list-valued columns are skipped (lists are unhashable)\n",
+ "\n",
+ "    target_name = feat if SUFFIX_CAT in feat else feat + SUFFIX_CAT\n",
+ "    df[target_name] = column.factorize()[0]"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "sMiJxC0VfNbv",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def _int_or_missing(parse):\n",
+ "    \"\"\"Wrap `parse` so that values whose str() is 'None' become -1 instead.\"\"\"\n",
+ "    def convert(x):\n",
+ "        if str(x) == 'None':\n",
+ "            return -1\n",
+ "        return parse(x)\n",
+ "    return convert\n",
+ "\n",
+ "# Keep the text before 'cm', drop spaces, convert to int.\n",
+ "df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(\n",
+ "    _int_or_missing(lambda x: int(str(x).split('cm')[0].replace(' ', ''))))\n",
+ "\n",
+ "# Keep the first space-separated token, convert to int.\n",
+ "df['param_moc'] = df['param_moc'].map(\n",
+ "    _int_or_missing(lambda x: int(str(x).split(' ')[0])))\n",
+ "\n",
+ "# Plain int conversion.\n",
+ "df['param_rok-produkcji'] = df['param_rok-produkcji'].map(_int_or_missing(int))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "UnLn2X3Zfayk",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def run_model(model, feats):\n",
+ "    \"\"\"Cross-validate `model` on the `feats` columns of the global `df`.\n",
+ "\n",
+ "    The target is the `price_value` column. Scoring uses 3-fold\n",
+ "    cross-validation with 'neg_mean_absolute_error', so scores are negated MAE.\n",
+ "\n",
+ "    Returns a (mean, std) tuple of the per-fold scores.\n",
+ "    \"\"\"\n",
+ "    features = df[feats].values\n",
+ "    target = df['price_value'].values\n",
+ "\n",
+ "    fold_scores = cross_val_score(\n",
+ "        model, features, target, scoring='neg_mean_absolute_error', cv=3)\n",
+ "    return fold_scores.mean(), fold_scores.std()"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "6jPIshMjfmek",
+ "colab_type": "code",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 105
+ },
+ "outputId": "10acbf9a-ec0f-4b70-bdfd-0217b8d5a39c"
+ },
+ "source": [
+ "feats = ['param_napęd__cat',\n",
+ "'param_rok-produkcji',\n",
+ "'param_stan__cat',\n",
+ "'param_skrzynia-biegów__cat',\n",
+ "'param_faktura-vat__cat',\n",
+ "'param_moc',\n",
+ "'param_marka-pojazdu__cat',\n",
+ "'feature_kamera-cofania__cat',\n",
+ "'param_typ__cat',\n",
+ "'param_pojemność-skokowa',\n",
+ "'seller_name__cat',\n",
+ "'feature_wspomaganie-kierownicy__cat',\n",
+ "'param_model-pojazdu__cat',\n",
+ "'param_wersja__cat',\n",
+ "'param_kod-silnika__cat',\n",
+ "'feature_system-start-stop__cat',\n",
+ "'feature_asystent-pasa-ruchu__cat',\n",
+ "'feature_czujniki-parkowania-przednie__cat',\n",
+ "'feature_łopatki-zmiany-biegów__cat',\n",
+ "'feature_regulowane-zawieszenie__cat']\n",
+ "\n",
+ "# Baseline XGBoost configuration, evaluated with run_model below.\n",
+ "xgb_params = {\n",
+ "    'max_depth': 5,\n",
+ "    'n_estimators': 50,\n",
+ "    'learning_rate': 0.1,\n",
+ "    # Set explicitly: the implicit default here was 'reg:linear', which xgboost\n",
+ "    # reports as deprecated in favor of 'reg:squarederror'.\n",
+ "    'objective': 'reg:squarederror',\n",
+ "    'seed': 0\n",
+ "}\n",
+ "model = xgb.XGBRegressor(**xgb_params)\n",
+ "run_model(model, feats)"
+ ],
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "[20:18:10] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n",
+ "[20:18:14] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n",
+ "[20:18:18] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
+ ],
+ "name": "stdout"
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(-9569.227198767323, 72.83561801421891)"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 8
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UszcalxAhUGz",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Hyperopt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "3zgC3suCf8qy",
+ "colab_type": "code",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 972
+ },
+ "outputId": "0535e498-1556-4bf0-ac57-f0b748887b62"
+ },
+ "source": [
+ "def obj_func(params):\n",
+ "    \"\"\"Hyperopt objective: cross-validate an XGBRegressor built from `params`.\n",
+ "\n",
+ "    `params` is one point sampled from the search space and is passed as\n",
+ "    keyword arguments to `xgb.XGBRegressor`. The score returned by `run_model`\n",
+ "    is a mean of negated MAE values, so its absolute value is reported as the\n",
+ "    loss for hyperopt to minimise.\n",
+ "\n",
+ "    Returns a dict with the keys 'loss' and 'status'.\n",
+ "    \"\"\"\n",
+ "    print(\"Training with params: \")\n",
+ "    print(params)\n",
+ "\n",
+ "    mean_mae, _ = run_model(xgb.XGBRegressor(**params), feats)\n",
+ "\n",
+ "    return {'loss': np.abs(mean_mae), 'status': STATUS_OK}\n",
+ "\n",
+ "# Search space\n",
+ "xgb_reg_params = {\n",
+ "    'learning_rate': hp.choice('learning_rate', np.arange(0.05, 0.31, 0.05)),\n",
+ "    'max_depth': hp.choice('max_depth', np.arange(5, 16, 1, dtype=int)),\n",
+ "    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),\n",
+ "    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),\n",
+ "    'objective': 'reg:squarederror',\n",
+ "    'n_estimators': 100,\n",
+ "    'seed': 0,\n",
+ "}\n",
+ "\n",
+ "# Run the search\n",
+ "best = fmin(obj_func, xgb_reg_params, algo=tpe.suggest, max_evals=25)\n",
+ "\n",
+ "# For hp.choice dimensions fmin returns the index of the chosen option rather\n",
+ "# than its value; space_eval translates `best` back into real parameter values.\n",
+ "best_params = space_eval(xgb_reg_params, best)\n",
+ "best_params"
+ ],
+ "execution_count": 10,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "Training with params: \n",
+ "{'colsample_bytree': 0.75, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.65}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.55, 'learning_rate': 0.1, 'max_depth': 14, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.15000000000000002, 'max_depth': 9, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8500000000000001}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.2, 'max_depth': 15, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.75}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.75, 'learning_rate': 0.25, 'max_depth': 6, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.9, 'learning_rate': 0.2, 'max_depth': 15, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.9, 'learning_rate': 0.2, 'max_depth': 15, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.7000000000000001}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.65, 'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.9500000000000001, 'learning_rate': 0.15000000000000002, 'max_depth': 6, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.65}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.9500000000000001, 'learning_rate': 0.2, 'max_depth': 11, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8500000000000001}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.65, 'learning_rate': 0.05, 'max_depth': 14, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.6000000000000001, 'learning_rate': 0.25, 'max_depth': 6, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.65, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.6000000000000001}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.55, 'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.55}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.75, 'learning_rate': 0.25, 'max_depth': 8, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.55, 'learning_rate': 0.25, 'max_depth': 12, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.55}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.5, 'learning_rate': 0.15000000000000002, 'max_depth': 15, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.65}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.75, 'learning_rate': 0.05, 'max_depth': 15, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.15000000000000002, 'max_depth': 15, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.75}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 1.0}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 1.0, 'learning_rate': 0.3, 'max_depth': 11, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8500000000000001}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 13, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8500000000000001}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.9500000000000001, 'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9}\n",
+ "Training with params: \n",
+ "{'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 13, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 1.0}\n",
+ "100%|██████████| 25/25 [25:19<00:00, 66.04s/it, best loss: 7522.162667868127]\n"
+ ],
+ "name": "stdout"
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "{'colsample_bytree': 0.75,\n",
+ " 'learning_rate': 0,\n",
+ " 'max_depth': 10,\n",
+ " 'subsample': 0.9500000000000001}"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 10
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file