From d35bd53faee29808a8816747f83463a81d942a17 Mon Sep 17 00:00:00 2001 From: Paulina Date: Wed, 4 Mar 2020 18:18:00 +0000 Subject: [PATCH] add simple model --- day3_simple_model.ipynb | 1 + 1 file changed, 1 insertion(+) create mode 100644 day3_simple_model.ipynb diff --git a/day3_simple_model.ipynb b/day3_simple_model.ipynb new file mode 100644 index 0000000..a7544cd --- /dev/null +++ b/day3_simple_model.ipynb @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"day3_simple_model.ipynb","provenance":[],"collapsed_sections":[],"mount_file_id":"195g0PwEha3D-pwxmr-26aD_tU0hwePEc","authorship_tag":"ABX9TyPK+cDeiMb6XBU4YZMR+kkP"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"code","metadata":{"id":"Ne-HA9pKjlip","colab_type":"code","outputId":"c6ad557d-60d6-4577-a50f-bfe5397dbb74","executionInfo":{"status":"ok","timestamp":1583344755794,"user_tz":-60,"elapsed":6623,"user":{"displayName":"Paulina Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}},"colab":{"base_uri":"https://localhost:8080/","height":255}},"source":["# %pip (unlike !pip) installs into the environment of the running kernel.\n","%pip install --upgrade tables\n","%pip install eli5"],"execution_count":35,"outputs":[{"output_type":"stream","text":["Requirement already up-to-date: tables in /usr/local/lib/python3.6/dist-packages (3.6.1)\n","Requirement already satisfied, skipping upgrade: numexpr>=2.6.2 in /usr/local/lib/python3.6/dist-packages (from tables) (2.7.1)\n","Requirement already satisfied, skipping upgrade: numpy>=1.9.3 in /usr/local/lib/python3.6/dist-packages (from tables) (1.17.5)\n","Requirement already satisfied: eli5 in /usr/local/lib/python3.6/dist-packages (0.10.1)\n","Requirement already satisfied: jinja2 in /usr/local/lib/python3.6/dist-packages (from eli5) (2.11.1)\n","Requirement already satisfied: numpy>=1.9.0 in /usr/local/lib/python3.6/dist-packages (from eli5) (1.17.5)\n","Requirement already satisfied: six 
in /usr/local/lib/python3.6/dist-packages (from eli5) (1.12.0)\n","Requirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.6/dist-packages (from eli5) (0.22.1)\n","Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.6/dist-packages (from eli5) (0.8.6)\n","Requirement already satisfied: attrs>16.0.0 in /usr/local/lib/python3.6/dist-packages (from eli5) (19.3.0)\n","Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from eli5) (1.4.1)\n","Requirement already satisfied: graphviz in /usr/local/lib/python3.6/dist-packages (from eli5) (0.10.1)\n","Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2->eli5) (1.1.1)\n","Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.18->eli5) (0.14.1)\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"lJkjhrcDh67W","colab_type":"code","colab":{}},"source":["import pandas as pd\n","import numpy as np\n","\n","from sklearn.dummy import DummyRegressor\n","from sklearn.tree import DecisionTreeRegressor\n","\n","from sklearn.metrics import mean_absolute_error as mae\n","from sklearn.model_selection import cross_val_score\n","\n","import eli5\n","from eli5.sklearn import PermutationImportance"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"8xQosW6hkPWM","colab_type":"text"},"source":["## Loading data and quick overview"]},{"cell_type":"code","metadata":{"id":"hT-e53fnkd3v","colab_type":"code","outputId":"bc675a3c-9be3-4c9e-8ea0-4fdc7af7c042","executionInfo":{"status":"ok","timestamp":1583344764560,"user_tz":-60,"elapsed":590,"user":{"displayName":"Paulina Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["# Explicit %cd magic, so the cell does not depend on automagic.\n","%cd \"/content/drive/My Drive/Colab 
Notebooks/dataworkshop_matrix/matrix2/dataworkshop_matrix2\""],"execution_count":37,"outputs":[{"output_type":"stream","text":["/content/drive/My Drive/Colab Notebooks/dataworkshop_matrix/matrix2/dataworkshop_matrix2\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"mI6M2c3sk8sj","colab_type":"code","outputId":"77b668e3-5a50-4c3f-bf49-237fc2b0535e","executionInfo":{"status":"ok","timestamp":1583344767910,"user_tz":-60,"elapsed":2051,"user":{"displayName":"Paulina Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["%ls"],"execution_count":38,"outputs":[{"output_type":"stream","text":["\u001b[0m\u001b[01;34mdata\u001b[0m/ day1_meta.ipynb day2_visualisation.ipynb LICENSE README.md\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"AuVr61ONkU4z","colab_type":"code","outputId":"3bbf7608-b217-4466-e69d-1972f72800a6","executionInfo":{"status":"ok","timestamp":1583344772695,"user_tz":-60,"elapsed":2597,"user":{"displayName":"Paulina Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["# Read data/car.h5 into df; the last expression displays (rows, columns).\n","df = pd.read_hdf('data/car.h5')\n","df.shape"],"execution_count":39,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(106494, 155)"]},"metadata":{"tags":[]},"execution_count":39}]},{"cell_type":"code","metadata":{"id":"0Nj_499HlJ0N","colab_type":"code","outputId":"7fc78b34-8198-4e5e-8a69-83153cc7d793","executionInfo":{"status":"ok","timestamp":1583344775351,"user_tz":-60,"elapsed":626,"user":{"displayName":"Paulina 
Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}},"colab":{"base_uri":"https://localhost:8080/","height":187}},"source":["# Column labels of the loaded frame.\n","df.columns"],"execution_count":40,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Index(['breadcrumb', 'created_at', 'price_currency', 'price_details',\n"," 'price_value', 'seller_address', 'seller_name', 'seller_type',\n"," 'feature_czujniki-parkowania-przednie',\n"," 'feature_poduszka-powietrzna-chroniąca-kolana',\n"," ...\n"," 'param_pearl', 'param_stan', 'param_wersja', 'param_emisja-co2',\n"," 'param_body-type', 'param_matowy', 'param_bezwypadkowy',\n"," 'param_akryl-(niemetalizowany)', 'param_monthly-payment-value',\n"," 'car_id'],\n"," dtype='object', length=155)"]},"metadata":{"tags":[]},"execution_count":40}]},{"cell_type":"markdown","metadata":{"id":"rye54WjLlzS4","colab_type":"text"},"source":["## Dummy Model"]},{"cell_type":"code","metadata":{"id":"G-8jHrDfl14R","colab_type":"code","outputId":"e5cad015-ddcf-4834-c1f7-14b0bacf8b90","executionInfo":{"status":"ok","timestamp":1583344779364,"user_tz":-60,"elapsed":746,"user":{"displayName":"Paulina Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["# Labels of the numeric-dtype columns only.\n","df.select_dtypes(include=np.number).columns"],"execution_count":41,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Index(['price_value', 'car_id'], dtype='object')"]},"metadata":{"tags":[]},"execution_count":41}]},{"cell_type":"code","metadata":{"id":"v7kyiXhwmI87","colab_type":"code","outputId":"c9da2602-be34-4ef4-b5d8-bf137d7dd2a1","executionInfo":{"status":"ok","timestamp":1583344782664,"user_tz":-60,"elapsed":633,"user":{"displayName":"Paulina 
Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["# Baseline: the DummyRegressor is fit and scored on the same rows.\n","feats = ['car_id']\n","X, y = df[feats].values, df['price_value'].values\n","\n","model = DummyRegressor().fit(X, y)\n","y_pred = model.predict(X)\n","\n","mae(y, y_pred)"],"execution_count":42,"outputs":[{"output_type":"execute_result","data":{"text/plain":["39465.934630440985"]},"metadata":{"tags":[]},"execution_count":42}]},{"cell_type":"code","metadata":{"id":"zB6QsxAtnPrp","colab_type":"code","outputId":"0a38111e-c894-4ac5-900e-60262a6f20c0","executionInfo":{"status":"ok","timestamp":1583344785677,"user_tz":-60,"elapsed":727,"user":{"displayName":"Paulina Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["[col for col in df.columns if 'price' in col]"],"execution_count":43,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['price_currency', 'price_details', 'price_value']"]},"metadata":{"tags":[]},"execution_count":43}]},{"cell_type":"code","metadata":{"id":"KlNxhPEYoG4p","colab_type":"code","outputId":"19b76446-e5ea-4e94-964d-7274af98ef31","executionInfo":{"status":"ok","timestamp":1583344787573,"user_tz":-60,"elapsed":700,"user":{"displayName":"Paulina Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}},"colab":{"base_uri":"https://localhost:8080/","height":68}},"source":["df['price_currency'].value_counts()"],"execution_count":44,"outputs":[{"output_type":"execute_result","data":{"text/plain":["PLN 106290\n","EUR 204\n","Name: price_currency, dtype: 
int64"]},"metadata":{"tags":[]},"execution_count":44}]},{"cell_type":"code","metadata":{"id":"RDbJnl1EnZxf","colab_type":"code","outputId":"3d4ced47-4e77-4608-e4d0-946c8ae2e9fc","executionInfo":{"status":"ok","timestamp":1583344790447,"user_tz":-60,"elapsed":831,"user":{"displayName":"Paulina Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}},"colab":{"base_uri":"https://localhost:8080/","height":68}},"source":["# Share of each currency, expressed in percent.\n","df['price_currency'].value_counts(normalize=True).mul(100)"],"execution_count":45,"outputs":[{"output_type":"execute_result","data":{"text/plain":["PLN 99.80844\n","EUR 0.19156\n","Name: price_currency, dtype: float64"]},"metadata":{"tags":[]},"execution_count":45}]},{"cell_type":"code","metadata":{"id":"bQmlEEBrnxhj","colab_type":"code","outputId":"cbfdc429-376f-4c32-a163-6bc40f832926","executionInfo":{"status":"ok","timestamp":1583344792502,"user_tz":-60,"elapsed":678,"user":{"displayName":"Paulina Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["# Drop the rows priced in EUR; every other row is kept.\n","df = df[df['price_currency'].ne('EUR')]\n","df.shape"],"execution_count":46,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(106290, 155)"]},"metadata":{"tags":[]},"execution_count":46}]},{"cell_type":"markdown","metadata":{"id":"6V9Sb2rZoRiO","colab_type":"text"},"source":["## Features"]},{"cell_type":"code","metadata":{"id":"fvJrdfp4oq03","colab_type":"code","outputId":"2d9aa6be-7196-45e3-b8ff-62bca72aaee1","executionInfo":{"status":"ok","timestamp":1583344795827,"user_tz":-60,"elapsed":648,"user":{"displayName":"Paulina 
Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["df['param_color'].factorize()[0]"],"execution_count":47,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([-1, -1, -1, ..., -1, -1, -1])"]},"metadata":{"tags":[]},"execution_count":47}]},{"cell_type":"code","metadata":{"id":"GYW-45wqroaK","colab_type":"code","colab":{}},"source":["SUFFIX_CAT = '__cat'\n","for feat in df.columns:\n","    # Look at the first row by position: label 0 need not exist after rows were filtered out.\n","    if isinstance(df[feat].iloc[0], list): continue\n","\n","    # factorize()[0] holds one integer code per row.\n","    factorized_values = df[feat].factorize()[0]\n","    if SUFFIX_CAT in feat:\n","        df[feat] = factorized_values\n","    else:\n","        df[feat + SUFFIX_CAT] = factorized_values"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"ztWRdm6frsJZ","colab_type":"code","outputId":"774f3c39-01c3-4c5b-f2c8-8c40603b1a02","executionInfo":{"status":"ok","timestamp":1583344802351,"user_tz":-60,"elapsed":604,"user":{"displayName":"Paulina Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["cat_feats = [ x for x in df.columns if SUFFIX_CAT in x]\n","cat_feats = [ x for x in cat_feats if 'price' not in x]\n","len(cat_feats)"],"execution_count":49,"outputs":[{"output_type":"execute_result","data":{"text/plain":["151"]},"metadata":{"tags":[]},"execution_count":49}]},{"cell_type":"code","metadata":{"id":"0LJwNXaatRBe","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"e8ce3bc2-c34b-4964-c01b-38776ef6d116","executionInfo":{"status":"ok","timestamp":1583345037834,"user_tz":-60,"elapsed":4692,"user":{"displayName":"Paulina 
Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}}},"source":["X = df[cat_feats].values\n","y = df['price_value'].values\n","\n","model = DecisionTreeRegressor(max_depth=5)\n","scores = cross_val_score(model, X, y, cv=3, scoring = 'neg_mean_absolute_error')\n","np.mean(scores)"],"execution_count":50,"outputs":[{"output_type":"execute_result","data":{"text/plain":["-19566.588937368328"]},"metadata":{"tags":[]},"execution_count":50}]},{"cell_type":"code","metadata":{"id":"tao0InfZuDe1","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":391},"outputId":"882f5e0d-f696-4ee6-cef9-723b20069e76","executionInfo":{"status":"ok","timestamp":1583345365251,"user_tz":-60,"elapsed":52904,"user":{"displayName":"Paulina Kaszuba","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiwP0nBpw0nN5d3vjjNpq8-qyxWNVGuUgEtyZyqhiM=s64","userId":"08144210629576263422"}}},"source":["m = DecisionTreeRegressor(max_depth=5)\n","m.fit(X, y)\n","\n","imp = PermutationImportance(m, random_state=0).fit(X, y)\n","eli5.show_weights(imp, feature_names=cat_feats)"],"execution_count":52,"outputs":[{"output_type":"execute_result","data":{"text/html":["\n"," \n","\n","\n","\n"," \n","\n"," \n","\n"," \n","\n"," \n","\n"," \n","\n"," \n","\n","\n"," \n","\n"," \n","\n"," \n","\n"," \n","\n"," \n","\n"," \n","\n","\n"," \n","\n"," \n","\n"," \n","\n"," \n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," 
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
WeightFeature
\n"," 0.2533\n"," \n"," ± 0.0040\n"," \n"," \n"," param_napęd__cat\n","
\n"," 0.2008\n"," \n"," ± 0.0047\n"," \n"," \n"," param_faktura-vat__cat\n","
\n"," 0.1943\n"," \n"," ± 0.0088\n"," \n"," \n"," param_stan__cat\n","
\n"," 0.1423\n"," \n"," ± 0.0086\n"," \n"," \n"," param_rok-produkcji__cat\n","
\n"," 0.0629\n"," \n"," ± 0.0047\n"," \n"," \n"," param_moc__cat\n","
\n"," 0.0424\n"," \n"," ± 0.0014\n"," \n"," \n"," feature_kamera-cofania__cat\n","
\n"," 0.0412\n"," \n"," ± 0.0008\n"," \n"," \n"," param_skrzynia-biegów__cat\n","
\n"," 0.0286\n"," \n"," ± 0.0037\n"," \n"," \n"," param_marka-pojazdu__cat\n","
\n"," 0.0191\n"," \n"," ± 0.0022\n"," \n"," \n"," param_pojemność-skokowa__cat\n","
\n"," 0.0163\n"," \n"," ± 0.0005\n"," \n"," \n"," feature_bluetooth__cat\n","
\n"," 0.0117\n"," \n"," ± 0.0007\n"," \n"," \n"," feature_łopatki-zmiany-biegów__cat\n","
\n"," 0.0111\n"," \n"," ± 0.0004\n"," \n"," \n"," feature_światła-led__cat\n","
\n"," 0.0026\n"," \n"," ± 0.0002\n"," \n"," \n"," feature_klimatyzacja-manualna__cat\n","
\n"," 0.0022\n"," \n"," ± 0.0002\n"," \n"," \n"," param_kod-silnika__cat\n","
\n"," 0\n"," \n"," ± 0.0000\n"," \n"," \n"," feature_gniazdo-aux__cat\n","
\n"," 0\n"," \n"," ± 0.0000\n"," \n"," \n"," feature_klimatyzacja-automatyczna__cat\n","
\n"," 0\n"," \n"," ± 0.0000\n"," \n"," \n"," feature_radio-fabryczne__cat\n","
\n"," 0\n"," \n"," ± 0.0000\n"," \n"," \n"," feature_czujniki-parkowania-tylne__cat\n","
\n"," 0\n"," \n"," ± 0.0000\n"," \n"," \n"," feature_poduszki-boczne-tylne__cat\n","
\n"," 0\n"," \n"," ± 0.0000\n"," \n"," \n"," feature_odtwarzacz-dvd__cat\n","
\n"," … 131 more …\n","
\n"," \n","\n"," \n","\n","\n"," \n","\n"," \n","\n"," \n","\n"," \n","\n"," \n","\n"," \n","\n","\n","\n"],"text/plain":[""]},"metadata":{"tags":[]},"execution_count":52}]}]} \ No newline at end of file