diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5c088f45..aeb3a6c1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,40 +16,6 @@ Subsections for each version can be one of the following; Each individual change should have a link to the pull request after the description of the change. -0.3.3 (2023-01-19) ------------------- - -Added -^^^^^ -- added support for prior mean encoding (regularised encodings) `#46 `_ - -- added support for weights to mean, median and mode imputers `#47 `_ - -- added classname() method to BaseTransformer and prefixed all errors with classname call for easier debugging `#48 `_ - -- added DatetimeInfoExtractor transformer in ``tubular/dates.py``associated tests with ``tests/dates/test_DatetimeInfoExtractor.py`` and examples with ``examples/dates/DatetimeInfoExtractor.ipynb`` `#49 `_ - -- added DatetimeSinusoidCalculator in ``tubular/dates.py``associated tests with ``tests/dates/test_DatetimeSinusoidCalculator.py`` and examples with ``examples/dates/DatetimeSinusoidCalculator.ipynb`` `#50 `_ - -- added TwoColumnOperatorTransformer in ``tubular/numeric.py``associated tests with ``tests/numeric/test_TwoColumnOperatorTransformer.py`` and examples with ``examples/dates/TwoColumnOperatorTransformer.ipynb`` `#51 `_ - -- added StringConcatenator in ``tubular/strings.py``associated tests with ``tests/strings/test_StringConcatenator.py`` and examples with ``examples/strings/StringConcatenator.ipynb`` `#52 `_ - -- added SetColumnDtype in ``tubular/misc.py``associated tests with ``tests/misc/test_StringConcatenator.py`` and examples with ``examples/strings/StringConcatenator.ipynb`` `#53 `_ - -- added waring to MappingTransformer in ``tubular/mapping.py`` for unexpected changes in dtype `#54 `_ - -- added new module ``tubular/comparison.py`` containing EqualityChecker. 
Also added associated tests with ``tests/comparison/test_EqualityChecker.py`` and examples with ``examples/comparison/EqualityChecker.ipynb`` `#55 `_ - -- added PCATransformer in ``tubular/numeric.py``associated tests with ``tests/misc/test_PCATransformer.py`` and examples with ``examples/numeric/PCATransformer.ipynb`` `#57 `_ - -Fixed -^^^^^ -- updated black version to 22.3.0 and flake8 version to 5.0.4 to fix compatibility issues `#45 `_ - -- removed **kwargs argument from BaseTransfomer in ``tubular/base.py``to avoid silent erroring if incorrect arguments passed to transformers. Fixed a few tests which were revealed to have incorrect arguments passed by change `#56 `_ - - 0.3.2 (2022-01-13) ------------------ diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index f5cf925e..c521026f 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -43,7 +43,7 @@ General ^^^^^^^ - Please try and keep each pull request to one change or feature only -- Make sure to update the `changelog `_ with details of your change +- Make sure to update the `changelog `_ with details of your change Code formatting ^^^^^^^^^^^^^^^ diff --git a/docs/source/api.rst b/docs/source/api.rst index 0720bbe7..2aabf975 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -20,14 +20,6 @@ capping module capping.CappingTransformer capping.OutOfRangeNullTransformer - -comparison module ------------------- - -.. 
autosummary:: - :toctree: api/ - - comparison.EqualityChecker dates module ------------------ @@ -40,8 +32,6 @@ dates module dates.DateDiffLeapYearTransformer dates.SeriesDtMethodTransformer dates.ToDatetimeTransformer - dates.DatetimeInfoExtractor - dates.DatetimeSinusoidCalculator imputers module ------------------ @@ -77,7 +67,6 @@ misc module :toctree: api/ misc.SetValueTransformer - misc.SetColumnDtype nominal module ------------------ @@ -99,11 +88,9 @@ numeric module :toctree: api/ numeric.LogTransformer - numeric.CutTransformer - numeric.TwoColumnOperatorTransformer + numeric.CutTransformer numeric.ScalingTransformer numeric.InteractionTransformer - numeric.PCATransformer strings module ------------------ @@ -112,4 +99,3 @@ strings module :toctree: api/ strings.SeriesStrMethodTransformer - strings.StringConcatenator diff --git a/docs/source/quick-start.rst b/docs/source/quick-start.rst index e621e4ee..ca71c065 100644 --- a/docs/source/quick-start.rst +++ b/docs/source/quick-start.rst @@ -1,11 +1,10 @@ Quick Start ==================== -|logo| -Welcome to the quick start guide for tubular! +Welcome to the quick start guide for |logo| ! .. |logo| image:: ../../logo.png - :height: 200px + :height: 50px Installation -------------------- @@ -16,6 +15,7 @@ The easiest way to get ``tubular`` is to install directly from ``pypi``; pip install tubular +.. important:: Thanks for installing tubular! We hope you find it useful! @@ -54,24 +54,20 @@ The standard `OutOfRangeNullTransformer `_ or not `DateDifferenceTransformer `_. +Date differencing is available - accounting for leap years (`DateDiffLeapYearTransformer `_) or not (`DateDifferenceTransformer `_). The `BetweenDatesTransformer `_ calculates if one date falls between two others. The `ToDatetimeTransformer `_ converts columns to datetime type. -The `SeriesDtMethodTransformer `_ allows the user to use `pandas.Series.dt `_ methods in a similar way to `base.DataFrameMethodTransformer `_. 
- -The `DatetimeInfoExtractor `_ allows the user to extract datetime info such as the time of day or month from a datetime field. - -The `DatetimeSinusoidCalculator `_ derives a feature in a dataframe by calculating the sine or cosine of a datetime column. +Finally the `SeriesDtMethodTransformer `_ allows the user to use `pandas.Series.dt `_ methods in a similar way to `base.DataFrameMethodTransformer `_. Imputers ^^^^^^^^ -This module contains standard imputation techniques - mean, median mode as well as `NearestMeanResponseImputer `_ which imputes with the value which is closest to the ``null`` values in terms of average response. All of these support weights. +This module contains standard imputation techniques - mean, median mode as well as `NearestMeanResponseImputer `_ which imputes with the value which is closest to the ``null`` values in terms of average response. The `NullIndicator `_ is used to create binary indicators of where ``null`` values are present in a column. @@ -87,36 +83,24 @@ The `CrossColumnMappingTransformer `_ creates a constant column with arbitrary value. - -`SetDtype `_ allows the user to set the dtype of a column. +The misc module currently contains only one transformer, `SetValueTransformer `_, which creates a constant column with arbitrary value. Nominal ^^^^^^^ This module contains categorical encoding techniques. -There are respone encoding techniques such as `MeanResponseTransformer `_, one hot encoding `OneHotEncodingTransformer `_ and grouping of infrequently occuring levels `GroupRareLevelsTransformer `_. - -`MeanResponseTransformer `_ also supports regularisation of encodings using a prior. +There are respone encoding techniques such as `MeanResponseTransformer `_, one hot encoding (`OneHotEncodingTransformer `_) and grouping of infrequently occuring levels (`GroupRareLevelsTransformer `_). 
Numeric ^^^^^^^ -This module contains numeric transformations - cut `CutTransformer `_, log `LogTransformer `_, and scaling `ScalingTransformer `_. - -`TwoColumnOperatorTransformer `_ allows a user to apply operations to two colmns using methods from `pandas.DataFrame method `_ which require a multiple columns (e.g. add, subtract, multiply etc - -It also contains `InteractionTransformer `_ and `PCATransformer `_ which create interaction terms and pca components. +This module contains numeric transformations - cut (`CutTransformer `_), log (`LogTransformer `_) and scaling (`ScalingTransformer `_). Strings ^^^^^^^ -The strings module contains useful transformers for working with strings. `SeriesStrMethodTransformer `_, allows the user to access `pandas.Series.str `_ methods within ``tubular``. `StringConcatenator `_ allows a user to concatenate multiple columns together of varied dtype into a string output. - - +The strings module contains a single transformer, `SeriesStrMethodTransformer `_, that allows the user to access `pandas.Series.str `_ methods within ``tubular``. Reporting an issue --------------------------------- diff --git a/examples/comparison/EqualityChecker.ipynb b/examples/comparison/EqualityChecker.ipynb deleted file mode 100644 index 531d9c96..00000000 --- a/examples/comparison/EqualityChecker.ipynb +++ /dev/null @@ -1,299 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# EqualityChecker" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook shows the functionality of the EqualityChecker transformer class. 
This transformer compares two columns and creates a new feature containing a boolean representing whether the compared features are equal,\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "\n", - "from tubular.comparison import EqualityChecker\n", - "import tubular" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Tubular version" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.3.3'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tubular.__version__" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Simple usage\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Example dataset:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
011
123
232
341
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 1 1\n", - "1 2 3\n", - "2 3 2\n", - "3 4 1" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "base_df = pd.DataFrame({\"a\": [1, 2, 3, 4], \"b\": [1, 3, 2, 1]})\n", - "base_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initialising EqualityCheckerTransformer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The user must specify the following;\n", - "- `columns: list` containing the names of the 2 `columns` to be compared.\n", - "- `new_col_name: str` representing the name of the new column.\n", - "- `drop_original: boolean` representing whether or not to drop the original `columns`" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "compare_1 = EqualityChecker(['a','b'],'bool')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "transformed_df = compare_1.transform(base_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abbool
011True
123False
232False
341False
\n", - "
" - ], - "text/plain": [ - " a b bool\n", - "0 1 1 True\n", - "1 2 3 False\n", - "2 3 2 False\n", - "3 4 1 False" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "transformed_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.0 ('tubular-dev')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "cc666868ff21538e6058ba6d4768423bd0d0d7d7fded3ffb1bc309a0bf9339c2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/dates/DateTimeInfoExtractor.ipynb b/examples/dates/DateTimeInfoExtractor.ipynb deleted file mode 100644 index 7862b063..00000000 --- a/examples/dates/DateTimeInfoExtractor.ipynb +++ /dev/null @@ -1,563 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DatetimeInfoExtractor\n", - "This notebook shows the functionality of the `DatetimeInfoExtractor` class. This transformer extracts information from a `datetime` type column - such as the hour of the day, the month, etc - and then maps it to a label. The transformer contains default mappings for each set of extracted information, which can be overridden with an optional parameter." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import tubular\n", - "from tubular.dates import DatetimeInfoExtractor\n", - "\n", - "import tests.test_data as td" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.3.3'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tubular.__version__" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load dataset\n", - "Here we load in a dataset with datetime dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "df = td.create_datediff_test_df()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
01993-09-27 11:58:582020-05-01 12:59:59
12000-03-19 12:59:592019-12-25 11:58:58
22018-11-10 11:59:592018-11-10 11:59:59
32018-10-10 11:59:592018-11-10 11:59:59
42018-10-10 11:59:592018-09-10 09:59:59
52018-10-10 10:59:592015-11-10 11:59:59
62018-12-10 11:59:592015-11-10 12:59:59
71985-07-23 11:59:592015-07-23 11:59:59
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 1993-09-27 11:58:58 2020-05-01 12:59:59\n", - "1 2000-03-19 12:59:59 2019-12-25 11:58:58\n", - "2 2018-11-10 11:59:59 2018-11-10 11:59:59\n", - "3 2018-10-10 11:59:59 2018-11-10 11:59:59\n", - "4 2018-10-10 11:59:59 2018-09-10 09:59:59\n", - "5 2018-10-10 10:59:59 2015-11-10 11:59:59\n", - "6 2018-12-10 11:59:59 2015-11-10 12:59:59\n", - "7 1985-07-23 11:59:59 2015-07-23 11:59:59" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "a datetime64[ns]\n", - "b datetime64[ns]\n", - "dtype: object" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.dtypes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Simple usage" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Initialising DatetimeInfoExtractor" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The user must specify the following;\n", - "- `columns` the datetime columns in the `DataFrame` passed to the `transform` method to extract information from.\n", - "\n", - "The user can also choose to specify;\n", - "- `include` the information to extract. 
Must be a list containing some or all of the following - `[\"timeofday\", \"timeofmonth\", \"timeofyear\", \"dayofweek\"]`\n", - "\n", - "If one of these is in include but no mappings are provided default values will be used as follows:\n", - " timeofday_mapping = {\n", - " \"night\": range(0, 6), # Midnight - 6am\n", - " \"morning\": range(6, 12), # 6am - Noon\n", - " \"afternoon\": range(12, 18), # Noon - 6pm\n", - " \"evening\": range(18, 24), # 6pm - Midnight\n", - " }\n", - " timeofmonth_mapping = {\n", - " \"start\": range(0, 11),\n", - " \"middle\": range(11, 21),\n", - " \"end\": range(21, 32),\n", - " }\n", - " timeofyear_mapping = {\n", - " \"spring\": range(3, 6), # Mar, Apr, May\n", - " \"summer\": range(6, 9), # Jun, Jul, Aug\n", - " \"autumn\": range(9, 12), # Sep, Oct, Nov\n", - " \"winter\": [12, 1, 2], # Dec, Jan, Feb\n", - " }\n", - " dayofweek_mapping = {\n", - " \"monday\": [0],\n", - " \"tuesday\": [1],\n", - " \"wednesday\": [2],\n", - " \"thursday\": [3],\n", - " \"friday\": [4],\n", - " \"saturday\": [5],\n", - " \"sunday\": [6],\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "simple_datetime_extractor = DatetimeInfoExtractor(columns=[\"a\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "### DatetimeInfoExtractor fit\n", - "There is no fit method for the `DatetimeInfoExtractor` as the methods that it can run do not 'learn' anything from the data." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### DatetimeInfoExtractor transform\n", - "When running transform with this configuration a new column `a_timeofday_` is added to the input `X`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "df_2 = simple_datetime_extractor.transform(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
aa_timeofday
01993-09-27 11:58:58morning
12000-03-19 12:59:59afternoon
22018-11-10 11:59:59morning
32018-10-10 11:59:59morning
42018-10-10 11:59:59morning
\n", - "
" - ], - "text/plain": [ - " a a_timeofday\n", - "0 1993-09-27 11:58:58 morning\n", - "1 2000-03-19 12:59:59 afternoon\n", - "2 2018-11-10 11:59:59 morning\n", - "3 2018-10-10 11:59:59 morning\n", - "4 2018-10-10 11:59:59 morning" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_2[[\"a\", \"a_timeofday\"]].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Use Custom Mappings\n", - "\n", - "The user can choose to specify individual mappings for any of the features extracted. The `datetime_mappings` must take the following form:\n", - "\n", - "`datetime_mappings = {\"feature_to_map\": {\"label\": [List_to_map]}}`\n", - "\n", - "All hours/days/months must be mapped for each feature\n", - "\n", - "ie, a mapping for `dayofweek` must include all values 0-6;\n", - "datetime_mappings = {\"dayofweek\": {\"week\": [0, 1, 2, 3, 4],\n", - " \"weekend\": [5, 6]}}\n", - "The values for the mapping array must be iterable;\n", - "datetime_mappings = {\"timeofday\": {\"am\": range(0, 12),\n", - " \"pm\": range(12, 24)}}\n", - "\n", - "Keys of the dictionary must be contained in `include`\n", - "\n", - "The required ranges for each mapping are:\n", - "- timeofday: 0-23\n", - "- timeofmonth: 1-31\n", - "- timeofyear: 1-12\n", - "- dayofweek: 0-6\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "datetime_mappings = {\n", - " \"timeofday\": {\"am\": range(0, 12), \"pm\": range(12, 24)},\n", - " \"dayofweek\": {\"week\": range(0, 5), \"weekend\": [5, 6]},\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This `datetime_mapping` can then be used when creating the transformer. 
It is important to note that the transformer will only extract features in `include`, so all features with defined mappings must be in `include`" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bb_timeofdayb_dayofweek
02020-05-01 12:59:59pmweek
12019-12-25 11:58:58amweek
22018-11-10 11:59:59amweekend
32018-11-10 11:59:59amweekend
42018-09-10 09:59:59amweek
52015-11-10 11:59:59amweek
62015-11-10 12:59:59pmweek
72015-07-23 11:59:59amweek
\n", - "
" - ], - "text/plain": [ - " b b_timeofday b_dayofweek\n", - "0 2020-05-01 12:59:59 pm week\n", - "1 2019-12-25 11:58:58 am week\n", - "2 2018-11-10 11:59:59 am weekend\n", - "3 2018-11-10 11:59:59 am weekend\n", - "4 2018-09-10 09:59:59 am week\n", - "5 2015-11-10 11:59:59 am week\n", - "6 2015-11-10 12:59:59 pm week\n", - "7 2015-07-23 11:59:59 am week" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "time_and_day_transformer = DatetimeInfoExtractor(\n", - " columns=[\"b\"],\n", - " include=[\"timeofday\", \"dayofweek\"],\n", - " datetime_mappings=datetime_mappings,\n", - ")\n", - "\n", - "df3 = time_and_day_transformer.transform(df)\n", - "\n", - "df3[[\"b\", \"b_timeofday\", \"b_dayofweek\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.0 ('tubular-dev')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": true - }, - "vscode": { - "interpreter": { - "hash": "cc666868ff21538e6058ba6d4768423bd0d0d7d7fded3ffb1bc309a0bf9339c2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/dates/DatetimeSinusoidCalculator.ipynb b/examples/dates/DatetimeSinusoidCalculator.ipynb deleted file mode 100644 index d435b994..00000000 --- a/examples/dates/DatetimeSinusoidCalculator.ipynb +++ /dev/null @@ -1,436 +0,0 @@ -{ - 
"cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DatetimeSinusoidCalculator\n", - "This notebook shows the functionality of the `DatetimeSinusoidCalculator` class. This transformer derives a feature in a dataframe by calculating the sine or cosine of a datetime column in a given unit (e.g hour), with the option to scale period of the sine or cosine to match the natural period of the unit (e.g. 24)." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import tubular\n", - "import tests.test_data as d\n", - "from tubular.dates import DatetimeSinusoidCalculator" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.3.3'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tubular.__version__" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load dummy dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(8, 2)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = d.create_datediff_test_df()\n", - "df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
01993-09-27 11:58:582020-05-01 12:59:59
12000-03-19 12:59:592019-12-25 11:58:58
22018-11-10 11:59:592018-11-10 11:59:59
32018-10-10 11:59:592018-11-10 11:59:59
42018-10-10 11:59:592018-09-10 09:59:59
52018-10-10 10:59:592015-11-10 11:59:59
62018-12-10 11:59:592015-11-10 12:59:59
71985-07-23 11:59:592015-07-23 11:59:59
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 1993-09-27 11:58:58 2020-05-01 12:59:59\n", - "1 2000-03-19 12:59:59 2019-12-25 11:58:58\n", - "2 2018-11-10 11:59:59 2018-11-10 11:59:59\n", - "3 2018-10-10 11:59:59 2018-11-10 11:59:59\n", - "4 2018-10-10 11:59:59 2018-09-10 09:59:59\n", - "5 2018-10-10 10:59:59 2015-11-10 11:59:59\n", - "6 2018-12-10 11:59:59 2015-11-10 12:59:59\n", - "7 1985-07-23 11:59:59 2015-07-23 11:59:59" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Simple usage" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Initialising DatetimeSinusoidCalculator\n", - "The user must specify the following;\n", - "- `columns` giving the column to operate on, this can be a single column or a list of column names.\n", - "- `method` argument to specify which function is to be calculated. Accepted values are 'sin', 'cos' or a list containing both.\n", - "- `units` which time unit the calculation is to be carried out on. Accepted values are 'year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond'. \n", - "- `period` the period of the output in the units specified above. To leave the period of the sinusoid output as 2 pi, leave the value as default.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "cosine_month_calculator = DatetimeSinusoidCalculator(\n", - " ['a', 'b'],\n", - " ['sin', 'cos'],\n", - " 'month',\n", - " 12,\n", - " )\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "### DatetimeSinusoidCalculator fit\n", - "There is no `fit` method for the `DatetimeSinusoidCalculator` class, it does not learn anything from the input data `X`." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### DateTimeSinusoidCalculator transform\n", - "Four columns are added to the dataframe when the class is instantiated like this; sin_a, cos_a, sin_b and cos_b." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "df_2 = cosine_month_calculator.transform(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
absin_acos_asin_bcos_b
01993-09-27 11:58:582020-05-01 12:59:59-1.000000e+00-1.836970e-165.000000e-01-8.660254e-01
12000-03-19 12:59:592019-12-25 11:58:581.000000e+006.123234e-17-2.449294e-161.000000e+00
22018-11-10 11:59:592018-11-10 11:59:59-5.000000e-018.660254e-01-5.000000e-018.660254e-01
32018-10-10 11:59:592018-11-10 11:59:59-8.660254e-015.000000e-01-5.000000e-018.660254e-01
42018-10-10 11:59:592018-09-10 09:59:59-8.660254e-015.000000e-01-1.000000e+00-1.836970e-16
52018-10-10 10:59:592015-11-10 11:59:59-8.660254e-015.000000e-01-5.000000e-018.660254e-01
62018-12-10 11:59:592015-11-10 12:59:59-2.449294e-161.000000e+00-5.000000e-018.660254e-01
71985-07-23 11:59:592015-07-23 11:59:59-5.000000e-01-8.660254e-01-5.000000e-01-8.660254e-01
\n", - "
" - ], - "text/plain": [ - " a b sin_a cos_a \\\n", - "0 1993-09-27 11:58:58 2020-05-01 12:59:59 -1.000000e+00 -1.836970e-16 \n", - "1 2000-03-19 12:59:59 2019-12-25 11:58:58 1.000000e+00 6.123234e-17 \n", - "2 2018-11-10 11:59:59 2018-11-10 11:59:59 -5.000000e-01 8.660254e-01 \n", - "3 2018-10-10 11:59:59 2018-11-10 11:59:59 -8.660254e-01 5.000000e-01 \n", - "4 2018-10-10 11:59:59 2018-09-10 09:59:59 -8.660254e-01 5.000000e-01 \n", - "5 2018-10-10 10:59:59 2015-11-10 11:59:59 -8.660254e-01 5.000000e-01 \n", - "6 2018-12-10 11:59:59 2015-11-10 12:59:59 -2.449294e-16 1.000000e+00 \n", - "7 1985-07-23 11:59:59 2015-07-23 11:59:59 -5.000000e-01 -8.660254e-01 \n", - "\n", - " sin_b cos_b \n", - "0 5.000000e-01 -8.660254e-01 \n", - "1 -2.449294e-16 1.000000e+00 \n", - "2 -5.000000e-01 8.660254e-01 \n", - "3 -5.000000e-01 8.660254e-01 \n", - "4 -1.000000e+00 -1.836970e-16 \n", - "5 -5.000000e-01 8.660254e-01 \n", - "6 -5.000000e-01 8.660254e-01 \n", - "7 -5.000000e-01 -8.660254e-01 " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_2" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "a datetime64[ns]\n", - "b datetime64[ns]\n", - "sin_a float64\n", - "cos_a float64\n", - "sin_b float64\n", - "cos_b float64\n", - "dtype: object" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_2.dtypes" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.0 ('tubular-dev')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - 
"sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": true - }, - "vscode": { - "interpreter": { - "hash": "cc666868ff21538e6058ba6d4768423bd0d0d7d7fded3ffb1bc309a0bf9339c2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/misc/SetColumnDTypeTransformer.ipynb b/examples/misc/SetColumnDTypeTransformer.ipynb deleted file mode 100644 index f5b2b15a..00000000 --- a/examples/misc/SetColumnDTypeTransformer.ipynb +++ /dev/null @@ -1,463 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# SetDtypeTransformer\n", - "This notebook shows the functionality in the SetDtypeTransformer class. This transformer changes the column type to the new set type.
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from sklearn.pipeline import Pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import tubular\n", - "from tubular.mapping import MappingTransformer\n", - "from tubular.misc import SetColumnDtype" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.3.3'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tubular.__version__" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create sample data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(3, 5)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sample_data = pd.DataFrame({'col1':['a', 'b', 'c'], 'col2':[1, 2, 3], 'col3':[True, False, True], 'col4':[0.1, 0.2, 0.3], 'col5':['a', 'b', 'c']})\n", - "sample_data.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
col1col2col3col4col5
0a1True0.1a
1b2False0.2b
2c3True0.3c
\n", - "
" - ], - "text/plain": [ - " col1 col2 col3 col4 col5\n", - "0 a 1 True 0.1 a\n", - "1 b 2 False 0.2 b\n", - "2 c 3 True 0.3 c" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sample_data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "col1 object\n", - "col2 int64\n", - "col3 bool\n", - "col4 float64\n", - "col5 object\n", - "dtype: object" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sample_data.dtypes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set dtypes using pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Initialising SetColumnDtype" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Creating two transformers in a pipeline. First one changes dtype of col1 and col5 into a string and second one changes col2 into a float. 
note that either dtype objects or strings interpretable as such by pandas.api.types.pandas_dtype will work" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "set_dtypes_pipeline = Pipeline(\n", - " [\n", - " (\"dtype_string\", SetColumnDtype(['col1', 'col5'], dtype='string')),\n", - " (\"dtype_float\", SetColumnDtype('col2', dtype=float))\n", - " ])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SetColumnDtype transform" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "sdp = set_dtypes_pipeline.transform(sample_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "col1 string\n", - "col2 float64\n", - "col3 bool\n", - "col4 float64\n", - "col5 string\n", - "dtype: object" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sdp.dtypes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example 2\n", - "This shows handling of 'O' type which occurswhen the mapping has missing values (there are some values which are not in the dictionary)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "data = pd.DataFrame([[1, 'a'], [2, 'b']], columns=['numbers', 'letters'])" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "column_mappings_5 = {\n", - " 'numbers': {\n", - " 1: 'zzz',\n", - " 2: 'yyy',\n", - " 3: 'www'\n", - " },\n", - " 'letters': {\n", - " 'a': 'albatross'\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "map_data = MappingTransformer(mappings = column_mappings_5, copy = True, verbose = False)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dtype('int64')" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data['numbers'].dtype" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "data_transformed = map_data.transform(data)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 2 entries, 0 to 1\n", - "Data columns (total 2 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 numbers 2 non-null int64 \n", - " 1 letters 2 non-null object\n", - "dtypes: int64(1), object(1)\n", - "memory usage: 160.0+ bytes\n" - ] - } - ], - "source": [ - "data.info()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "column letters has chnaged to 'object' type. We can use the SetColumnDtype transformer to change this back to a string." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "set_dtypes = SetColumnDtype(['letters', 'numbers'], 'string')" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "data_types_changed = set_dtypes.transform(data_transformed)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 2 entries, 0 to 1\n", - "Data columns (total 2 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 numbers 2 non-null string\n", - " 1 letters 2 non-null string\n", - "dtypes: string(2)\n", - "memory usage: 160.0 bytes\n" - ] - } - ], - "source": [ - "data_types_changed.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.0 ('tubular-dev')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": true - }, - "vscode": { - "interpreter": { - "hash": "cc666868ff21538e6058ba6d4768423bd0d0d7d7fded3ffb1bc309a0bf9339c2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/numeric/PCATransformer.ipynb b/examples/numeric/PCATransformer.ipynb deleted file mode 100644 index 46bf03ac..00000000 --- 
a/examples/numeric/PCATransformer.ipynb +++ /dev/null @@ -1,569 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PCA Transformer\n", - "\n", - "This notebook shows the functionality in the `PCATransformer` class. This transformer applys the `sklearn.decomposition.pca` method to the input `X`.
\n", - "This transformer means that principal component analysis dimension reduction technique is applied to project data to a lower dimensional space. \n", - "This PCA Transformer is based on [sklearn.decomposition.PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html) class.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from sklearn.datasets import fetch_california_housing" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import tubular\n", - "from tubular.numeric import PCATransformer" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.3.3'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tubular.__version__\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load California housing dataset from sklearn" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(20640, 8)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitude
08.325241.06.9841271.023810322.02.55555637.88-122.23
18.301421.06.2381370.9718802401.02.10984237.86-122.22
27.257452.08.2881361.073446496.02.80226037.85-122.24
35.643152.05.8173521.073059558.02.54794537.85-122.25
43.846252.06.2818531.081081565.02.18146737.85-122.25
\n", - "
" - ], - "text/plain": [ - " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", - "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", - "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", - "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", - "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", - "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", - "\n", - " Longitude \n", - "0 -122.23 \n", - "1 -122.22 \n", - "2 -122.24 \n", - "3 -122.25 \n", - "4 -122.25 " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cali = fetch_california_housing()\n", - "cali_df = pd.DataFrame(cali['data'], columns=cali['feature_names'])\n", - "print(cali_df.shape)\n", - "cali_df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Simple usage \n", - "### Initialising PCATransformer\n", - "\n", - "The user can specify the following;
\n", - "- `columns` the columns in the `DataFrame` passed to the`fit` and `transform` method to be transformed
\n", - "- `n_components` number of PCA dimension expected. \"mle\" value can also be provided to guess the dimension. (default value is 2)
\n", - "- `svd_solver` the solver used to compute the Singular Value Decomposition. Available solvers : 'auto', 'full', 'arpack', 'randomized' (default value is 'auto')
\n", - "- `random_state` used when the 'arpack' or 'randomized' solvers are used. (default value is None)
\n", - "- `pca_column_prefix` prefix added to each the n components features generated.(default value is pca_)
" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "pca_transformer = PCATransformer(\n", - " columns = ['HouseAge','Population', 'MedInc'],\n", - " n_components = 2,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### InteractionTransformer fit\n", - "The `PCATransformer` must be `fit` on data before running `transform` to compute the SVD. " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "PCATransformer(columns=['HouseAge', 'Population', 'MedInc'])" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pca_transformer.fit(cali_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### InteractionTransformer transform\n", - "When running transform with this configuration new PCA dimensions columns are added to the input `X`." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudepca_0pca_1
08.325241.06.9841271.023810322.02.55555637.88-122.23-1103.5114258.636318
18.301421.06.2381370.9718802401.02.10984237.86-122.22975.543158-4.514731
27.257452.08.2881361.073446496.02.80226037.85-122.24-929.54859620.228204
35.643152.05.8173521.073059558.02.54794537.85-122.25-867.54894520.464517
43.846252.06.2818531.081081565.02.18146737.85-122.25-860.54899820.523405
\n", - "
" - ], - "text/plain": [ - " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", - "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", - "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", - "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", - "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", - "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", - "\n", - " Longitude pca_0 pca_1 \n", - "0 -122.23 -1103.511425 8.636318 \n", - "1 -122.22 975.543158 -4.514731 \n", - "2 -122.24 -929.548596 20.228204 \n", - "3 -122.25 -867.548945 20.464517 \n", - "4 -122.25 -860.548998 20.523405 " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cali_df_2 = pca_transformer.transform(cali_df)\n", - "cali_df_2.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Use a different solver\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "pca_transformer_arpack = PCATransformer(\n", - " columns = ['HouseAge','Population', 'MedInc'],\n", - " n_components = 1,\n", - " svd_solver = 'arpack',\n", - " random_state=32, \n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudepca_0
08.325241.06.9841271.023810322.02.55555637.88-122.23-1103.511425
18.301421.06.2381370.9718802401.02.10984237.86-122.22975.543158
27.257452.08.2881361.073446496.02.80226037.85-122.24-929.548596
35.643152.05.8173521.073059558.02.54794537.85-122.25-867.548945
43.846252.06.2818531.081081565.02.18146737.85-122.25-860.548998
\n", - "
" - ], - "text/plain": [ - " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", - "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", - "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", - "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", - "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", - "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", - "\n", - " Longitude pca_0 \n", - "0 -122.23 -1103.511425 \n", - "1 -122.22 975.543158 \n", - "2 -122.24 -929.548596 \n", - "3 -122.25 -867.548945 \n", - "4 -122.25 -860.548998 " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cali_df_3 = pca_transformer_arpack.fit_transform(cali_df)\n", - "cali_df_3.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "basemodel", - "language": "python", - "name": "basemodel" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/examples/numeric/TwoColumnOperatorTransformer.ipynb b/examples/numeric/TwoColumnOperatorTransformer.ipynb deleted file mode 100644 index 404ef015..00000000 --- a/examples/numeric/TwoColumnOperatorTransformer.ipynb +++ /dev/null @@ -1,473 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# TwoColumnOperatorTransformer\n", - "This notebook shows the functionality in the TwoColumnOperatorTransformer class. This transformer applies pandas dataframe methods that involve combining two columns under the action of some operator. Examples are shown here for addition and modulo.
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import tubular\n", - "from tubular.numeric import TwoColumnOperatorTransformer\n", - "from sklearn.datasets import fetch_california_housing\n", - "\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.3.3'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tubular.__version__" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Boston house price dataset from sklearn\n", - "Note, the load_boston script modifies the original Boston dataset to include nulls values and pandas categorical dtypes." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(20640, 8)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitude
08.325241.06.9841271.023810322.02.55555637.88-122.23
18.301421.06.2381370.9718802401.02.10984237.86-122.22
27.257452.08.2881361.073446496.02.80226037.85-122.24
35.643152.05.8173521.073059558.02.54794537.85-122.25
43.846252.06.2818531.081081565.02.18146737.85-122.25
\n", - "
" - ], - "text/plain": [ - " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", - "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", - "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", - "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", - "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", - "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", - "\n", - " Longitude \n", - "0 -122.23 \n", - "1 -122.22 \n", - "2 -122.24 \n", - "3 -122.25 \n", - "4 -122.25 " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cali = fetch_california_housing()\n", - "cali_df = pd.DataFrame(cali['data'], columns=cali['feature_names'])\n", - "print(cali_df.shape)\n", - "cali_df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Examples" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The transformer assigns the output of the method to a new column. The method will be applied in the form (column 1)operator(column 2), so order matters (if the method does not commute). It is possible to supply other key word arguments to the transform method, which will be passed to the pandas.DataFrame method being called." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The minimal arguments to initialise the transformer are given below. 
More can be found in the class documentation.\n", - "- `pd_method_name` The name of the pandas dataframe method to apply\n", - "- `column1_name` The name of the 1st column in the operation.\n", - "- `column2_name` The name of the 2nd column in the operation.\n", - "- `new_column_name` The name of the new column.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Addition" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "addition = TwoColumnOperatorTransformer('add', ['Latitude', 'Longitude'], 'Latitude + Longitude')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "cali_df_2 = addition.transform(cali_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
LatitudeLongitudeLatitude + Longitude
037.88-122.23-84.35
137.86-122.22-84.36
237.85-122.24-84.39
337.85-122.25-84.40
437.85-122.25-84.40
\n", - "
" - ], - "text/plain": [ - " Latitude Longitude Latitude + Longitude\n", - "0 37.88 -122.23 -84.35\n", - "1 37.86 -122.22 -84.36\n", - "2 37.85 -122.24 -84.39\n", - "3 37.85 -122.25 -84.40\n", - "4 37.85 -122.25 -84.40" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cali_df_2[['Latitude', 'Longitude', 'Latitude + Longitude']].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Modulo" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "modulo = TwoColumnOperatorTransformer('mod', ['Population', 'HouseAge'], 'HouseAge mod Population')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "cali_df_3 = modulo.transform(cali_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
PopulationHouseAgeHouseAge mod Population
0322.041.035.0
12401.021.07.0
2496.052.028.0
3558.052.038.0
4565.052.045.0
\n", - "
" - ], - "text/plain": [ - " Population HouseAge HouseAge mod Population\n", - "0 322.0 41.0 35.0\n", - "1 2401.0 21.0 7.0\n", - "2 496.0 52.0 28.0\n", - "3 558.0 52.0 38.0\n", - "4 565.0 52.0 45.0" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cali_df_3[['Population', 'HouseAge', 'HouseAge mod Population']].head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.0 ('tubular-dev')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": true - }, - "vscode": { - "interpreter": { - "hash": "cc666868ff21538e6058ba6d4768423bd0d0d7d7fded3ffb1bc309a0bf9339c2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/strings/StringConcatenator.ipynb b/examples/strings/StringConcatenator.ipynb deleted file mode 100644 index b0a8e971..00000000 --- a/examples/strings/StringConcatenator.ipynb +++ /dev/null @@ -1,301 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# StringConcatenator\n", - "This notebook shows the functionality of the `StringConcatenator` class. This Transformer combines data from specified columns, of mixed datatypes, into a new column containing one string." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import tubular\n", - "from tubular.strings import StringConcatenator" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.3.3'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tubular.__version__" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create sample data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "character_df = pd.DataFrame(([':', ')', 5], [':', '(', 3]), columns=['c1', 'c2', 'c3'])" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
c1c2c3
0:)5
1:(3
\n", - "
" - ], - "text/plain": [ - " c1 c2 c3\n", - "0 : ) 5\n", - "1 : ( 3" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "character_df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Simple usage" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Initialising StringConcatenatorTransformer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The user must specify the following;\n", - "- `columns` giving the columns to join\n", - "- `new column` giving the name of the new column\n", - "- `separator` giving the separator for joining (optional)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "to_merge = ['c1', 'c2', 'c3']\n", - "\n", - "join_columns = StringConcatenator(to_merge, \"merged\", \"-\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### StringConcatenatorTransformer fit\n", - "There is no fit method for the StringConcatenatorTransformer as the methods that it can run do not 'learn' anything from the data." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### StringConcatenatorTransformer transform" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "joined_df = join_columns.transform(character_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
c1c2c3merged
0:)5:-)-5
1:(3:-(-3
\n", - "
" - ], - "text/plain": [ - " c1 c2 c3 merged\n", - "0 : ) 5 :-)-5\n", - "1 : ( 3 :-(-3" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "joined_df" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.0 ('tubular-dev')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "calc(100% - 180px)", - "left": "10px", - "top": "150px", - "width": "384px" - }, - "toc_section_display": true, - "toc_window_display": true - }, - "vscode": { - "interpreter": { - "hash": "cc666868ff21538e6058ba6d4768423bd0d0d7d7fded3ffb1bc309a0bf9339c2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/requirements-dev.txt b/requirements-dev.txt index 06af2b48..03ab10c5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,6 +5,6 @@ pytest>=5.4.1 pytest-mock>=3.5.1 pytest-cov>=2.10.1 pre-commit==2.15.0 -black==22.3.0 -flake8==5.0.4 +black==21.9b0 +flake8==3.9.2 bandit==1.7.0 \ No newline at end of file diff --git a/tests/base/test_BaseTransformer.py b/tests/base/test_BaseTransformer.py index 8e8c3e0c..6f44b14e 100644 --- a/tests/base/test_BaseTransformer.py +++ b/tests/base/test_BaseTransformer.py @@ -2,8 +2,7 @@ import test_aide as ta import tests.test_data as d from unittest import mock -import pandas as pd -import numpy as np +import pandas import re import tubular @@ -92,14 +91,14 @@ def test_class_methods(self): def test_verbose_non_bool_error(self): """Test an error is raised if verbose 
is not specified as a bool.""" - with pytest.raises(TypeError, match="BaseTransformer: verbose must be a bool"): + with pytest.raises(TypeError, match="verbose must be a bool"): BaseTransformer(verbose=1) def test_copy_non_bool_error(self): """Test an error is raised if copy is not specified as a bool.""" - with pytest.raises(TypeError, match="BaseTransformer: copy must be a bool"): + with pytest.raises(TypeError, match="copy must be a bool"): BaseTransformer(copy=1) @@ -116,7 +115,7 @@ def test_columns_list_element_error(self): with pytest.raises( TypeError, match=re.escape( - "BaseTransformer: each element of columns should be a single (string) column name" + "each element of columns should be a single (string) column name" ), ): @@ -128,7 +127,7 @@ def test_columns_non_string_error(self): with pytest.raises( TypeError, match=re.escape( - "BaseTransformer: columns must be a string or list with the columns to be pre-processed (if specified)" + "columns must be a string or list with the columns to be pre-processed (if specified)" ), ): @@ -163,9 +162,7 @@ def test_X_non_df_error(self): x = BaseTransformer(columns="a") - with pytest.raises( - TypeError, match="BaseTransformer: X should be a pd.DataFrame" - ): + with pytest.raises(TypeError, match="X should be a pd.DataFrame"): x.fit("a") @@ -177,8 +174,7 @@ def test_non_pd_type_error(self): x = BaseTransformer(columns="a") with pytest.raises( - TypeError, - match="BaseTransformer: unexpected type for y, should be a pd.Series", + TypeError, match="unexpected type for y, should be a pd.Series" ): x.fit(X=df, y=[1, 2, 3, 4, 5, 6]) @@ -206,11 +202,9 @@ def test_X_no_rows_error(self): x = BaseTransformer(columns="a") - df = pd.DataFrame(columns=["a"]) + df = pandas.DataFrame(columns=["a"]) - with pytest.raises( - ValueError, match=re.escape("BaseTransformer: X has no rows; (0, 1)") - ): + with pytest.raises(ValueError, match=re.escape("X has no rows; (0, 1)")): x.fit(X=df) @@ -219,24 +213,11 @@ def 
test_y_no_rows_error(self): x = BaseTransformer(columns="a") - df = pd.DataFrame({"a": 1}, index=[0]) - - with pytest.raises( - ValueError, match=re.escape("BaseTransformer: y is empty; (0,)") - ): - - x.fit(X=df, y=pd.Series(name="b", dtype=object)) + df = pandas.DataFrame({"a": 1}, index=[0]) - def test_unexpected_kwarg_error(self): - - with pytest.raises( - TypeError, - match=re.escape( - "__init__() got an unexpected keyword argument 'unexpected_kwarg'" - ), - ): + with pytest.raises(ValueError, match=re.escape("y is empty; (0,)")): - BaseTransformer(columns="a", unexpected_kwarg="spanish inquisition") + x.fit(X=df, y=pandas.Series(name="b", dtype=object)) class TestTransform(object): @@ -269,9 +250,7 @@ def test_non_pd_type_error(self): x = BaseTransformer(columns="a") - with pytest.raises( - TypeError, match="BaseTransformer: X should be a pd.DataFrame" - ): + with pytest.raises(TypeError, match="X should be a pd.DataFrame"): x.transform(X=[1, 2, 3, 4, 5, 6]) @@ -285,7 +264,7 @@ def test_df_copy_called(self, mocker): expected_call_args = {0: {"args": (), "kwargs": {}}} with ta.functions.assert_function_call( - mocker, pd.DataFrame, "copy", expected_call_args, return_value=df + mocker, pandas.DataFrame, "copy", expected_call_args, return_value=df ): x.transform(X=df) @@ -295,11 +274,9 @@ def test_no_rows_error(self): x = BaseTransformer(columns="a") - df = pd.DataFrame(columns=["a"]) + df = pandas.DataFrame(columns=["a"]) - with pytest.raises( - ValueError, match=re.escape("BaseTransformer: X has no rows; (0, 1)") - ): + with pytest.raises(ValueError, match=re.escape("X has no rows; (0, 1)")): x.transform(df) @@ -336,9 +313,7 @@ def test_non_pd_df_error(self): x = BaseTransformer(columns="a") - with pytest.raises( - TypeError, match="BaseTransformer: X should be a pd.DataFrame" - ): + with pytest.raises(TypeError, match="X should be a pd.DataFrame"): x.columns_check(X=[1, 2, 3, 4, 5, 6]) @@ -364,9 +339,7 @@ def test_columns_str_error(self): x.columns = "a" - 
with pytest.raises( - TypeError, match="BaseTransformer: self.columns should be a list" - ): + with pytest.raises(TypeError, match="self.columns should be a list"): x.columns_check(X=df) @@ -397,9 +370,7 @@ def test_non_pd_df_error(self): x = BaseTransformer(columns="a") - with pytest.raises( - TypeError, match="BaseTransformer: X should be a pd.DataFrame" - ): + with pytest.raises(TypeError, match="X should be a pd.DataFrame"): x.columns_set_or_check(X=[1, 2, 3, 4, 5, 6]) @@ -475,20 +446,18 @@ def test_X_not_DataFrame_error(self): x = BaseTransformer(columns=["a"]) - with pytest.raises( - TypeError, match="BaseTransformer: X should be a pd.DataFrame" - ): + with pytest.raises(TypeError, match="X should be a pd.DataFrame"): - x._combine_X_y(X=1, y=pd.Series([1, 2])) + x._combine_X_y(X=1, y=pandas.Series([1, 2])) def test_y_not_Series_error(self): """Test an exception is raised if y is not a pd.Series.""" x = BaseTransformer(columns=["a"]) - with pytest.raises(TypeError, match="BaseTransformer: y should be a pd.Series"): + with pytest.raises(TypeError, match="y should be a pd.Series"): - x._combine_X_y(X=pd.DataFrame({"a": [1, 2]}), y=1) + x._combine_X_y(X=pandas.DataFrame({"a": [1, 2]}), y=1) def test_X_and_y_different_number_of_rows_error(self): """Test an exception is raised if X and y have different numbers of rows.""" @@ -497,31 +466,27 @@ def test_X_and_y_different_number_of_rows_error(self): with pytest.raises( ValueError, - match=re.escape( - "BaseTransformer: X and y have different numbers of rows (2 vs 1)" - ), + match=re.escape("X and y have different numbers of rows (2 vs 1)"), ): - x._combine_X_y(X=pd.DataFrame({"a": [1, 2]}), y=pd.Series([2])) + x._combine_X_y(X=pandas.DataFrame({"a": [1, 2]}), y=pandas.Series([2])) def test_X_and_y_different_indexes_warning(self): """Test a warning is raised if X and y have different indexes, but the output is still X and y.""" x = BaseTransformer(columns=["a"]) - with pytest.warns( - UserWarning, 
match="BaseTransformer: X and y do not have equal indexes" - ): + with pytest.warns(UserWarning, match="X and y do not have equal indexes"): result = x._combine_X_y( - X=pd.DataFrame({"a": [1, 2]}, index=[1, 2]), y=pd.Series([2, 4]) + X=pandas.DataFrame({"a": [1, 2]}, index=[1, 2]), y=pandas.Series([2, 4]) ) - expected_output = pd.DataFrame( + expected_output = pandas.DataFrame( {"a": [1, 2], "_temporary_response": [2, 4]}, index=[1, 2] ) - pd.testing.assert_frame_equal(result, expected_output) + pandas.testing.assert_frame_equal(result, expected_output) def test_output_same_indexes(self): """Test output is correct if X and y have the same index.""" @@ -529,62 +494,12 @@ def test_output_same_indexes(self): x = BaseTransformer(columns=["a"]) result = x._combine_X_y( - X=pd.DataFrame({"a": [1, 2]}, index=[1, 2]), - y=pd.Series([2, 4], index=[1, 2]), + X=pandas.DataFrame({"a": [1, 2]}, index=[1, 2]), + y=pandas.Series([2, 4], index=[1, 2]), ) - expected_output = pd.DataFrame( + expected_output = pandas.DataFrame( {"a": [1, 2], "_temporary_response": [2, 4]}, index=[1, 2] ) - pd.testing.assert_frame_equal(result, expected_output) - - -class TestCheckWeightsColumn: - "tests for check_weights_column method" - - def test_arguments(self): - """Test that columns_set_or_check has expected arguments.""" - - ta.functions.test_function_arguments( - func=BaseTransformer.check_weights_column, - expected_arguments=["X", "weights_column"], - ) - - def test_weight_not_in_X_error(self): - """Test an error is raised if weight is not in X.""" - - X = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - - with pytest.raises( - ValueError, match=r"weight col \(c\) is not present in columns of data" - ): - - BaseTransformer.check_weights_column(X, "c") - - def test_weight_non_numeric_error(self): - """Test an error is raised if weight col is non-numeric""" - - X = pd.DataFrame({"a": [1, 2], "b": ["a", "b"]}) - - with pytest.raises(ValueError, match="weight column must be numeric."): - - 
BaseTransformer.check_weights_column(X, "b") - - def test_weight_non_positive_error(self): - """Test an error is raised if weight col is non-positive""" - - X = pd.DataFrame({"a": [1, 2], "b": [-1, 0]}) - - with pytest.raises(ValueError, match="weight column must be positive"): - - BaseTransformer.check_weights_column(X, "b") - - def test_weight_null_error(self): - """Test an error is raised if weight col is null""" - - X = pd.DataFrame({"a": [1, 2], "b": [np.NaN, 0]}) - - with pytest.raises(ValueError, match="weight column must be non-null"): - - BaseTransformer.check_weights_column(X, "b") + pandas.testing.assert_frame_equal(result, expected_output) diff --git a/tests/base/test_DataFrameMethodTransformer.py b/tests/base/test_DataFrameMethodTransformer.py index 8ff139cd..52e9eaeb 100644 --- a/tests/base/test_DataFrameMethodTransformer.py +++ b/tests/base/test_DataFrameMethodTransformer.py @@ -3,7 +3,6 @@ import tests.test_data as d import pandas as pd import numpy as np -import re import tubular from tubular.base import DataFrameMethodTransformer @@ -75,7 +74,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"DataFrameMethodTransformer: unexpected type \(\\) for pd_method_name, expecting str", + match=r"unexpected type \(\\) for pd_method_name, expecting str", ): DataFrameMethodTransformer( @@ -84,7 +83,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"DataFrameMethodTransformer: unexpected type \(\\) for new_column_name, must be str or list of strings", + match=r"unexpected type \(\\) for new_column_name, must be str or list of strings", ): DataFrameMethodTransformer( @@ -93,7 +92,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"DataFrameMethodTransformer: if new_column_name is a list, all elements must be strings but got \ in position 1", + match=r"if new_column_name is a list, all elements must be strings but got \ in position 1", ): 
DataFrameMethodTransformer( @@ -102,7 +101,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"""DataFrameMethodTransformer: pd_method_kwargs should be a dict but got type \""", + match=r"""pd_method_kwargs should be a dict but got type \""", ): DataFrameMethodTransformer( @@ -114,7 +113,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"""DataFrameMethodTransformer: unexpected type \(\\) for pd_method_kwargs key in position 1, must be str""", + match=r"""unexpected type \(\\) for pd_method_kwargs key in position 1, must be str""", ): DataFrameMethodTransformer( @@ -126,7 +125,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"DataFrameMethodTransformer: unexpected type \(\\) for drop_original, expecting bool", + match=r"unexpected type \(\\) for drop_original, expecting bool", ): DataFrameMethodTransformer( @@ -141,7 +140,7 @@ def test_exception_raised_non_pandas_method_passed(self): with pytest.raises( AttributeError, - match="""DataFrameMethodTransformer: error accessing "b" method on pd.DataFrame object - pd_method_name should be a pd.DataFrame method""", + match="""error accessing "b" method on pd.DataFrame object - pd_method_name should be a pd.DataFrame method""", ): DataFrameMethodTransformer( @@ -168,23 +167,6 @@ def test_attributes_set(self): msg="Attributes for DataFrameMethodTransformer set in init", ) - def test_unexpected_kwarg_error(self): - - with pytest.raises( - TypeError, - match=re.escape( - "__init__() got an unexpected keyword argument 'unexpected_kwarg'" - ), - ): - - DataFrameMethodTransformer( - new_column_name="a", - pd_method_name="sum", - columns=["b", "c"], - drop_original=True, - unexpected_kwarg="spanish inquisition", - ) - class TestTransform(object): """Tests for DataFrameMethodTransformer.transform().""" diff --git a/tests/capping/test_CappingTransformer.py b/tests/capping/test_CappingTransformer.py index 87e2f0bb..72d97c19 
100644 --- a/tests/capping/test_CappingTransformer.py +++ b/tests/capping/test_CappingTransformer.py @@ -56,7 +56,7 @@ def test_capping_values_quantiles_both_none_error(self): with pytest.raises( ValueError, - match="CappingTransformer: both capping_values and quantiles are None, either supply capping values in the " + match="both capping_values and quantiles are None, either supply capping values in the " "capping_values argument or supply quantiles that can be learnt in the fit method", ): @@ -67,7 +67,7 @@ def test_capping_values_quantiles_both_specified_error(self): with pytest.raises( ValueError, - match="CappingTransformer: both capping_values and quantiles are not None, supply one or the other", + match="both capping_values and quantiles are not None, supply one or the other", ): CappingTransformer( @@ -80,7 +80,7 @@ def test_quantiles_outside_range_error(self, out_range_value): with pytest.raises( ValueError, - match=rf"CappingTransformer: quantile values must be in the range \[0, 1\] but got {out_range_value} for key f", + match=rf"quantile values must be in the range \[0, 1\] but got {out_range_value} for key f", ): CappingTransformer( @@ -217,7 +217,7 @@ def test_capping_values_not_dict_error(self): with pytest.raises( TypeError, - match="CappingTransformer: aaa should be dict of columns and capping values", + match="aaa should be dict of columns and capping values", ): x.check_capping_values_dict( @@ -231,7 +231,7 @@ def test_capping_values_non_str_key_error(self): with pytest.raises( TypeError, - match=r"CappingTransformer: all keys in bbb should be str, but got \", + match=r"all keys in bbb should be str, but got \", ): x.check_capping_values_dict( @@ -245,7 +245,7 @@ def test_capping_values_non_list_item_error(self): with pytest.raises( TypeError, - match=r"CappingTransformer: each item in ccc should be a list, but got \ for key b", + match=r"each item in ccc should be a list, but got \ for key b", ): x.check_capping_values_dict( @@ -259,7 +259,7 @@ 
def test_capping_values_non_length_2_list_item_error(self): with pytest.raises( ValueError, - match="CappingTransformer: each item in ddd should be length 2, but got 1 for key b", + match="each item in ddd should be length 2, but got 1 for key b", ): x.check_capping_values_dict( @@ -273,7 +273,7 @@ def test_capping_values_non_numeric_error(self): with pytest.raises( TypeError, - match=r"CappingTransformer: each item in eee lists must contain numeric values or None, got \ for key a", + match=r"each item in eee lists must contain numeric values or None, got \ for key a", ): x.check_capping_values_dict( @@ -287,7 +287,7 @@ def test_lower_value_gte_upper_value_error(self): with pytest.raises( ValueError, - match="CappingTransformer: lower value is greater than or equal to upper value for key a", + match="lower value is greater than or equal to upper value for key a", ): x.check_capping_values_dict( @@ -302,7 +302,7 @@ def test_capping_value_nan_inf_error(self, value): with pytest.raises( ValueError, - match="CappingTransformer: item in eee lists contains numpy NaN or Inf values", + match="item in eee lists contains numpy NaN or Inf values", ): x.check_capping_values_dict( @@ -314,9 +314,7 @@ def test_capping_values_both_null_error(self): x = CappingTransformer(capping_values={"a": [1, 3], "b": [None, -1]}) - with pytest.raises( - ValueError, match="CappingTransformer: both values are None for key a" - ): + with pytest.raises(ValueError, match="both values are None for key a"): x.check_capping_values_dict( capping_values_dict={"a": [None, None], "b": [None, 1]}, dict_name="eee" @@ -340,7 +338,7 @@ def test_quantiles_none_error(self): with pytest.warns( UserWarning, - match="CappingTransformer: quantiles not set so no fitting done in CappingTransformer", + match="quantiles not set so no fitting done in CappingTransformer", ): df = d.create_df_3() @@ -747,8 +745,7 @@ def test_non_numeric_column_error(self): x = CappingTransformer(capping_values={"a": [2, 5], "b": [-1, 8], 
"c": [-1, 8]}) with pytest.raises( - TypeError, - match=r"CappingTransformer: The following columns are not numeric in X; \['b', 'c'\]", + TypeError, match=r"The following columns are not numeric in X; \['b', 'c'\]" ): x.transform(df) @@ -762,7 +759,7 @@ def test_quantile_not_fit_error(self): with pytest.raises( ValueError, - match="CappingTransformer: capping_values attribute is an empty dict - perhaps the fit method has not been run yet", + match="capping_values attribute is an empty dict - perhaps the fit method has not been run yet", ): x.transform(df) @@ -779,7 +776,7 @@ def test_replacement_values_dict_not_set_error(self): with pytest.raises( ValueError, - match="CappingTransformer: _replacement_values attribute is an empty dict - perhaps the fit method has not been run yet", + match="_replacement_values attribute is an empty dict - perhaps the fit method has not been run yet", ): x.transform(df) @@ -877,8 +874,7 @@ def test_zero_total_weight_error(self): x = CappingTransformer(capping_values={"a": [2, 10]}) with pytest.raises( - ValueError, - match="CappingTransformer: total sample weights are not greater than 0", + ValueError, match="total sample weights are not greater than 0" ): x.weighted_quantile([2, 3, 4, 5], [0, 1], [0, 0]) @@ -888,9 +884,7 @@ def test_null_values_in_weights_error(self): x = CappingTransformer(capping_values={"a": [2, 10]}) - with pytest.raises( - ValueError, match="CappingTransformer: null values in sample weights" - ): + with pytest.raises(ValueError, match="null values in sample weights"): x.weighted_quantile([2, 3, 4, 5], [0, 1], [3, np.NaN]) @@ -899,15 +893,11 @@ def test_inf_values_in_weights_error(self): x = CappingTransformer(capping_values={"a": [2, 10]}) - with pytest.raises( - ValueError, match="CappingTransformer: infinite values in sample weights" - ): + with pytest.raises(ValueError, match="infinite values in sample weights"): x.weighted_quantile([2, 3, 4, 5], [0, 1], [2, np.inf]) - with pytest.raises( - ValueError, 
match="CappingTransformer: infinite values in sample weights" - ): + with pytest.raises(ValueError, match="infinite values in sample weights"): x.weighted_quantile([2, 3, 4, 5], [0, 1], [1, -np.inf]) @@ -916,8 +906,6 @@ def test_negative_values_in_weights_error(self): x = CappingTransformer(capping_values={"a": [2, 10]}) - with pytest.raises( - ValueError, match="CappingTransformer: negative weights in sample weights" - ): + with pytest.raises(ValueError, match="negative weights in sample weights"): x.weighted_quantile([2, 3, 4, 5], [0, 1], [2, -0.01]) diff --git a/tests/comparison/test_EqualityChecker.py b/tests/comparison/test_EqualityChecker.py deleted file mode 100644 index c50c0100..00000000 --- a/tests/comparison/test_EqualityChecker.py +++ /dev/null @@ -1,189 +0,0 @@ -import pytest -import tests.test_data as d -import test_aide as ta - -import tubular -from tubular.comparison import EqualityChecker - - -@pytest.fixture(scope="module", autouse=True) -def example_transformer(): - - example_transformer = EqualityChecker(columns=["a", "b"], new_col_name="d") - - return example_transformer - - -class TestInit: - - """Tests for the EqualityChecker.__init__ method.""" - - def test_arguments(self): - - """Test that init has expected arguments.""" - - ta.functions.test_function_arguments( - func=EqualityChecker.__init__, - expected_arguments=["self", "columns", "new_col_name", "drop_original"], - expected_default_values=(False,), - ) - - def test_inheritance(self, example_transformer): - """Test EqualityChecker inherits from BaseTransformer.""" - - assert isinstance( - example_transformer, tubular.base.BaseTransformer - ), "EqualityChecker is not instance of tubular.base.BaseTransformer" - - def test_super_init_call(self, mocker): - """Test that BaseTransformer.init is called as expected.""" - - expected_call_args = { - 0: { - "args": (), - "kwargs": {"columns": ["a", "b"], "verbose": False, "copy": False}, - } - } - - with ta.functions.assert_function_call( - 
mocker, tubular.base.BaseTransformer, "__init__", expected_call_args - ): - - EqualityChecker( - columns=["a", "b"], new_col_name="d", verbose=False, copy=False - ) - - def test_class_methods(self, example_transformer): - """Test that EqualityChecker has transform method.""" - msg = "no transformation method in class" - ta.classes.test_object_method( - obj=example_transformer, expected_method="transform", msg=msg - ) - - def test_value_new_col_name(self, example_transformer): - """Test that the value passed in the new column name arg is correct.""" - - assert ( - example_transformer.new_col_name == "d" - ), "unexpected value set to new_col_name atttribute" - - def test_value_drop_original(self, example_transformer): - """Test that the value passed in the drop_original arg is correct.""" - - assert ( - not example_transformer.drop_original - ), "unexpected value set to drop_original atttribute" - - @pytest.mark.parametrize("test_input_col_type", ["a", None]) - def test_type_error_for_columns(self, test_input_col_type): - """Checks that an error is raised if wrong data type for argument:columns""" - - with pytest.raises( - TypeError, - match="columns should be list", - ): - EqualityChecker(columns=test_input_col_type, new_col_name="d") - - @pytest.mark.parametrize("test_input_col", [["b", "b", "b"], ["a"]]) - def test_value_error_for_columns(self, test_input_col): - """Checks that a value error is raised where 2 cols are not supplied""" - with pytest.raises( - ValueError, - match="This transformer works with two columns only", - ): - EqualityChecker(columns=test_input_col, new_col_name="d") - - @pytest.mark.parametrize("test_input_new_col", [123, ["a"], True]) - def test_type_error_for_new_column_name(self, test_input_new_col): - """Checks that an error is raised if wrong data type for argument:new_col_name""" - - with pytest.raises( - TypeError, - match="new_col_name should be str", - ): - EqualityChecker(columns=["a", "b"], new_col_name=test_input_new_col) - - 
@pytest.mark.parametrize("test_input_drop_col", [123, ["a"], "asd"]) - def test_type_error_for_drop_column(self, test_input_drop_col): - """Checks that an error is raised if wrong data type for argument:drop_original""" - - with pytest.raises( - TypeError, - match="drop_original should be bool", - ): - EqualityChecker( - columns=["a", "b"], - new_col_name="col_name", - drop_original=test_input_drop_col, - ) - - -class TestTransform(object): - """Tests for the EqualityChecker.transform method.""" - - def test_arguments(self): - """Test that transform has expected arguments.""" - - ta.functions.test_function_arguments( - func=EqualityChecker.transform, - expected_arguments=["self", "X"], - expected_default_values=None, - ) - - def test_super_transform_called(self, mocker, example_transformer): - """Test that BaseTransformer.transform called.""" - - df = d.create_df_7() - - expected_call_args = {0: {"args": (d.create_df_7(),), "kwargs": {}}} - - with ta.functions.assert_function_call( - mocker, tubular.base.BaseTransformer, "transform", expected_call_args - ): - - example_transformer.transform(df) - - @pytest.mark.parametrize( - "test_dataframe", [d.create_df_5(), d.create_df_2(), d.create_df_9()] - ) - def test_expected_output(self, test_dataframe): - """Tests that the output given by EqualityChecker tranformer is as you would expect - when all cases are neither all True nor False""" - - expected = test_dataframe - expected["bool_logic"] = expected["b"] == expected["c"] - - example_transformer = EqualityChecker( - columns=["b", "c"], new_col_name="bool_logic" - ) - actual = example_transformer.transform(test_dataframe) - - ta.equality.assert_frame_equal_msg( - actual=actual, - expected=expected, - msg_tag="EqualityChecker transformer does not produce the expected output", - print_actual_and_expected=True, - ) - - @pytest.mark.parametrize( - "test_dataframe", [d.create_df_5(), d.create_df_2(), d.create_df_9()] - ) - def test_expected_output_dropped(self, 
test_dataframe): - """Tests that the output given by EqualityChecker tranformer is as you would expect - when all cases are neither all True nor False""" - - expected = test_dataframe.copy() - expected["bool_logic"] = expected["b"] == expected["c"] - expected = expected.drop(["b", "c"], axis=1) - - example_transformer = EqualityChecker( - columns=["b", "c"], new_col_name="bool_logic", drop_original=True - ) - actual = example_transformer.transform(test_dataframe) - - ta.equality.assert_frame_equal_msg( - actual=actual, - expected=expected, - msg_tag="EqualityChecker transformer does not produce the expected output", - print_actual_and_expected=True, - ) diff --git a/tests/dates/test_BetweenDatesTransformer.py b/tests/dates/test_BetweenDatesTransformer.py index 9bcd8c3d..d09fa688 100644 --- a/tests/dates/test_BetweenDatesTransformer.py +++ b/tests/dates/test_BetweenDatesTransformer.py @@ -64,9 +64,7 @@ def test_super_init_called(self, mocker): def test_first_non_str_error(self): """Test that an exception is raised if column_lower not str.""" - with pytest.raises( - TypeError, match="BetweenDatesTransformer: column_lower should be str" - ): + with pytest.raises(TypeError, match="column_lower should be str"): BetweenDatesTransformer( column_lower=False, @@ -78,9 +76,7 @@ def test_first_non_str_error(self): def test_column_between_non_str_error(self): """Test that an exception is raised if column_between not str.""" - with pytest.raises( - TypeError, match="BetweenDatesTransformer: column_between should be str" - ): + with pytest.raises(TypeError, match="column_between should be str"): BetweenDatesTransformer( column_lower="a", @@ -92,9 +88,7 @@ def test_column_between_non_str_error(self): def test_column_upper_non_str_error(self): """Test that an exception is raised if column_upper not str.""" - with pytest.raises( - TypeError, match="BetweenDatesTransformer: column_upper should be str" - ): + with pytest.raises(TypeError, match="column_upper should be str"): 
BetweenDatesTransformer( column_lower="a", @@ -106,9 +100,7 @@ def test_column_upper_non_str_error(self): def test_new_column_name_non_str_error(self): """Test that an exception is raised if new_column_name not str.""" - with pytest.raises( - TypeError, match="BetweenDatesTransformer: new_column_name should be str" - ): + with pytest.raises(TypeError, match="new_column_name should be str"): BetweenDatesTransformer( column_lower="a", @@ -120,9 +112,7 @@ def test_new_column_name_non_str_error(self): def test_lower_inclusive_non_bool_error(self): """Test that an exception is raised if lower_inclusive not a bool.""" - with pytest.raises( - TypeError, match="BetweenDatesTransformer: lower_inclusive should be a bool" - ): + with pytest.raises(TypeError, match="lower_inclusive should be a bool"): BetweenDatesTransformer( column_lower="a", @@ -135,9 +125,7 @@ def test_lower_inclusive_non_bool_error(self): def test_upper_inclusive_non_bool_error(self): """Test that an exception is raised if upper_inclusive not a bool.""" - with pytest.raises( - TypeError, match="BetweenDatesTransformer: upper_inclusive should be a bool" - ): + with pytest.raises(TypeError, match="upper_inclusive should be a bool"): BetweenDatesTransformer( column_lower="a", @@ -281,8 +269,7 @@ def test_cols_not_datetime(self): ) with pytest.raises( - TypeError, - match=r"BetweenDatesTransformer: a should be datetime64\[ns\] type but got int64", + TypeError, match=r"a should be datetime64\[ns\] type but got int64" ): x.transform(df) diff --git a/tests/dates/test_DateDiffLeapYearTransformer.py b/tests/dates/test_DateDiffLeapYearTransformer.py index ff9f42e6..f42bedd4 100644 --- a/tests/dates/test_DateDiffLeapYearTransformer.py +++ b/tests/dates/test_DateDiffLeapYearTransformer.py @@ -20,9 +20,7 @@ def test_row_type_error(self): column_lower="a", column_upper="b", new_column_name="c", drop_cols=True ) - with pytest.raises( - TypeError, match="DateDiffLeapYearTransformer: row should be a pd.Series" - ): + with 
pytest.raises(TypeError, match="row should be a pd.Series"): date_transformer.calculate_age(row=row) @@ -52,7 +50,7 @@ def test_upper_column_type_error(self): with pytest.raises( TypeError, - match="DateDiffLeapYearTransformer: upper column values should be datetime.datetime or datetime.date objects", + match="upper column values should be datetime.datetime or datetime.date objects", ): date_transformer.calculate_age(row=row) @@ -67,7 +65,7 @@ def test_lower_column_type_error(self): with pytest.raises( TypeError, - match="DateDiffLeapYearTransformer: lower column values should be datetime.datetime or datetime.date objects", + match="lower column values should be datetime.datetime or datetime.date objects", ): date_transformer.calculate_age(row=row) @@ -151,9 +149,7 @@ def test_super_init_called(self, mocker): def test_column_lower_type_error(self): """Test that an exception is raised if column_lower is not a str.""" - with pytest.raises( - TypeError, match="DateDiffLeapYearTransformer: column_lower should be a str" - ): + with pytest.raises(TypeError, match="column_lower should be a str"): DateDiffLeapYearTransformer( column_lower=123, @@ -165,9 +161,7 @@ def test_column_lower_type_error(self): def test_column_upper_type_error(self): """Test that an exception is raised if column_upper is not a str.""" - with pytest.raises( - TypeError, match="DateDiffLeapYearTransformer: column_upper should be a str" - ): + with pytest.raises(TypeError, match="column_upper should be a str"): DateDiffLeapYearTransformer( column_lower="dummy_1", @@ -179,10 +173,7 @@ def test_column_upper_type_error(self): def test_new_column_name_type_error(self): """Test that an exception is raised if new_column_name is not a str.""" - with pytest.raises( - TypeError, - match="DateDiffLeapYearTransformer: new_column_name should be a str", - ): + with pytest.raises(TypeError, match="new_column_name should be a str"): DateDiffLeapYearTransformer( column_lower="dummy_1", @@ -194,9 +185,7 @@ def 
test_new_column_name_type_error(self): def test_drop_cols_type_error(self): """Test that an exception is raised if drop_cols is not a bool.""" - with pytest.raises( - TypeError, match="DateDiffLeapYearTransformer: drop_cols should be a bool" - ): + with pytest.raises(TypeError, match="drop_cols should be a bool"): DateDiffLeapYearTransformer( column_lower="dummy_1", @@ -210,7 +199,7 @@ def test_missing_replacement_type_error(self): with pytest.raises( TypeError, - match="DateDiffLeapYearTransformer: if not None, missing_replacement should be an int, float or string", + match="if not None, missing_replacement should be an int, float or string", ): DateDiffLeapYearTransformer( diff --git a/tests/dates/test_DateDifferenceTransformer.py b/tests/dates/test_DateDifferenceTransformer.py index 9a6d1ccd..4fbe92ae 100644 --- a/tests/dates/test_DateDifferenceTransformer.py +++ b/tests/dates/test_DateDifferenceTransformer.py @@ -76,9 +76,7 @@ def test_super_init_called(self, mocker): def test_column_lower_type_error(self): """Test that an exception is raised if column_lower is not a str.""" - with pytest.raises( - TypeError, match="DateDifferenceTransformer: column_lower must be a str" - ): + with pytest.raises(TypeError, match="column_lower must be a str"): DateDifferenceTransformer( column_lower=123, @@ -92,9 +90,7 @@ def test_column_lower_type_error(self): def test_column_2_type_error(self): """Test that an exception is raised if column_upper is not a str.""" - with pytest.raises( - TypeError, match="DateDifferenceTransformer: column_upper must be a str" - ): + with pytest.raises(TypeError, match="column_upper must be a str"): DateDifferenceTransformer( column_lower="dummy_1", @@ -108,9 +104,7 @@ def test_column_2_type_error(self): def test_new_column_name_type_error(self): """Test that an exception is raised if new_column_name is not a str.""" - with pytest.raises( - TypeError, match="DateDifferenceTransformer: new_column_name must be a str" - ): + with 
pytest.raises(TypeError, match="new_column_name must be a str"): DateDifferenceTransformer( column_lower="dummy_1", @@ -124,9 +118,7 @@ def test_new_column_name_type_error(self): def test_units_type_error(self): """Test that an exception is raised if new_column_name is not a str.""" - with pytest.raises( - TypeError, match="DateDifferenceTransformer: units must be a str" - ): + with pytest.raises(TypeError, match="units must be a str"): DateDifferenceTransformer( column_lower="dummy_1", @@ -142,7 +134,7 @@ def test_units_values_error(self): with pytest.raises( ValueError, - match=r"DateDifferenceTransformer: units must be one of \['Y', 'M', 'D', 'h', 'm', 's'\], got y", + match=r"units must be one of \['Y', 'M', 'D', 'h', 'm', 's'\], got y", ): DateDifferenceTransformer( diff --git a/tests/dates/test_DateTimeInfoExtractor.py b/tests/dates/test_DateTimeInfoExtractor.py deleted file mode 100644 index 5996771c..00000000 --- a/tests/dates/test_DateTimeInfoExtractor.py +++ /dev/null @@ -1,477 +0,0 @@ -import pytest -import re -import test_aide as ta -import tests.test_data as d - -import pandas as pd -import numpy as np - -import tubular -from tubular.dates import DatetimeInfoExtractor - - -@pytest.fixture -def timeofday_extractor(): - return DatetimeInfoExtractor(columns=["a"], include=["timeofday"]) - - -@pytest.fixture -def timeofmonth_extractor(): - return DatetimeInfoExtractor(columns=["a"], include=["timeofmonth"]) - - -@pytest.fixture -def timeofyear_extractor(): - return DatetimeInfoExtractor(columns=["a"], include=["timeofyear"]) - - -@pytest.fixture -def dayofweek_extractor(): - return DatetimeInfoExtractor(columns=["a"], include=["dayofweek"]) - - -class TestExtractDatetimeInfoInit(object): - def test_assert_inheritance(self): - """Test that ExtractDatetimeInfo inherits from BaseTransformer.""" - - x = DatetimeInfoExtractor(columns=["a"]) - - ta.classes.assert_inheritance(x, tubular.base.BaseTransformer) - - def test_arguments(self): - """Test that init has 
the expected arguments""" - - default_include = [ - "timeofday", - "timeofmonth", - "timeofyear", - "dayofweek", - ] - ta.functions.test_function_arguments( - func=DatetimeInfoExtractor.__init__, - expected_arguments=["self", "columns", "include", "datetime_mappings"], - expected_default_values=( - default_include, - {}, - ), - ) - - def test_super_init_called(self, mocker): - """Test that init calls BaseTransformer.init.""" - - expected_call_args = { - 0: { - "args": (), - "kwargs": {"columns": ["a"]}, - } - } - - with ta.functions.assert_function_call( - mocker, tubular.base.BaseTransformer, "__init__", expected_call_args - ): - - DatetimeInfoExtractor(columns=["a"]) - - def test_values_passed_in_init_set_to_attribute(self): - """Test that the values passed in init are saved in an attribute of the same name.""" - - x = DatetimeInfoExtractor( - columns=["a"], - include=["timeofmonth", "timeofday"], - datetime_mappings={"timeofday": {"am": range(0, 12), "pm": range(12, 24)}}, - ) - - ta.classes.test_object_attributes( - obj=x, - expected_attributes={ - "columns": ["a"], - "include": ["timeofmonth", "timeofday"], - "datetime_mappings": { - "timeofday": {"am": range(0, 12), "pm": range(12, 24)} - }, - }, - msg="Attributes for ExtractDatetimeInfo set in init", - ) - - def test_class_methods(self): - """Test that DatetimeInfoExtractor has fit and transform methods.""" - - x = DatetimeInfoExtractor(columns=["a"]) - - ta.classes.test_object_method( - obj=x, expected_method="_map_values", msg="_map_values" - ) - ta.classes.test_object_method( - obj=x, expected_method="transform", msg="transform" - ) - - @pytest.mark.parametrize("incorrect_type_include", [2, 3.0, "invalid", "dayofweek"]) - def test_error_when_include_not_list(self, incorrect_type_include): - """Test that an exception is raised when value include variable is not a list""" - - with pytest.raises( - TypeError, - match="include should be List", - ): - DatetimeInfoExtractor(columns=["a"], 
include=incorrect_type_include) - - def test_error_when_invalid_include_option(self): - """Test that an exception is raised when include contains incorrect values""" - - with pytest.raises( - ValueError, - match=r'elements in include should be in \["timeofday", "timeofmonth", "timeofyear", "dayofweek"\]', - ): - DatetimeInfoExtractor( - columns=["a"], include=["timeofday", "timeofmonth", "invalid_option"] - ) - - @pytest.mark.parametrize( - "incorrect_type_datetime_mappings", [2, 3.0, ["a", "b"], "dayofweek"] - ) - def test_error_when_datetime_mappings_not_dict( - self, incorrect_type_datetime_mappings - ): - """Test that an exception is raised when datetime_mappings is not a dict""" - - with pytest.raises( - TypeError, - match="datetime_mappings should be Dict", - ): - DatetimeInfoExtractor( - columns=["a"], datetime_mappings=incorrect_type_datetime_mappings - ) - - @pytest.mark.parametrize( - "incorrect_type_datetime_mappings_values", [{"timeofday": 2}] - ) - def test_error_when_datetime_mapping_value_not_dict( - self, incorrect_type_datetime_mappings_values - ): - """Test that an exception is raised when values in datetime_mappings are not dict""" - - with pytest.raises( - TypeError, - match="values in datetime_mappings should be dict", - ): - DatetimeInfoExtractor( - columns=["a"], datetime_mappings=incorrect_type_datetime_mappings_values - ) - - @pytest.mark.parametrize( - "include, incorrect_datetime_mappings_keys", - [ - (["timeofyear"], {"invalid_key": {"valid_mapping": "valid_output"}}), - (["timeofyear"], {"dayofweek": {"day": range(7)}}), - ( - ["timeofyear"], - {"timeofyear": {"month": range(12)}, "timeofday": {"hour": range(24)}}, - ), - ], - ) - def test_error_when_datetime_mapping_key_not_in_include( - self, include, incorrect_datetime_mappings_keys - ): - """Test that an exception is raised when keys in datetime_mappings are not in include""" - - with pytest.raises( - ValueError, - match="keys in datetime_mappings should be in include", - ): - 
DatetimeInfoExtractor( - columns=["a"], - include=include, - datetime_mappings=incorrect_datetime_mappings_keys, - ) - - @pytest.mark.parametrize( - "incomplete_mappings, expected_exception", - [ - ( - {"timeofday": {"mapped": range(23)}}, - re.escape( - "timeofday mapping dictionary should contain mapping for all hours between 0-23. {23} are missing" - ), - ), - ( - {"timeofmonth": {"mapped": range(1, 31)}}, - re.escape( - "timeofmonth mapping dictionary should contain mapping for all days between 1-31. {31} are missing" - ), - ), - ( - {"timeofyear": {"mapped": range(1, 12)}}, - re.escape( - "timeofyear mapping dictionary should contain mapping for all months between 1-12. {12} are missing" - ), - ), - ( - {"dayofweek": {"mapped": range(6)}}, - re.escape( - "dayofweek mapping dictionary should contain mapping for all days between 0-6. {6} are missing" - ), - ), - ], - ) - def test_error_when_incomplete_mappings_passed( - self, incomplete_mappings, expected_exception - ): - """Test that error is raised when incomplete mappings are passed""" - - with pytest.raises(ValueError, match=expected_exception): - DatetimeInfoExtractor(columns=["a"], datetime_mappings=incomplete_mappings) - - -class TestMapValues(object): - def test_arguments(self): - """Test that identify_timeofday has the expected arguments""" - - ta.functions.test_function_arguments( - func=DatetimeInfoExtractor._map_values, - expected_arguments=["self", "value", "interval"], - expected_default_values=None, - ) - - @pytest.mark.parametrize("incorrect_type_input", ["2", [1, 2]]) - def test_incorrect_type_input(self, incorrect_type_input, timeofday_extractor): - """Test that an error is raised if input is the wrong type""" - - with pytest.raises( - TypeError, match="DatetimeInfoExtractor: value should be float or int" - ): - - timeofday_extractor._map_values(incorrect_type_input, "timeofday") - - @pytest.mark.parametrize("incorrect_size_input", [-2, 30, 5.6, 11.2]) - def 
test_out_of_bounds_or_fractional_input( - self, incorrect_size_input, timeofday_extractor - ): - """Test that an error is raised when value is outside of 0-23 range""" - - with pytest.raises( - ValueError, - match="DatetimeInfoExtractor: value for timeofday mapping in self._map_values should be an integer value in 0-23", - ): - timeofday_extractor._map_values(incorrect_size_input, "timeofday") - - @pytest.mark.parametrize( - "valid_hour, hour_time_of_day", - [ - (0, "night"), - (5, "night"), - (6, "morning"), - (11, "morning"), - (12, "afternoon"), - (17, "afternoon"), - (18, "evening"), - (23, "evening"), - ], - ) - def test_valid_inputs_timeofday( - self, valid_hour, hour_time_of_day, timeofday_extractor - ): - """Trial test to check all in one go""" - - output = timeofday_extractor._map_values(valid_hour, "timeofday") - - assert output == hour_time_of_day, "expected {}, output {}".format( - hour_time_of_day, output - ) - - @pytest.mark.parametrize( - "valid_day, day_time_of_month", - [ - (1, "start"), - (6, "start"), - (10, "start"), - (11, "middle"), - (16, "middle"), - (20, "middle"), - (21, "end"), - (21, "end"), - (31, "end"), - ], - ) - def test_valid_inputs_timeofmonth( - self, valid_day, day_time_of_month, timeofmonth_extractor - ): - """Test that correct values are return with valid inputs""" - output = timeofmonth_extractor._map_values(valid_day, "timeofmonth") - assert output == day_time_of_month, "expected {}, output {}".format( - day_time_of_month, output - ) - - @pytest.mark.parametrize( - "valid_month, month_time_of_year", - [ - (1, "winter"), - (3, "spring"), - (4, "spring"), - (6, "summer"), - (7, "summer"), - (9, "autumn"), - (10, "autumn"), - (12, "winter"), - ], - ) - def test_valid_inputs_timeofyear( - self, valid_month, month_time_of_year, timeofyear_extractor - ): - """Test that correct values are return with valid inputs""" - output = timeofyear_extractor._map_values(valid_month, "timeofyear") - assert output == month_time_of_year, 
"expected {}, output {}".format( - month_time_of_year, output - ) - - @pytest.mark.parametrize( - "valid_day, dayofweek", - [ - (0, "monday"), - (2, "wednesday"), - (4, "friday"), - (6, "sunday"), - ], - ) - def test_valid_inputs_dayofweek(self, valid_day, dayofweek, dayofweek_extractor): - """Test that correct values are return with valid inputs""" - output = dayofweek_extractor._map_values(valid_day, "dayofweek") - assert output == dayofweek, "expected {}, output {}".format(dayofweek, output) - - def test_valid_nan_output(self, timeofday_extractor): - """Test that correct values are return with valid inputs""" - output = timeofday_extractor._map_values(np.nan, "timeofday") - print(output) - assert np.isnan( - output - ), f"passing np.nan should result in np.nan, instead received {output}" - - -class TestTransform(object): - def test_arguments(self): - """Test that init has the expected arguments""" - - ta.functions.test_function_arguments( - func=DatetimeInfoExtractor.transform, - expected_arguments=["self", "X"], - expected_default_values=None, - ) - - def test_super_transform_called(self, mocker): - """Test that init calls BaseTransformer.init.""" - - df = d.create_date_test_df() - df = df.astype("datetime64[ns]") - - expected_call_args = { - 0: { - "args": (df,), - "kwargs": {}, - } - } - - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "transform", - expected_call_args, - return_value=df, - ): - - x = DatetimeInfoExtractor(columns=["a"], include=["dayofweek"]) - - x.transform(df) - - def test_non_datetime_column(self): - """Test that error is raised if input columns do not contain datetime values""" - - df = d.create_df_1() # Mix of int values - - x = DatetimeInfoExtractor(columns=["a"], include=["dayofweek"]) - - with pytest.raises( - TypeError, match="values in {} should be datetime".format("a") - ): - x.transform(df), - - def test_correct_col_returned(self): - """Test that the added column is correct""" - - df = 
d.create_date_test_df() - df = df.astype("datetime64[ns]") - - x = DatetimeInfoExtractor(columns=["b"], include=["timeofyear"]) - transformed = x.transform(df) - - expected_output = pd.Series( - [ - "spring", - "winter", - "autumn", - "autumn", - "autumn", - "autumn", - "autumn", - "summer", - ], - name="b_timeofyear", - ) - - ta.equality.assert_series_equal_msg( - transformed["b_timeofyear"], - expected_output, - "incorrect series returned", - print_actual_and_expected=True, - ) - - def test_map_values_calls(self, mocker): - """Test all intermediary methods are being called correct number of times""" - - # df is 8 rows long so each intermediate function must have 8 calls - df = d.create_date_test_df() - df = df.astype("datetime64[ns]") - - mocked_map_values = mocker.spy(DatetimeInfoExtractor, "_map_values") - - x = DatetimeInfoExtractor( - columns=["b"], - include=["timeofday", "timeofyear", "timeofmonth", "dayofweek"], - ) - x.transform(df) - - assert mocked_map_values.call_count == 32 - - def test_correct_df_returned(self): - """Test that correct df is returned after transformation""" - - df = d.create_date_test_df() - df.loc[0, "b"] = np.nan - df = df.astype("datetime64[ns]") - - x = DatetimeInfoExtractor(columns=["b"], include=["timeofmonth", "timeofyear"]) - transformed = x.transform(df) - - expected = df.copy() - expected["b_timeofmonth"] = [ - np.nan, - "end", - "start", - "start", - "start", - "start", - "start", - "end", - ] - expected["b_timeofyear"] = [ - np.nan, - "winter", - "autumn", - "autumn", - "autumn", - "autumn", - "autumn", - "summer", - ] - - ta.equality.assert_frame_equal_msg( - transformed, expected, "incorrect dataframe returned" - ) diff --git a/tests/dates/test_DatetimeSinusoidCalculator.py b/tests/dates/test_DatetimeSinusoidCalculator.py deleted file mode 100644 index 54e529b0..00000000 --- a/tests/dates/test_DatetimeSinusoidCalculator.py +++ /dev/null @@ -1,326 +0,0 @@ -import pytest -import tests.test_data as d -import test_aide as ta 
-import re - -import tubular -from tubular.dates import DatetimeSinusoidCalculator -import pandas as pd -import numpy as np - - -@pytest.fixture(scope="module", autouse=True) -def example_transformer(): - - return DatetimeSinusoidCalculator("a", "cos", "hour", 24) - - -class TestDatetimeSinusoidCalculatorInit(object): - """Tests for DateDifferenceTransformer.init().""" - - def test_arguments(self): - """Test that init has expected arguments.""" - - ta.functions.test_function_arguments( - func=DatetimeSinusoidCalculator.__init__, - expected_arguments=[ - "self", - "columns", - "method", - "units", - "period", - ], - expected_default_values=(2 * np.pi,), - ) - - def test_class_methods(self, example_transformer): - """Test that DateDifferenceTransformer has a transform method.""" - - ta.classes.test_object_method( - obj=example_transformer, expected_method="transform", msg="transform" - ) - - def test_inheritance(self, example_transformer): - """Test that DateDifferenceTransformer inherits from BaseTransformer.""" - - ta.classes.assert_inheritance(example_transformer, tubular.base.BaseTransformer) - - def test_super_init_called(self, mocker): - """Test that init calls BaseTransformer.init.""" - - expected_call_args = { - 0: { - "args": ("a",), - "kwargs": { - "copy": True, - }, - } - } - - with ta.functions.assert_function_call( - mocker, tubular.base.BaseTransformer, "__init__", expected_call_args - ): - - DatetimeSinusoidCalculator( - "a", - "cos", - "hour", - 24, - ) - - @pytest.mark.parametrize("incorrect_type_method", [2, 2.0, True, {"a": 4}]) - def test_method_type_error(self, incorrect_type_method): - """Test that an exception is raised if method is not a str.""" - - with pytest.raises( - TypeError, - match="method must be a string or list but got {}".format( - type(incorrect_type_method) - ), - ): - - DatetimeSinusoidCalculator( - "a", - incorrect_type_method, - "hour", - 24, - ) - - @pytest.mark.parametrize("incorrect_type_units", [2, 2.0, True, {"a": 4}, 
["help"]]) - def test_units_type_error(self, incorrect_type_units): - """Test that an exception is raised if units is not a str.""" - - with pytest.raises( - TypeError, - match="units must be a string but got {}".format( - type(incorrect_type_units) - ), - ): - - DatetimeSinusoidCalculator( - "a", - "cos", - incorrect_type_units, - 24, - ) - - @pytest.mark.parametrize("incorrect_type_period", ["2", {"a": 4}, ["help"]]) - def test_period_type_error(self, incorrect_type_period): - """Test that an error is raised if period is not an int or a float""" - - with pytest.raises( - TypeError, - match="period must be a int or float but got {}".format( - type(incorrect_type_period) - ), - ): - - DatetimeSinusoidCalculator( - "a", - "cos", - "hour", - incorrect_type_period, - ) - - def test_valid_method_value_error(self): - """Test that a value error is raised if method is not sin, cos or a list containing both.""" - method = "tan" - - with pytest.raises( - ValueError, - match='Invalid method {} supplied, should be "sin", "cos" or a list containing both'.format( - method - ), - ): - - DatetimeSinusoidCalculator( - "a", - method, - "year", - 24, - ) - - def test_valid_units_value_error(self): - """Test that a value error is raised if the unit supplied is not in the valid units list.""" - units = "five" - valid_unit_list = [ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - ] - - with pytest.raises( - ValueError, - match=re.escape( - "Invalid units {} supplied, should be in {}".format( - units, valid_unit_list - ) - ), - ): - - DatetimeSinusoidCalculator( - "a", - "cos", - units, - 24, - ) - - def test_attributes(self, example_transformer): - """Test that the value passed for new_column_name and units are saved in attributes of the same name.""" - - ta.classes.test_object_attributes( - obj=example_transformer, - expected_attributes={ - "columns": ["a"], - "units": "hour", - "period": 24, - }, - msg="Attributes for DateDifferenceTransformer set in 
init", - ) - - -class TestDatetimeSinusoidCalculatorTransform(object): - def test_arguments(self): - """Test that transform has expected arguments.""" - - ta.functions.test_function_arguments( - func=DatetimeSinusoidCalculator.transform, - expected_arguments=["self", "X"], - expected_default_values=None, - ) - - def test_datetime_type_error(self): - """Tests that an error is raised if the column passed to the transformer is not a datetime column.""" - not_datetime = pd.DataFrame({"a": [1, 2, 3]}) - column = "a" - message = re.escape( - f"{column} should be datetime64[ns] type but got {not_datetime[column].dtype}" - ) - with pytest.raises(TypeError, match=message): - - x = DatetimeSinusoidCalculator( - "a", - "cos", - "year", - 24, - ) - - x.transform(not_datetime) - - def test_BaseTransformer_transform_called(self, example_transformer, mocker): - - test_data = d.create_datediff_test_df() - - expected_call_args = {0: {"args": (test_data,), "kwargs": {}}} - - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "transform", - expected_call_args, - return_value=test_data, - ): - - example_transformer.transform(test_data) - - def test_cos_called_with_correct_args(self, mocker): - - """Tests that the correct numpy method is called on the correct column - also implicitly checks that the column has been transformed - into the correct units through the value of the argument.""" - - method = "cos" - - data = d.create_datediff_test_df() - column_in_desired_unit = data["a"].dt.month - cos_argument = column_in_desired_unit * (2.0 * np.pi / 12) - - spy = mocker.spy(np, method) - - x = DatetimeSinusoidCalculator( - "a", - "cos", - "month", - 12, - ) - x.transform(data) - - # pull out positional args to target the call - - call_args = spy.call_args_list[0][0] - - # test positional args are as expected - ta.equality.assert_list_tuple_equal_msg( - actual=call_args, - expected=(cos_argument,), - msg_tag=f"""Positional arg assert for {method}""", - ) - 
- @pytest.mark.parametrize( - "transformer", - [ - DatetimeSinusoidCalculator( - "a", - "cos", - "month", - 12, - ), - DatetimeSinusoidCalculator( - [ - "a", - "b", - ], - "cos", - "month", - 12, - ), - ], - ) - def test_expected_output_single_method(self, transformer): - - expected = d.create_datediff_test_df() - for column in transformer.columns: - column_in_desired_unit = expected[column].dt.month - cos_argument = column_in_desired_unit * (2.0 * np.pi / 12) - new_col_name = "cos_" + column - expected[new_col_name] = cos_argument.apply(np.cos) - - x = transformer - actual = x.transform(d.create_datediff_test_df()) - ta.equality.assert_frame_equal_msg( - actual=actual, - expected=expected, - msg_tag="DatetimeSinusoidCalculator transformer does not produce the expected output", - ) - - def test_expected_output_both_methods(self): - - expected = d.create_datediff_test_df() - - transformer = DatetimeSinusoidCalculator( - [ - "a", - "b", - ], - ["sin", "cos"], - "month", - 12, - ) - - for column in transformer.columns: - column_in_desired_unit = expected[column].dt.month - method_ready_column = column_in_desired_unit * (2.0 * np.pi / 12) - new_cos_col_name = "cos_" + column - new_sin_col_name = "sin_" + column - expected[new_sin_col_name] = method_ready_column.apply(np.sin) - expected[new_cos_col_name] = method_ready_column.apply(np.cos) - - actual = transformer.transform(d.create_datediff_test_df()) - ta.equality.assert_frame_equal_msg( - actual=actual, - expected=expected, - msg_tag="DatetimeSinusoidCalculator transformer does not produce the expected output", - ) diff --git a/tests/dates/test_SeriesDtMethodTransformer.py b/tests/dates/test_SeriesDtMethodTransformer.py index 141d2dc1..3543ea62 100644 --- a/tests/dates/test_SeriesDtMethodTransformer.py +++ b/tests/dates/test_SeriesDtMethodTransformer.py @@ -68,7 +68,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"SeriesDtMethodTransformer: column should be a str but got \", + 
match=r"column should be a str but got \", ): SeriesDtMethodTransformer( @@ -77,14 +77,14 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"SeriesDtMethodTransformer: unexpected type \(\\) for pd_method_name, expecting str", + match=r"unexpected type \(\\) for pd_method_name, expecting str", ): SeriesDtMethodTransformer(new_column_name="a", pd_method_name=1, column="b") with pytest.raises( TypeError, - match=r"SeriesDtMethodTransformer: unexpected type \(\\) for new_column_name, must be str", + match=r"unexpected type \(\\) for new_column_name, must be str", ): SeriesDtMethodTransformer( @@ -93,7 +93,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"""SeriesDtMethodTransformer: pd_method_kwargs should be a dict but got type \""", + match=r"""pd_method_kwargs should be a dict but got type \""", ): SeriesDtMethodTransformer( @@ -105,7 +105,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"""SeriesDtMethodTransformer: unexpected type \(\\) for pd_method_kwargs key in position 1, must be str""", + match=r"""unexpected type \(\\) for pd_method_kwargs key in position 1, must be str""", ): SeriesDtMethodTransformer( @@ -120,7 +120,7 @@ def test_exception_raised_non_pandas_method_passed(self): with pytest.raises( AttributeError, - match="""SeriesDtMethodTransformer: error accessing "dt.b" method on pd.Series object - pd_method_name should be a pd.Series.dt method""", + match="""error accessing "dt.b" method on pd.Series object - pd_method_name should be a pd.Series.dt method""", ): SeriesDtMethodTransformer( diff --git a/tests/dates/test_ToDatetimeTransformer.py b/tests/dates/test_ToDatetimeTransformer.py index f00089b1..c49a2670 100644 --- a/tests/dates/test_ToDatetimeTransformer.py +++ b/tests/dates/test_ToDatetimeTransformer.py @@ -70,7 +70,7 @@ def test_column_type_error(self): with pytest.raises( TypeError, - match="ToDatetimeTransformer: column should be a 
single str giving the column to transform to datetime", + match="column should be a single str giving the column to transform to datetime", ): ToDatetimeTransformer( @@ -81,9 +81,7 @@ def test_column_type_error(self): def test_new_column_name_type_error(self): """Test that an exception is raised if new_column_name is not a str.""" - with pytest.raises( - TypeError, match="ToDatetimeTransformer: new_column_name must be a str" - ): + with pytest.raises(TypeError, match="new_column_name must be a str"): ToDatetimeTransformer(column="b", new_column_name=1) @@ -92,7 +90,7 @@ def test_to_datetime_kwargs_type_error(self): with pytest.raises( TypeError, - match=r"""ToDatetimeTransformer: to_datetime_kwargs should be a dict but got type \""", + match=r"""to_datetime_kwargs should be a dict but got type \""", ): ToDatetimeTransformer(column="b", new_column_name="a", to_datetime_kwargs=1) @@ -102,7 +100,7 @@ def test_to_datetime_kwargs_key_type_error(self): with pytest.raises( TypeError, - match=r"""ToDatetimeTransformer: unexpected type \(\\) for to_datetime_kwargs key in position 1, must be str""", + match=r"""unexpected type \(\\) for to_datetime_kwargs key in position 1, must be str""", ): ToDatetimeTransformer( diff --git a/tests/imputers/test_ArbitraryImputer.py b/tests/imputers/test_ArbitraryImputer.py index d6e966e5..01341323 100644 --- a/tests/imputers/test_ArbitraryImputer.py +++ b/tests/imputers/test_ArbitraryImputer.py @@ -51,8 +51,7 @@ def test_columns_none_error(self): """Test that an exception is raised if columns is passed as None.""" with pytest.raises( - ValueError, - match="ArbitraryImputer: columns must be specified in init for ArbitraryImputer", + ValueError, match="columns must be specified in init for ArbitraryImputer" ): ArbitraryImputer(impute_value=1, columns=None) @@ -61,8 +60,7 @@ def test_impute_value_type_error(self): """Test that an exception is raised if impute_value is not an int, float or str.""" with pytest.raises( - ValueError, - 
match="ArbitraryImputer: impute_value should be a single value .*", + ValueError, match="impute_value should be a single value .*" ): ArbitraryImputer(impute_value={}, columns="a") diff --git a/tests/imputers/test_MeanImputer.py b/tests/imputers/test_MeanImputer.py index 6e69704e..addf0d11 100644 --- a/tests/imputers/test_MeanImputer.py +++ b/tests/imputers/test_MeanImputer.py @@ -16,8 +16,8 @@ def test_arguments(self): ta.functions.test_function_arguments( func=MeanImputer.__init__, - expected_arguments=["self", "columns", "weight"], - expected_default_values=(None, None), + expected_arguments=["self", "columns"], + expected_default_values=(None,), ) def test_class_methods(self): @@ -51,17 +51,6 @@ def test_super_init_called(self, mocker): MeanImputer(columns=None, verbose=True, copy=True) - @pytest.mark.parametrize("weight", (0, ["a"], {"a": 10})) - def test_weight_arg_errors(self, weight): - """Test that appropriate errors are throw for bad weight arg""" - - with pytest.raises( - TypeError, - match="weight should be str or None", - ): - - MeanImputer(columns=None, weight=weight) - class TestFit(object): """Tests for MeanImputer.fit()""" @@ -90,24 +79,6 @@ def test_super_fit_called(self, mocker): x.fit(df) - def test_check_weights_column_called(self, mocker): - """Test that fit calls BaseTransformer.check_weights_column - when weights are used.""" - - df = d.create_df_9() - - x = MeanImputer(columns=["a", "b"], weight="c") - - expected_call_args = {0: {"args": (d.create_df_9(), "c"), "kwargs": {}}} - - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "check_weights_column", - expected_call_args, - ): - - x.fit(df) - def test_learnt_values(self): """Test that the impute values learnt during fit are expected.""" @@ -129,26 +100,6 @@ def test_learnt_values(self): msg="impute_values_ attribute", ) - def test_learnt_values_weighted(self): - """Test that the impute values learnt during fit are expected - when weights are used.""" - - 
df = d.create_df_9() - - x = MeanImputer(columns=["a", "b"], weight="c") - - x.fit(df) - - ta.classes.test_object_attributes( - obj=x, - expected_attributes={ - "impute_values_": { - "a": np.float64((3 + 4 + 16 + 36) / (3 + 2 + 4 + 6)), - "b": np.float64((10 + 4 + 12 + 10 + 6) / (2 + 1 + 4 + 5 + 6)), - } - }, - msg="impute_values_ attribute", - ) - def test_fit_returns_self(self): """Test fit returns self?""" @@ -160,17 +111,6 @@ def test_fit_returns_self(self): assert x_fitted is x, "Returned value from MeanImputer.fit not as expected." - def test_fit_returns_self_weighted(self): - """Test fit returns self - when weight is used""" - - df = d.create_df_9() - - x = MeanImputer(columns="a", weight="c") - - x_fitted = x.fit(df) - - assert x_fitted is x, "Returned value from MeanImputer.fit not as expected." - def test_fit_not_changing_data(self): """Test fit does not change X.""" @@ -186,21 +126,6 @@ def test_fit_not_changing_data(self): msg="Check X not changing during fit", ) - def test_fit_not_changing_data_weighted(self): - """Test fit does not change X - when weights are used.""" - - df = d.create_df_9() - - x = MeanImputer(columns="a", weight="c") - - x.fit(df) - - ta.equality.assert_equal_dispatch( - expected=d.create_df_9(), - actual=df, - msg="Check X not changing during fit", - ) - class TestTransform(object): """Tests for MeanImputer.transform().""" @@ -239,17 +164,6 @@ def expected_df_2(): return df - def expected_df_3(): - """Expected output for test_nulls_imputed_correctly_3.""" - - df = d.create_df_9() - - for col, value in zip(["a", "b"], [59 / 15, 42 / 18]): - - df[col].loc[df[col].isnull()] = value - - return df - def test_arguments(self): """Test that transform has expected arguments.""" @@ -331,27 +245,6 @@ def test_nulls_imputed_correctly_2(self, df, expected): msg="Check nulls filled correctly in transform", ) - @pytest.mark.parametrize( - "df, expected", - ta.pandas.row_by_row_params(d.create_df_9(), expected_df_3()) - + 
ta.pandas.index_preserved_params(d.create_df_9(), expected_df_3()), - ) - def test_nulls_imputed_correctly_3(self, df, expected): - """Test missing values are filled with the correct values - and unrelated columns are not changed.""" - - x = MeanImputer(columns=["a", "b"], weight="c") - - # set the impute values dict directly rather than fitting x on df so test works with decorators - x.impute_values_ = {"a": 59 / 15, "b": 42 / 18} - - df_transformed = x.transform(df) - - ta.equality.assert_equal_dispatch( - expected=expected, - actual=df_transformed, - msg="Check nulls filled correctly in transform", - ) - def test_learnt_values_not_modified(self): """Test that the impute_values_ from fit are not changed in transform.""" @@ -370,22 +263,3 @@ def test_learnt_values_not_modified(self): actual=x2.impute_values_, msg="Impute values not changed in transform", ) - - def test_learnt_values_not_modified_weights(self): - """Test that the impute_values_ from fit are not changed in transform - when using weights.""" - - df = d.create_df_9() - - x = MeanImputer(columns=["a", "b"], weight="c") - - x.fit(df) - - x2 = MeanImputer(columns=["a", "b"], weight="c") - - x2.fit_transform(df) - - ta.equality.assert_equal_dispatch( - expected=x.impute_values_, - actual=x2.impute_values_, - msg="Impute values not changed in transform", - ) diff --git a/tests/imputers/test_MedianImputer.py b/tests/imputers/test_MedianImputer.py index 1ee5d506..12d20b92 100644 --- a/tests/imputers/test_MedianImputer.py +++ b/tests/imputers/test_MedianImputer.py @@ -16,8 +16,8 @@ def test_arguments(self): ta.functions.test_function_arguments( func=MedianImputer.__init__, - expected_arguments=["self", "columns", "weight"], - expected_default_values=(None, None), + expected_arguments=["self", "columns"], + expected_default_values=(None,), ) def test_class_methods(self): @@ -51,17 +51,6 @@ def test_super_init_called(self, mocker): MedianImputer(columns=None, verbose=True, copy=True) - 
@pytest.mark.parametrize("weight", (0, ["a"], {"a": 10})) - def test_weight_arg_errors(self, weight): - """Test that appropriate errors are throw for bad weight arg""" - - with pytest.raises( - TypeError, - match="weight should be str or None", - ): - - MedianImputer(columns=None, weight=weight) - class TestFit(object): """Tests for MedianImputer.fit()""" @@ -90,24 +79,6 @@ def test_super_fit_called(self, mocker): x.fit(df) - def test_check_weights_column_called(self, mocker): - """Test that fit calls BaseTransformer.check_weights_column - when weights are used.""" - - df = d.create_df_9() - - x = MedianImputer(columns=["a", "b"], weight="c") - - expected_call_args = {0: {"args": (d.create_df_9(), "c"), "kwargs": {}}} - - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "check_weights_column", - expected_call_args, - ): - - x.fit(df) - def test_learnt_values(self): """Test that the impute values learnt during fit are expected.""" @@ -129,32 +100,6 @@ def test_learnt_values(self): msg="impute_values_ attribute", ) - def test_learnt_values_weighted(self): - """Test that the impute values learnt during fit are expected - when using weights.""" - - df = d.create_df_9() - - df = pd.DataFrame( - { - "a": [1, 2, 4, 6], - "c": [3, 2, 4, 6], - } - ) - - x = MedianImputer(columns=["a"], weight="c") - - x.fit(df) - - ta.classes.test_object_attributes( - obj=x, - expected_attributes={ - "impute_values_": { - "a": np.int64(4), - } - }, - msg="impute_values_ attribute", - ) - def test_fit_returns_self(self): """Test fit returns self?""" @@ -166,17 +111,6 @@ def test_fit_returns_self(self): assert x_fitted is x, "Returned value from MedianImputer.fit not as expected." - def test_fit_returns_self_weighted(self): - """Test fit returns self?""" - - df = d.create_df_9() - - x = MedianImputer(columns="a", weight="c") - - x_fitted = x.fit(df) - - assert x_fitted is x, "Returned value from MedianImputer.fit not as expected." 
- def test_fit_not_changing_data(self): """Test fit does not change X.""" @@ -192,21 +126,6 @@ def test_fit_not_changing_data(self): msg="Check X not changing during fit", ) - def test_fit_not_changing_data_weighted(self): - """Test fit does not change X.""" - - df = d.create_df_9() - - x = MedianImputer(columns="a", weight="c") - - x.fit(df) - - ta.equality.assert_equal_dispatch( - expected=d.create_df_9(), - actual=df, - msg="Check X not changing during fit", - ) - class TestTransform(object): """Tests for MedianImputer.transform().""" @@ -245,17 +164,6 @@ def expected_df_2(): return df - def expected_df_3(): - """Expected output for test_nulls_imputed_correctly_3.""" - - df = d.create_df_9() - - for col in ["a"]: - - df[col].loc[df[col].isnull()] = 4 - - return df - def test_arguments(self): """Test that transform has expected arguments.""" @@ -337,28 +245,6 @@ def test_nulls_imputed_correctly_2(self, df, expected): msg="Check nulls filled correctly in transform", ) - @pytest.mark.parametrize( - "df, expected", - ta.pandas.row_by_row_params(d.create_df_9(), expected_df_3()) - + ta.pandas.index_preserved_params(d.create_df_9(), expected_df_3()), - ) - def test_nulls_imputed_correctly_3(self, df, expected): - """Test missing values are filled with the correct values - and unrelated columns are not changed - (when weight is used).""" - - x = MedianImputer(columns=["a"], weight="c") - - # set the impute values dict directly rather than fitting x on df so test works with helpers - x.impute_values_ = {"a": 4} - - df_transformed = x.transform(df) - - ta.equality.assert_equal_dispatch( - expected=expected, - actual=df_transformed, - msg="Check nulls filled correctly in transform", - ) - def test_learnt_values_not_modified(self): """Test that the impute_values_ from fit are not changed in transform.""" @@ -377,22 +263,3 @@ def test_learnt_values_not_modified(self): actual=x2.impute_values_, msg="Impute values not changed in transform", ) - - def 
test_learnt_values_not_modified_weights(self): - """Test that the impute_values_ from fit are not changed in transform - when using weights.""" - - df = d.create_df_9() - - x = MedianImputer(columns=["a", "b"], weight="c") - - x.fit(df) - - x2 = MedianImputer(columns=["a", "b"], weight="c") - - x2.fit_transform(df) - - ta.equality.assert_equal_dispatch( - expected=x.impute_values_, - actual=x2.impute_values_, - msg="Impute values not changed in transform", - ) diff --git a/tests/imputers/test_ModeImputer.py b/tests/imputers/test_ModeImputer.py index be79505b..a7491c74 100644 --- a/tests/imputers/test_ModeImputer.py +++ b/tests/imputers/test_ModeImputer.py @@ -16,11 +16,8 @@ def test_arguments(self): ta.functions.test_function_arguments( func=ModeImputer.__init__, - expected_arguments=["self", "columns", "weight"], - expected_default_values=( - None, - None, - ), + expected_arguments=["self", "columns"], + expected_default_values=(None,), ) def test_class_methods(self): @@ -54,17 +51,6 @@ def test_super_init_called(self, mocker): ModeImputer(columns=None, verbose=True, copy=True) - @pytest.mark.parametrize("weight", (0, ["a"], {"a": 10})) - def test_weight_arg_errors(self, weight): - """Test that appropriate errors are thrown for bad weight arg""" - - with pytest.raises( - ValueError, - match="ModeImputer: weight should be a string or None", - ): - - ModeImputer(columns=None, weight=weight) - class TestFit(object): """Tests for ModeImputer.fit()""" @@ -93,24 +79,6 @@ def test_super_fit_called(self, mocker): x.fit(df) - def test_check_weights_column_called(self, mocker): - """Test that fit calls BaseTransformer.check_weights_column - when weights are used.""" - - df = d.create_df_9() - - x = ModeImputer(columns=["a", "b"], weight="c") - - expected_call_args = {0: {"args": (d.create_df_9(), "c"), "kwargs": {}}} - - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "check_weights_column", - expected_call_args, - ): - - x.fit(df) - def 
test_learnt_values(self): """Test that the impute values learnt during fit are expected.""" @@ -132,28 +100,6 @@ def test_learnt_values(self): msg="impute_values_ attribute", ) - def test_learnt_values_weighted_df(self): - """Test that the impute values learnt during fit are expected when df is weighted.""" - - df = d.create_weighted_imputers_test_df() - - x = ModeImputer(columns=["a", "b", "c", "d"], weight="weight") - - x.fit(df) - - ta.classes.test_object_attributes( - obj=x, - expected_attributes={ - "impute_values_": { - "a": np.float64(5.0), - "b": "e", - "c": "f", - "d": np.float64(1.0), - } - }, - msg="impute_values_ attribute", - ) - def test_fit_returns_self(self): """Test fit returns self?""" @@ -165,17 +111,6 @@ def test_fit_returns_self(self): assert x_fitted is x, "Returned value from ModeImputer.fit not as expected." - def test_fit_returns_self_weighted(self): - """Test fit returns self?""" - - df = d.create_df_9() - - x = ModeImputer(columns="a", weight="c") - - x_fitted = x.fit(df) - - assert x_fitted is x, "Returned value from ModeImputer.fit not as expected." 
- def test_fit_not_changing_data(self): """Test fit does not change X.""" @@ -191,49 +126,6 @@ def test_fit_not_changing_data(self): msg="Check X not changing during fit", ) - def test_fit_not_changing_data_weighted(self): - """Test fit does not change X - when weights are used.""" - - df = d.create_df_9() - - x = ModeImputer(columns="a", weight="c") - - x.fit(df) - - ta.equality.assert_equal_dispatch( - expected=d.create_df_9(), - actual=df, - msg="Check X not changing during fit", - ) - - def expected_df_nan(): - df = pd.DataFrame({"a": ["NaN", "NaN", "NaN"], "b": [None, None, None]}) - return df - - @pytest.mark.parametrize( - "df, expected", - ta.pandas.row_by_row_params( - pd.DataFrame({"a": [np.nan, np.nan, np.nan], "b": [None, None, None]}), - expected_df_nan(), - ) - + ta.pandas.index_preserved_params( - pd.DataFrame({"a": [np.nan, np.nan, np.nan], "b": [None, None, None]}), - expected_df_nan(), - ), - ) - def test_warning_mode_is_nan(self, df, expected): - """Test that warning is raised when mode is NaN""" - - x = ModeImputer(columns=["a", "b"]) - - with pytest.warns(Warning, match="ModeImputer: The Mode of column a is NaN."): - - x.fit(df) - - with pytest.warns(Warning, match="ModeImputer: The Mode of column b is NaN."): - - x.fit(df) - class TestTransform(object): """Tests for ModeImputer.transform().""" @@ -272,17 +164,6 @@ def expected_df_2(): return df - def expected_df_3(): - """Expected output for test_nulls_imputed_correctly_3.""" - - df = d.create_df_9() - - for col in ["a"]: - - df[col].loc[df[col].isnull()] = 6 - - return df - def test_arguments(self): """Test that transform has expected arguments.""" @@ -364,28 +245,6 @@ def test_nulls_imputed_correctly_2(self, df, expected): msg="Check nulls filled correctly in transform", ) - @pytest.mark.parametrize( - "df, expected", - ta.pandas.row_by_row_params(d.create_df_9(), expected_df_3()) - + ta.pandas.index_preserved_params(d.create_df_9(), expected_df_3()), - ) - def 
test_nulls_imputed_correctly_3(self, df, expected): - """Test missing values are filled with the correct values - and unrelated columns are not changed - (when weight is used).""" - - x = ModeImputer(columns=["a"], weight="c") - - # set the impute values dict directly rather than fitting x on df so test works with helpers - x.impute_values_ = {"a": 6} - - df_transformed = x.transform(df) - - ta.equality.assert_equal_dispatch( - expected=expected, - actual=df_transformed, - msg="Check nulls filled correctly in transform", - ) - def test_learnt_values_not_modified(self): """Test that the impute_values_ from fit are not changed in transform.""" @@ -404,22 +263,3 @@ def test_learnt_values_not_modified(self): actual=x2.impute_values_, msg="Impute values not changed in transform", ) - - def test_learnt_values_not_modified_weights(self): - """Test that the impute_values_ from fit are not changed in transform - when using weights.""" - - df = d.create_df_9() - - x = ModeImputer(columns=["a", "b"], weight="c") - - x.fit(df) - - x2 = ModeImputer(columns=["a", "b"], weight="c") - - x2.fit_transform(df) - - ta.equality.assert_equal_dispatch( - expected=x.impute_values_, - actual=x2.impute_values_, - msg="Impute values not changed in transform", - ) diff --git a/tests/imputers/test_NearestMeanResponseImputer.py b/tests/imputers/test_NearestMeanResponseImputer.py index 60152128..88c3c593 100644 --- a/tests/imputers/test_NearestMeanResponseImputer.py +++ b/tests/imputers/test_NearestMeanResponseImputer.py @@ -23,7 +23,7 @@ def test_arguments(self): def test_class_methods(self): """Test that NearestMeanResponseImputer has fit and transform methods.""" - x = NearestMeanResponseImputer(columns=None) + x = NearestMeanResponseImputer(response_column="c", columns=None) ta.classes.test_object_method(obj=x, expected_method="fit", msg="fit") @@ -94,9 +94,7 @@ def test_null_values_in_response_error(self): x = NearestMeanResponseImputer(columns=["a", "b"]) - with pytest.raises( - 
ValueError, match="NearestMeanResponseImputer: y has 1 null values" - ): + with pytest.raises(ValueError, match="y has 1 null values"): x.fit(df, df["c"]) @@ -111,7 +109,7 @@ def test_columns_with_no_nulls_error(self): with pytest.raises( ValueError, - match="NearestMeanResponseImputer: Column a has no missing values, cannot use this transformer.", + match="Column a has no missing values, cannot use this transformer.", ): x.fit(df, df["c"]) diff --git a/tests/mapping/test_BaseMappingTransformer.py b/tests/mapping/test_BaseMappingTransformer.py index c8c4b7a5..bd5dac63 100644 --- a/tests/mapping/test_BaseMappingTransformer.py +++ b/tests/mapping/test_BaseMappingTransformer.py @@ -50,9 +50,7 @@ def test_super_init_called(self, mocker): def test_no_keys_dict_error(self): """Test that an exception is raised if mappings is a dict but with no keys.""" - with pytest.raises( - ValueError, match="BaseMappingTransformer: mappings has no values" - ): + with pytest.raises(ValueError, match="mappings has no values"): BaseMappingTransformer(mappings={}) @@ -60,8 +58,7 @@ def test_mappings_contains_non_dict_items_error(self): """Test that an exception is raised if mappings contains non-dict items.""" with pytest.raises( - ValueError, - match="BaseMappingTransformer: values in mappings dictionary should be dictionaries", + ValueError, match="values in mappings dictionary should be dictionaries" ): BaseMappingTransformer(mappings={"a": {"a": 1}, "b": 1}) @@ -69,9 +66,7 @@ def test_mappings_contains_non_dict_items_error(self): def test_mappings_not_dict_error(self): """Test that an exception is raised if mappings is not a dict.""" - with pytest.raises( - ValueError, match="BaseMappingTransformer: mappings must be a dictionary" - ): + with pytest.raises(ValueError, match="mappings must be a dictionary"): BaseMappingTransformer(mappings=()) diff --git a/tests/mapping/test_CrossColumnAddTransformer.py b/tests/mapping/test_CrossColumnAddTransformer.py index c58c4abc..9e4edc56 100644 --- 
a/tests/mapping/test_CrossColumnAddTransformer.py +++ b/tests/mapping/test_CrossColumnAddTransformer.py @@ -60,19 +60,14 @@ def test_super_init_called(self, mocker): def test_adjust_columns_non_string_error(self): """Test that an exception is raised if adjust_column is not a string.""" - with pytest.raises( - TypeError, - match="CrossColumnAddTransformer: adjust_column should be a string", - ): + with pytest.raises(TypeError, match="adjust_column should be a string"): CrossColumnAddTransformer(mappings={"a": {"a": 1}}, adjust_column=1) def test_mapping_values_not_numeric_error(self): """Test that an exception is raised if mappings values are not numeric.""" - with pytest.raises( - TypeError, match="CrossColumnAddTransformer: mapping values must be numeric" - ): + with pytest.raises(TypeError, match="mapping values must be numeric"): CrossColumnAddTransformer(mappings={"a": {"a": "b"}}, adjust_column="b") @@ -182,9 +177,7 @@ def test_adjust_col_not_in_x_error(self): x = CrossColumnAddTransformer(mappings=mapping, adjust_column="c") - with pytest.raises( - ValueError, match="CrossColumnAddTransformer: variable c is not in X" - ): + with pytest.raises(ValueError, match="variable c is not in X"): x.transform(df) @@ -197,10 +190,7 @@ def test_adjust_col_not_numeric_error(self): x = CrossColumnAddTransformer(mappings=mapping, adjust_column="c") - with pytest.raises( - TypeError, - match="CrossColumnAddTransformer: variable c must have numeric dtype.", - ): + with pytest.raises(TypeError, match="variable c must have numeric dtype."): x.transform(df) diff --git a/tests/mapping/test_CrossColumnMappingTransformer.py b/tests/mapping/test_CrossColumnMappingTransformer.py index d1101ebd..108ae40a 100644 --- a/tests/mapping/test_CrossColumnMappingTransformer.py +++ b/tests/mapping/test_CrossColumnMappingTransformer.py @@ -60,10 +60,7 @@ def test_super_init_called(self, mocker): def test_adjust_columns_non_string_error(self): """Test that an exception is raised if adjust_column 
is not a string.""" - with pytest.raises( - TypeError, - match="CrossColumnMappingTransformer: adjust_column should be a string", - ): + with pytest.raises(TypeError, match="adjust_column should be a string"): CrossColumnMappingTransformer(mappings={"a": {"a": 1}}, adjust_column=1) @@ -72,7 +69,7 @@ def test_mappings_not_ordered_dict_error(self): with pytest.raises( TypeError, - match="CrossColumnMappingTransformer: mappings should be an ordered dict for 'replace' mappings using multiple columns", + match="mappings should be an ordered dict for 'replace' mappings using multiple columns", ): CrossColumnMappingTransformer( @@ -183,9 +180,7 @@ def test_adjust_col_not_in_x_error(self): x = CrossColumnMappingTransformer(mappings=mapping, adjust_column="c") - with pytest.raises( - ValueError, match="CrossColumnMappingTransformer: variable c is not in X" - ): + with pytest.raises(ValueError, match="variable c is not in X"): x.transform(df) diff --git a/tests/mapping/test_CrossColumnMultiplyTransformer.py b/tests/mapping/test_CrossColumnMultiplyTransformer.py index aa42beeb..354099e6 100644 --- a/tests/mapping/test_CrossColumnMultiplyTransformer.py +++ b/tests/mapping/test_CrossColumnMultiplyTransformer.py @@ -60,20 +60,14 @@ def test_super_init_called(self, mocker): def test_adjust_columns_non_string_error(self): """Test that an exception is raised if adjust_column is not a string.""" - with pytest.raises( - TypeError, - match="CrossColumnMultiplyTransformer: adjust_column should be a string", - ): + with pytest.raises(TypeError, match="adjust_column should be a string"): CrossColumnMultiplyTransformer(mappings={"a": {"a": 1}}, adjust_column=1) def test_mapping_values_not_numeric_error(self): """Test that an exception is raised if mappings values are not numeric.""" - with pytest.raises( - TypeError, - match="CrossColumnMultiplyTransformer: mapping values must be numeric", - ): + with pytest.raises(TypeError, match="mapping values must be numeric"): 
CrossColumnMultiplyTransformer( mappings={"a": {"a": "b"}}, adjust_column="b" @@ -187,9 +181,7 @@ def test_adjust_col_not_in_x_error(self): x = CrossColumnMultiplyTransformer(mappings=mapping, adjust_column="c") - with pytest.raises( - ValueError, match="CrossColumnMultiplyTransformer: variable c is not in X" - ): + with pytest.raises(ValueError, match="variable c is not in X"): x.transform(df) @@ -202,10 +194,7 @@ def test_adjust_col_not_numeric_error(self): x = CrossColumnMultiplyTransformer(mappings=mapping, adjust_column="c") - with pytest.raises( - TypeError, - match="CrossColumnMultiplyTransformer: variable c must have numeric dtype.", - ): + with pytest.raises(TypeError, match="variable c must have numeric dtype."): x.transform(df) diff --git a/tests/mapping/test_MappingTransformer.py b/tests/mapping/test_MappingTransformer.py index 015a2d7c..fab07098 100644 --- a/tests/mapping/test_MappingTransformer.py +++ b/tests/mapping/test_MappingTransformer.py @@ -7,14 +7,6 @@ from tubular.mapping import MappingTransformer from tubular.base import ReturnKeyDict -from pandas.api.types import ( - is_categorical_dtype, - is_integer_dtype, - is_bool_dtype, - is_float_dtype, - is_object_dtype, -) - class TestInit(object): """Tests for MappingTransformer.init().""" @@ -106,7 +98,7 @@ def test_mapping_non_dict_item_error(self): with pytest.raises( TypeError, - match=f"MappingTransformer: each item in mappings should be a dict but got type {type(1)} for key c", + match=f"each item in mappings should be a dict but got type {type(1)} for key c", ): MappingTransformer(mappings=mappings) @@ -138,8 +130,8 @@ def test_arguments(self): ta.functions.test_function_arguments( func=MappingTransformer.transform, - expected_arguments=["self", "X", "suppress_dtype_warning"], - expected_default_values=(False,), + expected_arguments=["self", "X"], + expected_default_values=None, ) def test_super_transform_call(self, mocker): @@ -249,123 +241,3 @@ def test_mappings_unchanged(self): 
expected=preserve_original_value_mapping, msg="MappingTransformer.transform has changed self.mappings unexpectedly", ) - - @pytest.mark.parametrize( - "mapping, input_col_name, output_col_type_check", - [ - ({"a": {1: 1.1, 6: 6.6}}, "a", is_float_dtype), - ({"a": {1: "one", 6: "six"}}, "a", is_object_dtype), - ( - {"a": {1: True, 2: True, 3: True, 4: False, 5: False, 6: False}}, - "a", - is_bool_dtype, - ), - ( - {"b": {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6}}, - "b", - is_integer_dtype, - ), - ( - {"b": {"a": 1.1, "b": 2.2, "c": 3.3, "d": 4.4, "e": 5.5, "f": 6.6}}, - "b", - is_float_dtype, - ), - ], - ) - def test_expected_dtype_conversions( - self, mapping, input_col_name, output_col_type_check - ): - - df = d.create_df_1() - x = MappingTransformer(mappings=mapping) - df = x.transform(df) - - assert output_col_type_check(df[input_col_name]) - - @pytest.mark.parametrize( - "mapping, input_col_name, input_col_type", - [ - ({"a": {1: True, 6: False}}, "a", "int64"), - ], - ) - def test_unexpected_dtype_change_warning_raised( - self, mapping, input_col_name, input_col_type - ): - - df = d.create_df_1() - print(df["a"]) - - x = MappingTransformer(mappings=mapping) - - with pytest.warns( - UserWarning, - match=f"MappingTransformer: This mapping changes {input_col_name} dtype from {input_col_type} to object. 
This is often caused by having multiple dtypes in one column, or by not mapping all values", - ): - x.transform(df) - - @pytest.mark.parametrize( - "mapping, input_col_name, input_col_type", - [ - ({"a": {1: True, 6: False}}, "a", "int64"), - ], - ) - def test_unexpected_dtype_change_warning_suppressed( - self, mapping, input_col_name, input_col_type - ): - - df = d.create_df_1() - - x = MappingTransformer(mappings=mapping) - - with pytest.warns(None) as warnings_record: - x.transform(df, suppress_dtype_warning=True) - - assert len(warnings_record) == 0 - - def test_category_dtype_is_conserved(self): - """This is a separate test due to the behaviour of category dtypes - - See documentation of transform method - """ - - df = d.create_df_1() - df["b"] = df["b"].astype("category") - - mapping = mapping = {"b": {"a": "aaa", "b": "bbb"}} - - x = MappingTransformer(mappings=mapping) - df = x.transform(df) - - assert is_categorical_dtype(df["b"]) - - @pytest.mark.parametrize( - "mapping, mapped_col", - [({"a": {99: "99", 98: "98"}}, "a"), ({"b": {"z": 99, "y": 98}}, "b")], - ) - def test_no_applicable_mapping(self, mapping, mapped_col): - - df = d.create_df_1() - - x = MappingTransformer(mappings=mapping) - - with pytest.warns( - UserWarning, - match=f"MappingTransformer: No values from mapping for {mapped_col} exist in dataframe.", - ): - x.transform(df) - - @pytest.mark.parametrize( - "mapping, mapped_col", - [({"a": {1: "1", 99: "99"}}, "a"), ({"b": {"a": 1, "z": 99}}, "b")], - ) - def test_excess_mapping_values(self, mapping, mapped_col): - - df = d.create_df_1() - - x = MappingTransformer(mappings=mapping) - - with pytest.warns( - UserWarning, - match=f"MappingTransformer: There are values in the mapping for {mapped_col} that are not present in the dataframe", - ): - x.transform(df) diff --git a/tests/misc/test_SetColumnDtype.py b/tests/misc/test_SetColumnDtype.py deleted file mode 100644 index 7c16012b..00000000 --- a/tests/misc/test_SetColumnDtype.py +++ /dev/null 
@@ -1,160 +0,0 @@ -import pandas as pd -import numpy as np -import pytest -import tubular -import test_aide as ta -import tests.test_data as d - -from tubular.misc import SetColumnDtype - - -class TestSetColumnDtypeInit(object): - """Tests for SetColumnDtype custom transformer.""" - - def test_init_arguments(self): - """Test that init has expected arguments.""" - - ta.functions.test_function_arguments( - func=SetColumnDtype.__init__, - expected_arguments=[ - "self", - "columns", - "dtype", - ], - expected_default_values=None, - ) - - def test_inheritance(self): - """Test that SetColumnDtype inherits from tubular BaseTransformer.""" - - x = SetColumnDtype(columns=["a"], dtype=float) - - ta.classes.assert_inheritance(x, tubular.base.BaseTransformer) - - def test_tubular_base_transformer_super_init_called(self, mocker): - """Test that init calls tubular BaseTransformer.init.""" - expected_call_args = { - 0: { - "args": (["a"],), - "kwargs": {"copy": True}, - } - } - with ta.functions.assert_function_call( - mocker, tubular.base.BaseTransformer, "__init__", expected_call_args - ): - SetColumnDtype(columns=["a"], dtype=float) - - def test_dtype_attribute_set(self): - """Test that the value passed in the value arg is set as an attribute of the same name.""" - - x = SetColumnDtype(columns=["a"], dtype=str) - - assert x.dtype == str, "unexpected value set to dtype atttribute" - - @pytest.mark.parametrize( - "invalid_dtype", ["STRING", "misc_invalid", "np.int", int()] - ) - def test_invalid_dtype_error(self, invalid_dtype): - - msg = f"SetColumnDtype: data type '{invalid_dtype}' not understood as a valid dtype" - with pytest.raises(TypeError, match=msg): - SetColumnDtype(columns=["a"], dtype=invalid_dtype) - - -class TestSetColumnDtypeTransform(object): - @pytest.mark.parametrize( - "method_name", - [ - ("transform"), - ], - ) - def test_class_methods(self, method_name): - """Test that SetColumnDtype has transform method.""" - - x = SetColumnDtype(columns=["a"], 
dtype=float) - - ta.classes.test_object_method( - obj=x, expected_method=method_name, msg=method_name - ) - - def test_transform_arguments(self): - """Test that transform has expected arguments.""" - - ta.functions.test_function_arguments( - func=SetColumnDtype.transform, - expected_arguments=[ - "self", - "X", - ], - ) - - def test_super_transform_called(self, mocker): - """Test that BaseTransformer.transform called.""" - - df = d.create_df_3() - - x = SetColumnDtype(columns=["a"], dtype=float) - - expected_call_args = {0: {"args": (d.create_df_3(),), "kwargs": {}}} - - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "transform", - expected_call_args, - return_value=d.create_df_3(), - ): - - x.transform(df) - - def base_df(): - """Input dataframe from test_expected_output.""" - - df = pd.DataFrame( - { - "a": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.NaN], - "b": [1.0, 2.0, 3.0, np.NaN, 7.0, 8.0, 9.0], - "c": [1.0, 1.0, 2.0, 3.0, -4.0, -5.0, -6.0], - "d": [1, 1, 2, 3, -4, -5, -6], - } - ) - - return df - - def expected_df(): - """Expected output from test_expected_output.""" - - df = pd.DataFrame( - { - "a": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.NaN], - "b": [1.0, 2.0, 3.0, np.NaN, 7.0, 8.0, 9.0], - "c": [1.0, 1.0, 2.0, 3.0, -4.0, -5.0, -6.0], - "d": [1.0, 1.0, 2.0, 3.0, -4.0, -5.0, -6.0], - } - ) - - return df - - @pytest.mark.parametrize( - "df, expected", - ta.pandas.row_by_row_params(base_df(), expected_df()) - + ta.pandas.index_preserved_params(base_df(), expected_df()), - ) - @pytest.mark.parametrize("dtype", [float, "float"]) - def test_expected_output(self, df, expected, dtype): - """Test values are correctly set to float dtype""" - - df["a"] = df["a"].astype(str) - df["b"] = df["b"].astype(float) - df["c"] = df["c"].astype(int) - df["d"] = df["d"].astype(str) - - x = SetColumnDtype(columns=["a", "b", "c", "d"], dtype=dtype) - - df_transformed = x.transform(df) - - ta.equality.assert_equal_dispatch( - expected=expected, - 
actual=df_transformed, - msg="Check values correctly converted to float", - ) diff --git a/tests/nominal/test_BaseNominalTransformer.py b/tests/nominal/test_BaseNominalTransformer.py index afd578ce..fa12e71a 100644 --- a/tests/nominal/test_BaseNominalTransformer.py +++ b/tests/nominal/test_BaseNominalTransformer.py @@ -105,7 +105,7 @@ def test_exception_raised(self): with pytest.raises( ValueError, - match="BaseNominalTransformer: nulls would be introduced into column b from levels not present in mapping", + match="nulls would be introduced into column b from levels not present in mapping", ): x.check_mappable_rows(df) diff --git a/tests/nominal/test_GroupRareLevelsTransformer.py b/tests/nominal/test_GroupRareLevelsTransformer.py index 93e1d285..1a7b6886 100644 --- a/tests/nominal/test_GroupRareLevelsTransformer.py +++ b/tests/nominal/test_GroupRareLevelsTransformer.py @@ -61,50 +61,35 @@ def test_super_init_called(self, mocker): def test_cut_off_percent_not_float_error(self): """Test that an exception is raised if cut_off_percent is not an float.""" - with pytest.raises( - ValueError, - match="GroupRareLevelsTransformer: cut_off_percent must be a float", - ): + with pytest.raises(ValueError, match="cut_off_percent must be a float"): GroupRareLevelsTransformer(cut_off_percent="a") def test_cut_off_percent_negative_error(self): """Test that an exception is raised if cut_off_percent is negative.""" - with pytest.raises( - ValueError, - match="GroupRareLevelsTransformer: cut_off_percent must be > 0 and < 1", - ): + with pytest.raises(ValueError, match="cut_off_percent must be > 0 and < 1"): GroupRareLevelsTransformer(cut_off_percent=-1.0) def test_cut_off_percent_gt_one_error(self): """Test that an exception is raised if cut_off_percent is greater than 1.""" - with pytest.raises( - ValueError, - match="GroupRareLevelsTransformer: cut_off_percent must be > 0 and < 1", - ): + with pytest.raises(ValueError, match="cut_off_percent must be > 0 and < 1"): 
GroupRareLevelsTransformer(cut_off_percent=2.0) def test_weight_not_str_error(self): """Test that an exception is raised if weight is not a str, if supplied.""" - with pytest.raises( - ValueError, - match="GroupRareLevelsTransformer: weight should be a single column", - ): + with pytest.raises(ValueError, match="weight should be a single column"): GroupRareLevelsTransformer(weight=2) def test_record_rare_levels_not_str_error(self): """Test that an exception is raised if record_rare_levels is not a bool.""" - with pytest.raises( - ValueError, - match="GroupRareLevelsTransformer: record_rare_levels must be a bool", - ): + with pytest.raises(ValueError, match="record_rare_levels must be a bool"): GroupRareLevelsTransformer(record_rare_levels=2) @@ -164,9 +149,7 @@ def test_weight_column_not_in_X_error(self): x = GroupRareLevelsTransformer(columns=["b", "c"], weight="aaaa") - with pytest.raises( - ValueError, match="GroupRareLevelsTransformer: weight aaaa not in X" - ): + with pytest.raises(ValueError, match="weight aaaa not in X"): x.fit(df) @@ -220,7 +203,7 @@ def test_learnt_values_weight(self): df = d.create_df_6() - x = GroupRareLevelsTransformer(columns=["b"], cut_off_percent=0.3, weight="a") + x = GroupRareLevelsTransformer(columns=["b"], cut_off_percent=0.3, weights="a") x.fit(df) @@ -235,7 +218,7 @@ def test_learnt_values_weight_2(self): df = d.create_df_6() - x = GroupRareLevelsTransformer(columns=["c"], cut_off_percent=0.2, weight="a") + x = GroupRareLevelsTransformer(columns=["c"], cut_off_percent=0.2, weights="a") x.fit(df) @@ -251,8 +234,7 @@ def test_rare_level_name_not_diff_col_type(self): df = d.create_df_10() with pytest.raises( - ValueError, - match="GroupRareLevelsTransformer: rare_level_name must be of the same type of the columns", + ValueError, match="rare_level_name must be of the same type of the columns" ): x = GroupRareLevelsTransformer(columns=["a", "b"], rare_level_name=2) @@ -260,8 +242,7 @@ def 
test_rare_level_name_not_diff_col_type(self): x.fit(df) with pytest.raises( - ValueError, - match="GroupRareLevelsTransformer: rare_level_name must be of the same type of the columns", + ValueError, match="rare_level_name must be of the same type of the columns" ): x = GroupRareLevelsTransformer(columns=["c"]) diff --git a/tests/nominal/test_MeanResponseTransformer.py b/tests/nominal/test_MeanResponseTransformer.py index cae3dbcc..59bc78c1 100644 --- a/tests/nominal/test_MeanResponseTransformer.py +++ b/tests/nominal/test_MeanResponseTransformer.py @@ -3,7 +3,6 @@ import tests.test_data as d import pandas as pd import numpy as np -from pandas.testing import assert_series_equal import tubular from tubular.nominal import MeanResponseTransformer @@ -17,19 +16,14 @@ def test_arguments(self): ta.functions.test_function_arguments( func=MeanResponseTransformer.__init__, - expected_arguments=[ - "self", - "columns", - "weights_column", - "prior", - ], - expected_default_values=(None, None, 0), + expected_arguments=["self", "columns", "weights_column"], + expected_default_values=(None, None), ) def test_class_methods(self): """Test that MeanResponseTransformer has fit and transform methods.""" - x = MeanResponseTransformer() + x = MeanResponseTransformer(response_column="a") ta.classes.test_object_method(obj=x, expected_method="fit", msg="fit") @@ -40,7 +34,7 @@ def test_class_methods(self): def test_inheritance(self): """Test that NominalToIntegerTransformer inherits from BaseNominalTransformer.""" - x = MeanResponseTransformer() + x = MeanResponseTransformer(response_column="a") ta.classes.assert_inheritance(x, tubular.nominal.BaseNominalTransformer) @@ -78,106 +72,22 @@ def test_super_init_called(self, mocker): def test_weights_column_not_str_error(self): """Test that an exception is raised if weights_column is not a str.""" - with pytest.raises( - TypeError, match="MeanResponseTransformer: weights_column should be a str" - ): - - 
MeanResponseTransformer(weights_column=1) - - def test_prior_not_int_error(self): - """Test that an exception is raised if prior is not an int.""" - - with pytest.raises(TypeError, match="prior should be a int"): - - MeanResponseTransformer(prior="1") - - def test_prior_not_positive_int_error(self): - """Test that an exception is raised if prior is not a positive int.""" - - with pytest.raises(ValueError, match="prior should be positive int"): + with pytest.raises(TypeError, match="weights_column should be a str"): - MeanResponseTransformer(prior=-1) + MeanResponseTransformer(response_column="a", weights_column=1) def test_values_passed_in_init_set_to_attribute(self): """Test that the values passed in init are saved in an attribute of the same name.""" - x = MeanResponseTransformer(weights_column="aaa", prior=1) + x = MeanResponseTransformer(weights_column="aaa") ta.classes.test_object_attributes( obj=x, - expected_attributes={"weights_column": "aaa", "prior": 1}, + expected_attributes={"weights_column": "aaa"}, msg="Attributes for MeanResponseTransformer set in init", ) -class Test_prior_regularisation(object): - "tests for _prior_regularisation method" - - def test_arguments(self): - """Test that MeanResponseTransformer._prior_regularisation has expected arguments.""" - - ta.functions.test_function_arguments( - func=MeanResponseTransformer._prior_regularisation, - expected_arguments=["self", "target_means", "cat_freq"], - expected_default_values=None, - ) - - def test_check_is_fitted_called(self, mocker): - """Test that _prior_regularisation calls BaseTransformer.check_is_fitted.""" - - expected_call_args = {0: {"args": (["global_mean"],), "kwargs": {}}} - - x = MeanResponseTransformer() - - x.fit(pd.DataFrame({"a": ["1", "2"]}), pd.Series([2, 3])) - - with ta.functions.assert_function_call( - mocker, tubular.base.BaseTransformer, "check_is_fitted", expected_call_args - ): - - x._prior_regularisation( - cat_freq=pd.Series([1, 2]), target_means=pd.Series([1, 2]) - 
) - - def test_output1(self): - "Test output of method" - - x = MeanResponseTransformer(columns="a", prior=3) - - x.fit(X=pd.DataFrame({"a": [1, 2]}), y=pd.Series([2, 3])) - - expected1 = (1 * 1 + 3 * 2.5) / (1 + 3) - - expected2 = (2 * 2 + 3 * 2.5) / (2 + 3) - - expected = pd.Series([expected1, expected2]) - - output = x._prior_regularisation( - cat_freq=pd.Series([1, 2]), target_means=pd.Series([1, 2]) - ) - - assert_series_equal(expected, output) - - def test_output2(self): - "Test output of method" - - x = MeanResponseTransformer(columns="a", prior=0) - - x.fit(X=pd.DataFrame({"a": [1, 2]}), y=pd.Series([2, 3])) - - expected1 = (1 * 1) / (1) - - expected2 = (2 * 2) / (2) - - expected = pd.Series([expected1, expected2]) - - output = x._prior_regularisation( - cat_freq=pd.Series([1, 2]), target_means=pd.Series([1, 2]) - ) - - assert_series_equal(expected, output) - - class TestFit(object): """Tests for MeanResponseTransformer.fit()""" @@ -273,49 +183,12 @@ def test_learnt_values(self): "b": {"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0, "e": 5.0, "f": 6.0}, "d": {1: 1.0, 2: 2.0, 3: 3.0, 4: 4.0, 5: 5.0, 6: 6.0}, "f": {False: 2.0, True: 5.0}, - }, - "global_mean": np.float64(3.5), - }, - msg="mappings attribute", - ) - - def test_learnt_values_prior_no_weight(self): - """Test that the mean response values learnt during fit are expected.""" - - df = d.create_MeanResponseTransformer_test_df() - - x = MeanResponseTransformer(columns=["b", "d", "f"], prior=5) - - x.fit(df, df["a"]) - - ta.classes.test_object_attributes( - obj=x, - expected_attributes={ - "mappings": { - "b": { - "a": 37 / 12, - "b": 13 / 4, - "c": 41 / 12, - "d": 43 / 12, - "e": 15 / 4, - "f": 47 / 12, - }, - "d": { - 1: 37 / 12, - 2: 13 / 4, - 3: 41 / 12, - 4: 43 / 12, - 5: 15 / 4, - 6: 47 / 12, - }, - "f": {False: 47 / 16, True: 65 / 16}, - }, - "global_mean": np.float64(3.5), + } }, msg="mappings attribute", ) - def test_learnt_values_no_prior_weight(self): + def test_learnt_values_weight(self): """Test 
that the mean response values learnt during fit are expected if a weights column is specified.""" df = d.create_MeanResponseTransformer_test_df() @@ -328,150 +201,14 @@ def test_learnt_values_no_prior_weight(self): obj=x, expected_attributes={ "mappings": { - "b": {"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0, "e": 5.0, "f": 6.0}, - "d": {1: 1.0, 2: 2.0, 3: 3.0, 4: 4.0, 5: 5.0, 6: 6.0}, - "f": {False: 14 / 6, True: 77 / 15}, + "b": {"a": 1.0, "b": 1.0, "c": 1.0, "d": 1.0, "e": 1.0, "f": 1.0}, + "d": {1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0}, + "f": {False: 1.0, True: 1.0}, } }, msg="mappings attribute", ) - def test_learnt_values_prior_weight(self): - """Test that the mean response values learnt during fit are expected - when using weight and prior.""" - - df = d.create_MeanResponseTransformer_test_df() - - df["weight"] = [1, 1, 1, 2, 2, 2] - - x = MeanResponseTransformer( - columns=["d", "f"], prior=5, weights_column="weight" - ) - - x.fit(df, df["a"]) - - ta.classes.test_object_attributes( - obj=x, - expected_attributes={ - "mappings": { - "d": {1: 7 / 2, 2: 11 / 3, 3: 23 / 6, 4: 4.0, 5: 30 / 7, 6: 32 / 7}, - "f": {False: 13 / 4, True: 50 / 11}, - }, - "global_mean": np.float64(4.0), - }, - msg="mappings attribute", - ) - - @pytest.mark.parametrize("prior", (1, 3, 5, 7, 9, 11, 100)) - def test_prior_logic(self, prior): - "test that for prior>0 encodings are closer to global mean than for prior=0" - - df = d.create_MeanResponseTransformer_test_df() - - df["weight"] = [1, 1, 1, 2, 2, 2] - - x_prior = MeanResponseTransformer( - columns=["d", "f"], - prior=prior, - weights_column="weight", - ) - - x_no_prior = MeanResponseTransformer( - columns=["d", "f"], prior=0, weights_column="weight" - ) - - x_prior.fit(df, df["a"]) - - x_no_prior.fit(df, df["a"]) - - prior_mappings = x_prior.mappings - - no_prior_mappings = x_no_prior.mappings - - global_mean = x_prior.global_mean - - assert ( - global_mean == x_no_prior.global_mean - ), "global means for transformers 
with/without priors should match" - - for col in prior_mappings: - for value in prior_mappings[col]: - - prior_encoding = prior_mappings[col][value] - no_prior_encoding = no_prior_mappings[col][value] - - prior_mean_dist = np.abs(prior_encoding - global_mean) - no_prior_mean_dist = np.abs(no_prior_encoding - global_mean) - - assert ( - prior_mean_dist <= no_prior_mean_dist - ), "encodings using priors should be closer to the global mean than without" - - @pytest.mark.parametrize( - "low_weight, high_weight", ((1, 2), (2, 3), (3, 4), (10, 20)) - ) - def test_prior_logic_for_weights(self, low_weight, high_weight): - "test that for fixed prior a group with lower weight is moved closer to the global mean than one with higher weight" - - df = d.create_MeanResponseTransformer_test_df() - - # column f looks like [False, False, False, True, True, True] - df["weight"] = [ - low_weight, - low_weight, - low_weight, - high_weight, - high_weight, - high_weight, - ] - - x_prior = MeanResponseTransformer( - columns=["f"], - prior=5, - weights_column="weight", - ) - - x_no_prior = MeanResponseTransformer( - columns=["f"], prior=0, weights_column="weight" - ) - - x_prior.fit(df, df["a"]) - - x_no_prior.fit(df, df["a"]) - - prior_mappings = x_prior.mappings - - no_prior_mappings = x_no_prior.mappings - - global_mean = x_prior.global_mean - - assert ( - global_mean == x_no_prior.global_mean - ), "global means for transformers with/without priors should match" - - low_weight_prior_encoding = prior_mappings["f"][False] - high_weight_prior_encoding = prior_mappings["f"][True] - - low_weight_no_prior_encoding = no_prior_mappings["f"][False] - high_weight_no_prior_encoding = no_prior_mappings["f"][True] - - low_weight_prior_mean_dist = np.abs(low_weight_prior_encoding - global_mean) - high_weight_prior_mean_dist = np.abs(high_weight_prior_encoding - global_mean) - - low_weight_no_prior_mean_dist = np.abs( - low_weight_no_prior_encoding - global_mean - ) - high_weight_no_prior_mean_dist = 
np.abs( - high_weight_no_prior_encoding - global_mean - ) - - # check low weight group has been moved further towards mean than high weight group by prior, i.e - # that the distance remaining is a smaller proportion of the no prior distance - low_ratio = low_weight_prior_mean_dist / low_weight_no_prior_mean_dist - high_ratio = high_weight_prior_mean_dist / high_weight_no_prior_mean_dist - assert ( - low_ratio <= high_ratio - ), "encodings for categories with lower weights should be moved closer to the global mean than those with higher weights, for fixed prior" - def test_weights_column_missing_error(self): """Test that an exception is raised if weights_column is specified but not present in data for fit.""" @@ -479,9 +216,7 @@ def test_weights_column_missing_error(self): x = MeanResponseTransformer(weights_column="z", columns=["b", "d", "f"]) - with pytest.raises( - ValueError, match="MeanResponseTransformer: weights column z not in X" - ): + with pytest.raises(ValueError, match="weights column z not in X"): x.fit(df, df["a"]) @@ -492,9 +227,7 @@ def test_response_column_nulls_error(self): x = MeanResponseTransformer(columns=["b"]) - with pytest.raises( - ValueError, match="MeanResponseTransformer: y has 1 null values" - ): + with pytest.raises(ValueError, match="y has 1 null values"): x.fit(df, df["a"]) @@ -612,7 +345,7 @@ def test_learnt_values_not_modified(self): def test_expected_output(self, df, expected): """Test that the output is expected from transform.""" - x = MeanResponseTransformer(columns=["b", "d", "f"]) + x = MeanResponseTransformer(response_column="a", columns=["b", "d", "f"]) # set the impute values dict directly rather than fitting x on df so test works with helpers x.mappings = { @@ -642,7 +375,7 @@ def test_nulls_introduced_in_transform_error(self): with pytest.raises( ValueError, - match="MeanResponseTransformer: nulls would be introduced into column b from levels not present in mapping", + match="nulls would be introduced into column b from 
levels not present in mapping", ): x.transform(df) diff --git a/tests/nominal/test_NominalToIntegerTransformer.py b/tests/nominal/test_NominalToIntegerTransformer.py index 3c27d6cf..69d3a3f1 100644 --- a/tests/nominal/test_NominalToIntegerTransformer.py +++ b/tests/nominal/test_NominalToIntegerTransformer.py @@ -286,7 +286,7 @@ def test_non_mappable_rows_raises_error(self): with pytest.raises( ValueError, - match="NominalToIntegerTransformer: nulls would be introduced into column a from levels not present in mapping", + match="nulls would be introduced into column a from levels not present in mapping", ): x.transform(df) @@ -385,7 +385,7 @@ def test_non_mappable_rows_raises_error(self): with pytest.raises( ValueError, - match="NominalToIntegerTransformer: nulls introduced from levels not present in mapping for column: b", + match="nulls introduced from levels not present in mapping for column: b", ): x.inverse_transform(df_transformed) diff --git a/tests/nominal/test_OneHotEncodingTransformer.py b/tests/nominal/test_OneHotEncodingTransformer.py index 468b4aef..aef06fd1 100644 --- a/tests/nominal/test_OneHotEncodingTransformer.py +++ b/tests/nominal/test_OneHotEncodingTransformer.py @@ -237,8 +237,7 @@ def test_nulls_in_X_error(self): x = OneHotEncodingTransformer(columns=["b", "c"]) with pytest.raises( - ValueError, - match="OneHotEncodingTransformer: column b has nulls - replace before proceeding", + ValueError, match="column b has nulls - replace before proceeding" ): x.fit(df) @@ -253,7 +252,7 @@ def test_fields_with_over_100_levels_error(self): with pytest.raises( ValueError, - match="OneHotEncodingTransformer: column b has over 100 unique values - consider another type of encoding", + match="column b has over 100 unique values - consider another type of encoding", ): x.fit(df) @@ -391,8 +390,7 @@ def test_non_numeric_column_error_1(self): x.fit(df_train) with pytest.raises( - ValueError, - match="OneHotEncodingTransformer: column b has nulls - replace before 
proceeding", + ValueError, match="column b has nulls - replace before proceeding" ): x.transform(df_test) diff --git a/tests/nominal/test_OrdinalEncoderTransformer.py b/tests/nominal/test_OrdinalEncoderTransformer.py index 68275566..1064ad4a 100644 --- a/tests/nominal/test_OrdinalEncoderTransformer.py +++ b/tests/nominal/test_OrdinalEncoderTransformer.py @@ -22,7 +22,7 @@ def test_arguments(self): def test_class_methods(self): """Test that OrdinalEncoderTransformer has fit and transform methods.""" - x = OrdinalEncoderTransformer() + x = OrdinalEncoderTransformer(response_column="a") ta.classes.test_object_method(obj=x, expected_method="fit", msg="fit") @@ -33,7 +33,7 @@ def test_class_methods(self): def test_inheritance(self): """Test that NominalToIntegerTransformer inherits from BaseNominalTransformer.""" - x = OrdinalEncoderTransformer() + x = OrdinalEncoderTransformer(response_column="a") ta.classes.assert_inheritance(x, tubular.nominal.BaseNominalTransformer) ta.classes.assert_inheritance(x, tubular.mapping.BaseMappingTransformMixin) @@ -72,9 +72,7 @@ def test_super_init_called(self, mocker): def test_weights_column_not_str_error(self): """Test that an exception is raised if weights_column is not a str.""" - with pytest.raises( - TypeError, match="OrdinalEncoderTransformer: weights_column should be a str" - ): + with pytest.raises(TypeError, match="weights_column should be a str"): OrdinalEncoderTransformer(weights_column=1) @@ -218,9 +216,7 @@ def test_weights_column_missing_error(self): x = OrdinalEncoderTransformer(weights_column="z", columns=["b", "d", "f"]) - with pytest.raises( - ValueError, match="OrdinalEncoderTransformer: weights column z not in X" - ): + with pytest.raises(ValueError, match="weights column z not in X"): x.fit(df, df["a"]) @@ -231,9 +227,7 @@ def test_response_column_nulls_error(self): x = OrdinalEncoderTransformer(columns=["b"]) - with pytest.raises( - ValueError, match="OrdinalEncoderTransformer: y has 1 null values" - ): + with 
pytest.raises(ValueError, match="y has 1 null values"): x.fit(df, df["a"]) @@ -369,7 +363,7 @@ def test_nulls_introduced_in_transform_error(self): with pytest.raises( ValueError, - match="OrdinalEncoderTransformer: nulls would be introduced into column b from levels not present in mapping", + match="nulls would be introduced into column b from levels not present in mapping", ): x.transform(df) diff --git a/tests/numeric/test_CutTransformer.py b/tests/numeric/test_CutTransformer.py index c2c86597..2c9dc985 100644 --- a/tests/numeric/test_CutTransformer.py +++ b/tests/numeric/test_CutTransformer.py @@ -59,7 +59,7 @@ def test_column_type_error(self): with pytest.raises( TypeError, match=re.escape( - "CutTransformer: column arg (name of column) should be a single str giving the column to discretise" + "column arg (name of column) should be a single str giving the column to discretise" ), ): @@ -71,9 +71,7 @@ def test_column_type_error(self): def test_new_column_name_type_error(self): """Test that an exception is raised if new_column_name is not a str.""" - with pytest.raises( - TypeError, match="CutTransformer: new_column_name must be a str" - ): + with pytest.raises(TypeError, match="new_column_name must be a str"): CutTransformer(column="b", new_column_name=1) @@ -92,7 +90,7 @@ def test_cut_kwargs_key_type_error(self): with pytest.raises( TypeError, - match=r"""CutTransformer: unexpected type \(\\) for cut_kwargs key in position 1, must be str""", + match=r"""unexpected type \(\\) for cut_kwargs key in position 1, must be str""", ): CutTransformer( @@ -237,8 +235,7 @@ def test_non_numeric_column_error(self): x = CutTransformer(column="b", new_column_name="d") with pytest.raises( - TypeError, - match="CutTransformer: b should be a numeric dtype but got object", + TypeError, match="b should be a numeric dtype but got object" ): x.transform(df) diff --git a/tests/numeric/test_InteractionTransformer.py b/tests/numeric/test_InteractionTransformer.py index 
96659728..c3799c23 100644 --- a/tests/numeric/test_InteractionTransformer.py +++ b/tests/numeric/test_InteractionTransformer.py @@ -71,7 +71,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, match=re.escape( - "InteractionTransformer: columns must be a string or list with the columns to be pre-processed (if specified)" + "columns must be a string or list with the columns to be pre-processed (if specified)" ), ): @@ -84,7 +84,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, match=re.escape( - "InteractionTransformer: each element of columns should be a single (string) column name" + "each element of columns should be a single (string) column name" ), ): @@ -96,7 +96,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"""InteractionTransformer: unexpected type \(\\) for min_degree, must be int""", + match=r"""unexpected type \(\\) for min_degree, must be int""", ): InteractionTransformer( @@ -106,7 +106,7 @@ def test_invalid_input_type_errors(self): ) with pytest.raises( TypeError, - match=r"""InteractionTransformer: unexpected type \(\\) for max_degree, must be int""", + match=r"""unexpected type \(\\) for max_degree, must be int""", ): InteractionTransformer( @@ -119,7 +119,7 @@ def test_invalid_input_value_errors(self): """Test and exception is raised if degrees or columns provided are inconsistent.""" with pytest.raises( ValueError, - match=r"""InteractionTransformer: number of columns must be equal or greater than 2, got 1 column.""", + match=r"""number of columns must be equal or greater than 2, got 1 column.""", ): InteractionTransformer( @@ -128,7 +128,7 @@ def test_invalid_input_value_errors(self): with pytest.raises( ValueError, - match=r"""InteractionTransformer: min_degree must be equal or greater than 2, got 0""", + match=r"""min_degree must be equal or greater than 2, got 0""", ): InteractionTransformer( @@ -139,7 +139,7 @@ def 
test_invalid_input_value_errors(self): with pytest.raises( ValueError, - match=r"""InteractionTransformer: max_degree must be equal or greater than min_degree""", + match=r"""max_degree must be equal or greater than min_degree""", ): InteractionTransformer( @@ -150,7 +150,7 @@ def test_invalid_input_value_errors(self): # NEW with pytest.raises( ValueError, - match=r"""InteractionTransformer: max_degree must be equal or lower than number of columns""", + match=r"""max_degree must be equal or lower than number of columns""", ): InteractionTransformer( diff --git a/tests/numeric/test_LogTransformer.py b/tests/numeric/test_LogTransformer.py index c63e74ca..cdc0d607 100644 --- a/tests/numeric/test_LogTransformer.py +++ b/tests/numeric/test_LogTransformer.py @@ -26,12 +26,13 @@ def test_base_type_error(self): with pytest.raises( ValueError, - match=re.escape("LogTransformer: base should be numeric or None"), + match=re.escape("base should be numeric or None"), ): LogTransformer( columns=["a"], base="a", + new_column_name="b", ) def test_base_not_strictly_positive_error(self): @@ -39,12 +40,13 @@ def test_base_not_strictly_positive_error(self): with pytest.raises( ValueError, - match=re.escape("LogTransformer: base should be strictly positive"), + match=re.escape("base should be strictly positive"), ): LogTransformer( columns=["a"], base=0, + new_column_name="b", ) def test_class_methods(self): @@ -209,8 +211,7 @@ def test_error_with_non_numeric_columns(self): x = LogTransformer(columns=["a", "b", "c"]) with pytest.raises( - TypeError, - match=r"LogTransformer: The following columns are not numeric in X; \['b', 'c'\]", + TypeError, match=r"The following columns are not numeric in X; \['b', 'c'\]" ): x.transform(df) @@ -366,7 +367,7 @@ def test_negative_values_raise_exception( with pytest.raises( ValueError, - match=f"LogTransformer: values less than or equal to 0 in columns{extra_exception_text}, make greater than 0 before using transform", + match=f"values less than or 
equal to 0 in columns{extra_exception_text}, make greater than 0 before using transform", ): x.transform(df) diff --git a/tests/numeric/test_PCATransformer.py b/tests/numeric/test_PCATransformer.py deleted file mode 100644 index 9affba01..00000000 --- a/tests/numeric/test_PCATransformer.py +++ /dev/null @@ -1,452 +0,0 @@ -import pytest -import test_aide as ta -import tests.test_data as d -import pandas as pd - -import tubular -from tubular.numeric import PCATransformer - - -class TestInit(object): - """Tests for PCATransformer.init().""" - - def test_arguments(self): - """Test that init has expected arguments.""" - - ta.functions.test_function_arguments( - func=PCATransformer.__init__, - expected_arguments=[ - "self", - "columns", - "n_components", - "svd_solver", - "random_state", - "pca_column_prefix", - ], - expected_default_values=(2, "auto", None, "pca_"), - ) - - def test_inheritance(self): - """Test that PCATransformer inherits from BaseTransformer.""" - - x = PCATransformer(columns=["a"]) - - ta.classes.assert_inheritance(x, tubular.base.BaseTransformer) - - def test_to_random_state_type_error(self): - """Test that an exception is raised if random_state is not a int or None.""" - - with pytest.raises( - TypeError, - match=r"""PCATransformer:unexpected type for random_state, must be int or None.""", - ): - PCATransformer(columns="b", random_state="2") - - def test_to_svd_solver_type_error(self): - """Test that an exception is raised if svd_solver is not a str.""" - - with pytest.raises( - TypeError, - match=r"""PCATransformer:unexpected type for svd_solver, must be str""", - ): - PCATransformer(columns="b", svd_solver=2) - - def test_to_n_components_type_error(self): - """Test that an exception is raised if n_components is not a int or 'mle'.""" - - with pytest.raises( - TypeError, - match=r"""PCATransformer:unexpected type for n_components, must be int, float \(0-1\) or equal to 'mle'.""", - ): - PCATransformer(columns="b", n_components="3") - - def 
test_to_pca_prefix_type_error(self): - """Test that an exception is raised if pca_column_prefix is not str.""" - - with pytest.raises( - TypeError, - match=r"""PCATransformer:unexpected type for pca_column_prefix, must be str""", - ): - PCATransformer(columns="b", n_components=2, pca_column_prefix=3) - - def test_to_svd_solver_value_error(self): - """Test that an exception is raised if svd_solver is not one of the allowed values.""" - - with pytest.raises( - ValueError, - match=r"""PCATransformer:svd_solver zzz is unknown. Please select among 'auto', 'full', 'arpack', 'randomized'.""", - ): - PCATransformer(columns="b", svd_solver="zzz") - - def test_to_n_components_value_error(self): - """Test that an exception is raised if n_components is not one of the allowed values.""" - - with pytest.raises( - ValueError, - match=r"""PCATransformer:n_components must be strictly positive got -1""", - ): - PCATransformer(columns="b", n_components=-1) - - def test_to_n_components_float_value_error(self): - """Test that an exception is raised if n_components is not one of the allowed float values.""" - - with pytest.raises( - ValueError, - match=r"""PCATransformer:n_components must be strictly positive and must be of type int when greater than or equal to 1. 
Got 1.4""", - ): - PCATransformer(columns="b", n_components=1.4) - - def test_to_arpack_mle_value_error(self): - """Test that an exception is raised if svd solver is arpack and n_components is "mle".""" - with pytest.raises( - ValueError, - match=r"""PCATransformer: n_components='mle' cannot be a string with svd_solver='arpack'""", - ): - PCATransformer(columns="b", n_components="mle", svd_solver="arpack") - - def test_to_arpack_randomized_float_type_error(self): - """Test that an exception is raised if svd solver is arpack or randomized and n_components is float .""" - with pytest.raises( - TypeError, - match=r"""PCATransformer: n_components 0.3 cannot be a float with svd_solver='arpack'""", - ): - PCATransformer(columns="b", n_components=0.3, svd_solver="arpack") - - def test_super_init_called(self, mocker): - """Test that super.__init__ called.""" - - expected_call_args = { - 0: { - "args": (), - "kwargs": {"columns": ["a", "b"], "copy": True, "verbose": False}, - } - } - - with ta.functions.assert_function_call( - mocker, tubular.base.BaseTransformer, "__init__", expected_call_args - ): - PCATransformer(columns=["a", "b"], n_components=1, copy=True, verbose=False) - - -class TestCheckNumericColumns(object): - """Tests for the check_numeric_columns method.""" - - def test_arguments(self): - """Test that check_numeric_columns has expected arguments.""" - - ta.functions.test_function_arguments( - func=PCATransformer.check_numeric_columns, - expected_arguments=["self", "X"], - expected_default_values=None, - ) - - def test_exception_raised(self): - """Test an exception is raised if non numeric columns are passed in X.""" - - df = d.create_df_2() - - x = PCATransformer(columns=["a", "b", "c"], n_components=2) - - with pytest.raises( - TypeError, - match=r"""PCATransformer: The following columns are not numeric in X; \['b', 'c'\]""", - ): - x.check_numeric_columns(df) - - def test_X_returned(self): - """Test that the input X is returned from the method.""" - - df = 
d.create_df_2() - - x = PCATransformer(columns=["a"], n_components=2) - - df_returned = x.check_numeric_columns(df) - - ta.equality.assert_equal_dispatch( - expected=df, - actual=df_returned, - msg="unexepcted object returned from check_numeric_columns", - ) - - -class TestFit(object): - """Tests for PCATransformer.fit().""" - - def test_arguments(self): - """Test that fit has expected arguments.""" - - ta.functions.test_function_arguments( - func=PCATransformer.fit, - expected_arguments=["self", "X", "y"], - expected_default_values=(None,), - ) - - def test_super_fit_call(self, mocker): - """Test the call to BaseTransformer.fit.""" - - df = d.create_numeric_df_1() - - x = PCATransformer(columns=["a", "b"], n_components=1) - - expected_call_args = { - 0: {"args": (d.create_numeric_df_1(), None), "kwargs": {}} - } - - with ta.functions.assert_function_call( - mocker, tubular.base.BaseTransformer, "fit", expected_call_args - ): - x.fit(df) - - def test_check_numeric_columns_call(self, mocker): - """Test the call to PCATransformer.check_numeric_columns.""" - - df = d.create_numeric_df_1() - - x = PCATransformer(columns=["a", "b"], n_components=1) - - expected_call_args = {0: {"args": (d.create_numeric_df_1(),), "kwargs": {}}} - - with ta.functions.assert_function_call( - mocker, - tubular.numeric.PCATransformer, - "check_numeric_columns", - expected_call_args, - return_value=d.create_numeric_df_1(), - ): - x.fit(df) - - def test_to_arpack_n_compontes_value_error(self): - """Test that an exception is raised if svd solver is arpack and n_components greater than nb samples or features.""" - with pytest.raises( - ValueError, - match=r"""PCATransformer: n_components 10 must be between 1 and min\(n_samples 10, n_features 2\) is 2 with svd_solver 'arpack'""", - ): - # must be between 1 and min(n_samples 10, n_features 2) is 2 with svd_solver arpack - df = d.create_numeric_df_1() - - x = PCATransformer(columns=["a", "b"], n_components=10, svd_solver="arpack") - - x.fit(df) - 
- def test_return_self(self): - """Test that fit returns self.""" - - df = d.create_numeric_df_1() - - x = PCATransformer(columns=["a", "b"]) - - x_fitted = x.fit(df) - - assert ( - x_fitted is x - ), "return value from PCATransformer.fit not as expected (self)." - - -def create_svd_sovler_output(): - svd_sovler_output = dict() - svd_sovler_output["full"] = pd.DataFrame( - { - "a": [34.48, 21.71, 32.83, 1.08, 32.93, 4.74, 2.76, 75.7, 14.08, 61.31], - "b": [12.03, 20.32, 24.12, 24.18, 68.99, 0.0, 0.0, 59.46, 11.02, 60.68], - "c": [17.06, 12.25, 19.15, 29.73, 1.98, 8.23, 15.22, 20.59, 3.82, 39.73], - "d": [25.94, 70.22, 72.94, 64.55, 0.41, 13.62, 30.22, 4.6, 67.13, 10.38], - "e": [94.3, 4.18, 51.7, 16.63, 2.6, 16.57, 3.51, 30.79, 66.19, 25.44], - "pca_0": [ - -7.0285210087721985, - -10.570772171093276, - 0.7141476951788178, - -19.755517377029697, - 30.46293987797488, - -37.27200224865943, - -37.718068808834694, - 55.636246999483866, - -23.564287941836838, - 49.095834983588574, - ], - "pca_1": [ - -14.719057085223534, - 0.6588448890236053, - -6.504809368610448, - 8.411936495027216, - 30.75596190514493, - -0.8912674725933973, - -2.647964525208776, - -9.600190936709105, - 2.6606364975891146, - -8.124090398439629, - ], - } - ) - - svd_sovler_output["randomized"] = pd.DataFrame( - { - "a": [34.48, 21.71, 32.83, 1.08, 32.93, 4.74, 2.76, 75.7, 14.08, 61.31], - "b": [12.03, 20.32, 24.12, 24.18, 68.99, 0.0, 0.0, 59.46, 11.02, 60.68], - "c": [17.06, 12.25, 19.15, 29.73, 1.98, 8.23, 15.22, 20.59, 3.82, 39.73], - "d": [25.94, 70.22, 72.94, 64.55, 0.41, 13.62, 30.22, 4.6, 67.13, 10.38], - "e": [94.3, 4.18, 51.7, 16.63, 2.6, 16.57, 3.51, 30.79, 66.19, 25.44], - "pca_0": [ - -7.028521008772197, - -10.570772171093276, - 0.7141476951788183, - -19.755517377029697, - 30.46293987797488, - -37.27200224865943, - -37.718068808834694, - 55.636246999483866, - -23.564287941836838, - 49.09583498358857, - ], - "pca_1": [ - -14.71905708522353, - 0.6588448890236093, - -6.504809368610448, - 
8.411936495027184, - 30.755961905144947, - -0.8912674725933926, - -2.647964525208781, - -9.600190936709092, - 2.660636497589127, - -8.12409039843965, - ], - } - ) - - svd_sovler_output["arpack"] = pd.DataFrame( - { - "a": [34.48, 21.71, 32.83, 1.08, 32.93, 4.74, 2.76, 75.7, 14.08, 61.31], - "b": [12.03, 20.32, 24.12, 24.18, 68.99, 0.0, 0.0, 59.46, 11.02, 60.68], - "c": [17.06, 12.25, 19.15, 29.73, 1.98, 8.23, 15.22, 20.59, 3.82, 39.73], - "d": [25.94, 70.22, 72.94, 64.55, 0.41, 13.62, 30.22, 4.6, 67.13, 10.38], - "e": [94.3, 4.18, 51.7, 16.63, 2.6, 16.57, 3.51, 30.79, 66.19, 25.44], - "pca_0": [ - -7.0285210087722, - -10.570772171093276, - 0.7141476951788169, - -19.75551737702969, - 30.46293987797488, - -37.272002248659426, - -37.718068808834694, - 55.63624699948385, - -23.564287941836838, - 49.09583498358856, - ], - "pca_1": [ - -14.71905708522354, - 0.6588448890236054, - -6.5048093686104504, - 8.411936495027229, - 30.755961905144936, - -0.8912674725933969, - -2.647964525208771, - -9.600190936709119, - 2.660636497589114, - -8.124090398439632, - ], - } - ) - return svd_sovler_output - - -class TestTransform(object): - """Tests for PCATransformer.transform().""" - - def test_arguments(self): - """Test that transform has expected arguments.""" - - ta.functions.test_function_arguments( - func=PCATransformer.transform, - expected_arguments=["self", "X"], - expected_default_values=None, - ) - - def test_super_transform_called(self, mocker): - """Test that BaseTransformer.transform called.""" - - df = d.create_numeric_df_1() - - x = PCATransformer(columns=["a", "b"]) - - x.fit(df) - - expected_call_args = {0: {"args": (d.create_numeric_df_1(),), "kwargs": {}}} - - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "transform", - expected_call_args, - return_value=d.create_numeric_df_1(), - ): - - x.transform(df) - - def test_check_numeric_columns_call(self, mocker): - """Test the call to PCATransformer.check_numeric_columns.""" - - df = 
d.create_numeric_df_1() - - x = PCATransformer(columns=["a", "b"], copy=True) - - x.fit(df) - - expected_call_args = {0: {"args": (d.create_numeric_df_1(),), "kwargs": {}}} - - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "transform", - expected_call_args, - return_value=d.create_numeric_df_1(), - ): - - x.transform(df) - - @pytest.mark.parametrize( - "svd_solver, svd_solver_output_str", - [("full", "full"), ("arpack", "arpack"), ("randomized", "randomized")], - ) - def test_output_from_pca_transform_set_to_columns( - self, mocker, svd_solver, svd_solver_output_str - ): - """Test that the call to the pca.transform method returns expected outputs.""" - - df = d.create_numeric_df_1() - - x = PCATransformer( - columns=["a", "b", "c"], - n_components=2, - svd_solver=svd_solver, - random_state=32, - ) - x.fit(df) - df_transformed = x.transform(df) - - pca_transform_output = create_svd_sovler_output() - - mocker.patch( - "sklearn.decomposition.PCA.transform", - return_value=pca_transform_output[svd_solver_output_str], - ) - - ta.equality.assert_equal_dispatch( - expected=pca_transform_output[svd_solver_output_str], - actual=df_transformed, - msg=f"output from {svd_solver_output_str} doesn't match", - ) - - @pytest.mark.parametrize("columns", [("b"), ("c"), (["b", "c"])]) - def test_return_type(self, columns): - """Test that transform returns a pd.DataFrame.""" - - df = d.create_numeric_df_1() - - x = PCATransformer(columns=columns, n_components=1) - - x.fit(df) - - df_transformed = x.transform(df) - - assert ( - type(df_transformed) is pd.DataFrame - ), "unexpected output type from transform" diff --git a/tests/numeric/test_ScalingTransformer.py b/tests/numeric/test_ScalingTransformer.py index a7cc9774..a9ba3cbc 100644 --- a/tests/numeric/test_ScalingTransformer.py +++ b/tests/numeric/test_ScalingTransformer.py @@ -32,7 +32,7 @@ def test_to_scaler_kwargs_type_error(self): with pytest.raises( TypeError, - 
match=r"""ScalingTransformer: scaler_kwargs should be a dict but got type \""", + match=r"""scaler_kwargs should be a dict but got type \""", ): ScalingTransformer(columns="b", scaler_type="standard", scaler_kwargs=1) @@ -42,7 +42,7 @@ def test_scaler_kwargs_key_type_error(self): with pytest.raises( TypeError, - match=r"""ScalingTransformer: unexpected type \(\\) for scaler_kwargs key in position 1, must be str""", + match=r"""unexpected type \(\\) for scaler_kwargs key in position 1, must be str""", ): ScalingTransformer( @@ -56,7 +56,7 @@ def test_to_scaler_non_allowed_value_error(self): with pytest.raises( ValueError, - match=r"""ScalingTransformer: scaler_type should be one of; \['min_max', 'max_abs', 'standard'\]""", + match=r"""scaler_type should be one of; \['min_max', 'max_abs', 'standard'\]""", ): ScalingTransformer(columns="b", scaler_type="zzz", scaler_kwargs={"a": 1}) @@ -157,7 +157,7 @@ def test_exception_raised(self): with pytest.raises( TypeError, - match=r"""ScalingTransformer: The following columns are not numeric in X; \['b', 'c'\]""", + match=r"""The following columns are not numeric in X; \['b', 'c'\]""", ): x.check_numeric_columns(df) diff --git a/tests/numeric/test_TwoColumnOperatorTransformer.py b/tests/numeric/test_TwoColumnOperatorTransformer.py deleted file mode 100644 index 521c659c..00000000 --- a/tests/numeric/test_TwoColumnOperatorTransformer.py +++ /dev/null @@ -1,214 +0,0 @@ -import pytest -import pandas as pd -import test_aide as ta -import tubular -from tubular.numeric import TwoColumnOperatorTransformer -import tests.test_data as d - - -@pytest.fixture(scope="module", autouse=True) -def example_transformer(): - - example_transformer = TwoColumnOperatorTransformer( - "mul", - ["a", "b"], - "c", - ) - - return example_transformer - - -class TestTwoColumnOperatorTransformerInit(object): - """ - Tests for TwoColumnMethodTransformer.__init__() - """ - - def test_inheritance(self, example_transformer): - """Test that 
TwoColumnOperatorTransformer inherits from BaseTransformer.""" - - ta.classes.assert_inheritance( - example_transformer, tubular.base.DataFrameMethodTransformer - ) - - def test_class_methods(self, example_transformer): - """Test that TwoColumnOperatorTransformer has transform method.""" - - ta.classes.test_object_method( - obj=example_transformer, expected_method="transform", msg="transform" - ) - - def test_arguments(self): - """Test that init has expected arguments.""" - - ta.functions.test_function_arguments( - func=TwoColumnOperatorTransformer.__init__, - expected_arguments=[ - "self", - "pd_method_name", - "columns", - "new_column_name", - "pd_method_kwargs", - ], - expected_default_values=({"axis": 0},), - ) - - def test_axis_not_present_error(self): - """Checks that an error is raised if no axis element present in pd_method_kwargs dict""" - - with pytest.raises( - ValueError, - match='pd_method_kwargs must contain an entry "axis" set to 0 or 1', - ): - TwoColumnOperatorTransformer("mul", ["a", "b"], "c", pd_method_kwargs={}) - - def test_axis_not_valid_error(self): - """Checks that an error is raised if no axis element present in pd_method_kwargs dict""" - - with pytest.raises(ValueError, match="pd_method_kwargs 'axis' must be 0 or 1"): - TwoColumnOperatorTransformer( - "mul", ["a", "b"], "c", pd_method_kwargs={"axis": 2} - ) - - def test_attributes(self, example_transformer): - """Tests that the transformer has the expected attributes""" - expected_attributes = { - "pd_method_name": "mul", - # 'a' is given as a list here because that's how DataFrameMethodTransformer.__init__ stores the columns attribute - "column1_name": "a", - "column2_name": "b", - "new_column_name": "c", - "pd_method_kwargs": {"axis": 0}, - } - - msg = "TwoColumneMethodTransformer object does not have expected attributes" - ta.classes.test_object_attributes(example_transformer, expected_attributes, msg) - - def test_DataFrameMethodTransformer_init_call(self, mocker): - """Tests that the 
.__init__ method is called from the parent DataFrameMethodTransformer class""" - expected_call_args = { - 0: { - "args": (), - "kwargs": { - "new_column_name": "c", - "pd_method_name": "mul", - "columns": ["a", "b"], - "pd_method_kwargs": {"axis": 0}, - }, - } - } - - with ta.functions.assert_function_call( - mocker, - tubular.base.DataFrameMethodTransformer, - "__init__", - expected_call_args, - return_value=None, - ): - - TwoColumnOperatorTransformer("mul", ["a", "b"], "c") - - -class TestTwoColumnOperatorTransformerTransform(object): - def test_arguments(self): - """Test that transform has expected arguments.""" - - ta.functions.test_function_arguments( - func=TwoColumnOperatorTransformer.transform, - expected_arguments=["self", "X"], - expected_default_values=None, - ) - - def test_BaseTransformer_transform_called(self, example_transformer, mocker): - """Tests that the .transform method is called from the grandparent BaseTransformer class""" - test_data = d.create_df_11() - - expected_call_args = {0: {"args": (test_data,), "kwargs": {}}} - - with ta.functions.assert_function_call( - mocker, tubular.base.BaseTransformer, "transform", expected_call_args - ): - - example_transformer.transform(test_data) - - @pytest.mark.parametrize( - "pd_method_name", - [ - ("mul"), - ("div"), - ("pow"), - ], - ) - def test_pandas_method_called(self, mocker, pd_method_name): - """Test that the pandas method is called as expected (with kwargs passed) during transform.""" - spy = mocker.spy(pd.DataFrame, pd_method_name) - - pd_method_kwargs = {"axis": 0} - - data = d.create_df_11() - x = TwoColumnOperatorTransformer( - pd_method_name, - ["a", "b"], - "c", - ) - x.transform(data) - - # pull out positional and keyword args to target the call - print(spy.call_args_list) - call_args = spy.call_args_list[0] - call_pos_args = call_args[0] - call_kwargs = call_args[1] - - # test keyword are as expected - ta.equality.assert_dict_equal_msg( - actual=call_kwargs, - expected=pd_method_kwargs, 
- msg_tag=f"""Keyword arg assert for '{pd_method_name}'""", - ) - - # test positional args are as expected - ta.equality.assert_list_tuple_equal_msg( - actual=call_pos_args, - # 'a' is indexed as a list here because that's how DataFrameMethodTransformer.__init__ stores the columns attribute - expected=(data[["a"]], data["b"]), - msg_tag=f"""Positional arg assert for {pd_method_name}""", - ) - - @pytest.mark.parametrize( - "pd_method_name, output", - [ - ( - "mul", - [4, 10, 18], - ), - ("div", [0.25, 0.4, 0.5]), - ("pow", [1, 32, 729]), - ], - ) - def test_expected_output(self, pd_method_name, output): - """Tests that the output given by TwoColumnOperatorTransformer is as you would expect""" - expected = d.create_df_11() - expected["c"] = output - x = TwoColumnOperatorTransformer( - pd_method_name, - ["a", "b"], - "c", - ) - actual = x.transform(d.create_df_11()) - ta.equality.assert_frame_equal_msg( - actual=actual, - expected=expected, - msg_tag="TwoColumnMethod transformer does not produce the expected output", - ) - - def test_non_numeric_error(self): - x = TwoColumnOperatorTransformer( - "mul", - ["a", "b"], - "c", - ) - - with pytest.raises( - TypeError, - match="TwoColumnOperatorTransformer: input columns in X must contain only numeric values", - ): - x.transform(d.create_df_8()) diff --git a/tests/strings/test_SeriesStrMethodTransformer.py b/tests/strings/test_SeriesStrMethodTransformer.py index 5a7c7315..09297510 100644 --- a/tests/strings/test_SeriesStrMethodTransformer.py +++ b/tests/strings/test_SeriesStrMethodTransformer.py @@ -72,7 +72,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( ValueError, - match="SeriesStrMethodTransformer: columns arg should contain only 1 column name but got 2", + match="columns arg should contain only 1 column name but got 2", ): SeriesStrMethodTransformer( @@ -81,7 +81,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"SeriesStrMethodTransformer: unexpected type \(\\) 
for pd_method_name, expecting str", + match=r"unexpected type \(\\) for pd_method_name, expecting str", ): SeriesStrMethodTransformer( @@ -90,7 +90,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"SeriesStrMethodTransformer: unexpected type \(\\) for new_column_name, must be str", + match=r"unexpected type \(\\) for new_column_name, must be str", ): SeriesStrMethodTransformer( @@ -99,7 +99,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"""SeriesStrMethodTransformer: pd_method_kwargs should be a dict but got type \""", + match=r"""pd_method_kwargs should be a dict but got type \""", ): SeriesStrMethodTransformer( @@ -111,7 +111,7 @@ def test_invalid_input_type_errors(self): with pytest.raises( TypeError, - match=r"""SeriesStrMethodTransformer: unexpected type \(\\) for pd_method_kwargs key in position 1, must be str""", + match=r"""unexpected type \(\\) for pd_method_kwargs key in position 1, must be str""", ): SeriesStrMethodTransformer( @@ -126,7 +126,7 @@ def test_exception_raised_non_pandas_method_passed(self): with pytest.raises( AttributeError, - match="""SeriesStrMethodTransformer: error accessing "str.b" method on pd.Series object - pd_method_name should be a pd.Series.str method""", + match="""error accessing "str.b" method on pd.Series object - pd_method_name should be a pd.Series.str method""", ): SeriesStrMethodTransformer( diff --git a/tests/strings/test_StringConcatenatorTransformer.py b/tests/strings/test_StringConcatenatorTransformer.py deleted file mode 100644 index 93d66ed4..00000000 --- a/tests/strings/test_StringConcatenatorTransformer.py +++ /dev/null @@ -1,145 +0,0 @@ -import pytest -import test_aide as ta -import tests.test_data as d - -import tubular -from tubular.strings import StringConcatenator - - -@pytest.fixture -def concatenate_str(): - return StringConcatenator(columns=["a", "b"], new_column="merged_values") - - -class TestStringConcatenator: - """Tests for 
the StringConcatenator.__init__ method.""" - - def test_arguments(self): - """Test that init has expected arguments.""" - - ta.functions.test_function_arguments( - func=StringConcatenator.__init__, - expected_arguments=["self", "columns", "new_column", "separator"], - expected_default_values=( - "new_column", - " ", - ), - ) - - def test_assert_inheritance(self, concatenate_str): - """Test StringConcatenator inherits from BaseTransformer.""" - - ta.classes.assert_inheritance(concatenate_str, tubular.base.BaseTransformer), - "StringConcatenator is not instance of tubular.base.BaseTransformer" - - def test_super_init_call(self, mocker): - """Test that BaseTransformer.init us called as expected.""" - - expected_call_args = { - 0: { - "args": (), - "kwargs": {"columns": ["a", "b"], "copy": True}, - } - } - - with ta.functions.assert_function_call( - mocker, tubular.base.BaseTransformer, "__init__", expected_call_args - ): - - StringConcatenator(columns=["a", "b"], new_column="merged_column") - - def test_merged_values_attribute_set(self, concatenate_str): - """Test that the new column name passed in the new column arg is set as an attribute of the same name.""" - - assert ( - concatenate_str.new_column == "merged_values" - ), "unexpected value set to new_column attribute" - - @pytest.mark.parametrize("new_column", [1, True, ["a", "b"], 2.0]) - def test_warning_new_column_str(self, new_column): - """Test that an exception is raised if new_column is not a str""" - - df = d.create_df_1() - - with pytest.raises( - TypeError, - match="StringConcatenator: new_column should be a str", - ): - - x = StringConcatenator(columns=["a", "b"], new_column=new_column) - x.transform(df) - - @pytest.mark.parametrize("separator", [0.0, False, ["a", "b"], 7]) - def test_warning_seperator_str(self, separator): - """Test that an exception is raised if separator is not a str""" - - df = d.create_df_1() - - with pytest.raises( - TypeError, match="StringConcatenator: The separator should be a 
str" - ): - x = StringConcatenator( - columns=["a", "b"], new_column="new_column", separator=separator - ) - x.transform(df) - - -class TestTransform: - """Tests for the StringConcatenator.transform method.""" - - def test_arguments(self): - """Test that transform has expected arguments.""" - - ta.functions.test_function_arguments( - func=StringConcatenator.transform, - expected_arguments=["self", "X"], - expected_default_values=None, - ) - - def test_super_transform_called(self, mocker, concatenate_str): - """Test that BaseTransformer.transform called.""" - - df = d.create_df_7() - - expected_call_args = {0: {"args": (d.create_df_7(),), "kwargs": {}}} - - with ta.functions.assert_function_call( - mocker, tubular.base.BaseTransformer, "transform", expected_call_args - ): - - concatenate_str.transform(df) - - def test_correct_df_returned_1(self, concatenate_str): - """Test that correct df is returned after transformation""" - - df = d.create_df_1() - - df_transformed = concatenate_str.transform(df) - - expected_df = df.copy() - expected_df["merged_values"] = ["1 a", "2 b", "3 c", "4 d", "5 e", "6 f"] - - ta.equality.assert_frame_equal_msg( - df_transformed, - expected_df, - "Incorrect dataframe returned after StringConcatenator transform", - ) - - def test_correct_df_returned_2(self): - """Test that correct df is returned after transformation""" - - df = d.create_df_1() - - x = StringConcatenator( - columns=["a", "b"], new_column="merged_values", separator=":" - ) - df_transformed = x.transform(df) - - expected_df = df.copy() - expected_df["merged_values"] = ["1:a", "2:b", "3:c", "4:d", "5:e", "6:f"] - - ta.equality.assert_frame_equal_msg( - df_transformed, - expected_df, - "Incorrect dataframe returned after StringConcatenator transform", - ) diff --git a/tests/test_data.py b/tests/test_data.py index ea9f0674..829d9920 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -29,20 +29,6 @@ def create_zeros_array(shape=(10, 3)): return arr -def 
create_numeric_df_1(): - """Example with numeric dataframe""" - df = pd.DataFrame( - { - "a": [34.48, 21.71, 32.83, 1.08, 32.93, 4.74, 2.76, 75.7, 14.08, 61.31], - "b": [12.03, 20.32, 24.12, 24.18, 68.99, 0.0, 0.0, 59.46, 11.02, 60.68], - "c": [17.06, 12.25, 19.15, 29.73, 1.98, 8.23, 15.22, 20.59, 3.82, 39.73], - "d": [25.94, 70.22, 72.94, 64.55, 0.41, 13.62, 30.22, 4.6, 67.13, 10.38], - "e": [94.3, 4.18, 51.7, 16.63, 2.6, 16.57, 3.51, 30.79, 66.19, 25.44], - } - ) - return df - - def create_df_1(): """Create simple DataFrame with the following... @@ -197,19 +183,6 @@ def create_df_10(): return df -def create_df_11(): - """Create simple DataFrame to use in other tests""" - - df = pd.DataFrame( - { - "a": [1, 2, 3], - "b": [4, 5, 6], - } - ) - - return df - - def create_large_null_df(n_col=1000): """Create large single row df with all null values. @@ -265,26 +238,6 @@ def create_large_half_null_df(n_col=1000): return data_df -def create_weighted_imputers_test_df(): - """Create DataFrame to use imputer tests that correct values are imputed for weighted dataframes - - weight column contains the weights between 0 and 1 - """ - - df = pd.DataFrame( - { - "a": [1.0, 1.0, 1.0, 3.0, 5.0, 5.0], - "b": ["a", "a", "a", "d", "e", "e"], - "c": ["a", "a", np.nan, np.nan, np.nan, "f"], - "d": [1.0, 5.0, 3.0, np.nan, np.nan, 1.0], - "response": [0, 1, 0, 1, 1, 1], - "weight": [0.1, 0.1, 0.8, 0.5, 0.9, 0.8], - } - ) - - return df - - def create_MeanResponseTransformer_test_df(): """Create DataFrame to use MeanResponseTransformer tests that correct values are diff --git a/tests/test_transformers.py b/tests/test_transformers.py index 43de6fd3..293bdbd7 100644 --- a/tests/test_transformers.py +++ b/tests/test_transformers.py @@ -2,7 +2,6 @@ import pytest import tubular.base as base import tubular.capping as capping -import tubular.comparison as comparison import tubular.dates as dates import tubular.imputers as imputers import tubular.mapping as mapping @@ -26,7 +25,6 @@ def 
ListOfTransformers(): ), capping.CappingTransformer(capping_values={"a": [0.1, 0.2]}), capping.OutOfRangeNullTransformer(capping_values={"a": [0.1, 0.2]}), - comparison.EqualityChecker(columns=["a", "b"], new_col_name="c"), dates.DateDiffLeapYearTransformer( column_lower="a", column_upper="b", new_column_name="c", drop_cols=True ), @@ -34,7 +32,6 @@ def ListOfTransformers(): column_lower="a", column_upper="b", new_column_name="c", units="D" ), dates.ToDatetimeTransformer(column="a", new_column_name="b"), - dates.DatetimeInfoExtractor(columns="a"), dates.SeriesDtMethodTransformer( new_column_name="a", pd_method_name="month", column="b" ), @@ -44,18 +41,12 @@ def ListOfTransformers(): column_between="c", new_column_name="c", ), - dates.DatetimeSinusoidCalculator( - "a", - "sin", - "month", - 12, - ), imputers.BaseImputer(), imputers.ArbitraryImputer(impute_value=1, columns="a"), imputers.MedianImputer(columns="a"), imputers.MeanImputer(columns="a"), imputers.ModeImputer(columns="a"), - imputers.NearestMeanResponseImputer(columns="a"), + imputers.NearestMeanResponseImputer(response_column="a"), imputers.NullIndicator(columns="a"), mapping.BaseMappingTransformer(mappings={"a": {1: 2, 3: 4}}), mapping.BaseMappingTransformMixin(), @@ -70,20 +61,14 @@ def ListOfTransformers(): adjust_column="b", mappings={"a": {1: 2, 3: 4}} ), misc.SetValueTransformer(columns="a", value=1), - misc.SetColumnDtype(columns="a", dtype=str), nominal.BaseNominalTransformer(), nominal.NominalToIntegerTransformer(columns="a"), nominal.GroupRareLevelsTransformer(columns="a"), - nominal.MeanResponseTransformer(columns="a"), - nominal.OrdinalEncoderTransformer(columns="a"), + nominal.MeanResponseTransformer(columns="a", response_column="b"), + nominal.OrdinalEncoderTransformer(columns="a", response_column="b"), nominal.OneHotEncodingTransformer(columns="a"), numeric.LogTransformer(columns="a"), numeric.CutTransformer(column="a", new_column_name="b"), - numeric.TwoColumnOperatorTransformer( - 
pd_method_name="add", - columns=["a", "b"], - new_column_name="c", - ), numeric.ScalingTransformer(columns="a", scaler_type="standard"), strings.SeriesStrMethodTransformer( new_column_name="a", @@ -91,7 +76,6 @@ def ListOfTransformers(): columns="b", pd_method_kwargs={"sub": "a"}, ), - strings.StringConcatenator(columns=["a", "b"], new_column="c"), ] return list_of_transformers @@ -112,11 +96,3 @@ def test_clone(self, transformer): """ b.clone(transformer) - - @pytest.mark.parametrize("transformer", ListOfTransformers()) - def test_unexpected_kwarg(self, transformer): - """ - Test that transformer can be used in sklearn.base.clone function. - """ - - b.clone(transformer) diff --git a/tubular/_version.py b/tubular/_version.py index e19434e2..f9aa3e11 100644 --- a/tubular/_version.py +++ b/tubular/_version.py @@ -1 +1 @@ -__version__ = "0.3.3" +__version__ = "0.3.2" diff --git a/tubular/base.py b/tubular/base.py index afc54550..fd90a082 100644 --- a/tubular/base.py +++ b/tubular/base.py @@ -30,6 +30,9 @@ class BaseTransformer(TransformerMixin, BaseEstimator): verbose : bool, default = False Should statements be printed when methods are run? + **kwds + Arbitrary keyword arguments. 
+ Attributes ---------- columns : list or None @@ -47,17 +50,13 @@ class BaseTransformer(TransformerMixin, BaseEstimator): """ - def classname(self): - """Method that returns the name of the current class when called""" - return type(self).__name__ - - def __init__(self, columns=None, copy=True, verbose=False): + def __init__(self, columns=None, copy=True, verbose=False, **kwargs): self.version_ = __version__ if not isinstance(verbose, bool): - raise TypeError(f"{self.classname()}: verbose must be a bool") + raise TypeError("verbose must be a bool") else: @@ -82,14 +81,14 @@ def __init__(self, columns=None, copy=True, verbose=False): if not len(columns) > 0: - raise ValueError(f"{self.classname()}: columns has no values") + raise ValueError("columns has no values") for c in columns: if not isinstance(c, str): raise TypeError( - f"{self.classname()}: each element of columns should be a single (string) column name" + "each element of columns should be a single (string) column name" ) self.columns = columns @@ -97,12 +96,12 @@ def __init__(self, columns=None, copy=True, verbose=False): else: raise TypeError( - f"{self.classname()}: columns must be a string or list with the columns to be pre-processed (if specified)" + "columns must be a string or list with the columns to be pre-processed (if specified)" ) if not isinstance(copy, bool): - raise TypeError(f"{self.classname()}: copy must be a bool") + raise TypeError("copy must be a bool") else: @@ -133,19 +132,17 @@ def fit(self, X, y=None): if not X.shape[0] > 0: - raise ValueError(f"{self.classname()}: X has no rows; {X.shape}") + raise ValueError(f"X has no rows; {X.shape}") if y is not None: if not isinstance(y, pd.Series): - raise TypeError( - f"{self.classname()}: unexpected type for y, should be a pd.Series" - ) + raise TypeError("unexpected type for y, should be a pd.Series") if not y.shape[0] > 0: - raise ValueError(f"{self.classname()}: y is empty; {y.shape}") + raise ValueError(f"y is empty; {y.shape}") 
return self @@ -169,21 +166,21 @@ def _combine_X_y(self, X, y): if not isinstance(X, pd.DataFrame): - raise TypeError(f"{self.classname()}: X should be a pd.DataFrame") + raise TypeError("X should be a pd.DataFrame") if not isinstance(y, pd.Series): - raise TypeError(f"{self.classname()}: y should be a pd.Series") + raise TypeError("y should be a pd.Series") if X.shape[0] != y.shape[0]: raise ValueError( - f"{self.classname()}: X and y have different numbers of rows ({X.shape[0]} vs {y.shape[0]})" + f"X and y have different numbers of rows ({X.shape[0]} vs {y.shape[0]})" ) if not (X.index == y.index).all(): - warnings.warn(f"{self.classname()}: X and y do not have equal indexes") + warnings.warn("X and y do not have equal indexes") X_y = X.copy() @@ -220,7 +217,7 @@ def transform(self, X): if not X.shape[0] > 0: - raise ValueError(f"{self.classname()}: X has no rows; {X.shape}") + raise ValueError(f"X has no rows; {X.shape}") return X @@ -251,21 +248,21 @@ def columns_check(self, X): if not isinstance(X, pd.DataFrame): - raise TypeError(f"{self.classname()}: X should be a pd.DataFrame") + raise TypeError("X should be a pd.DataFrame") if self.columns is None: - raise ValueError(f"{self.classname()}: columns not set") + raise ValueError("columns not set") if not isinstance(self.columns, list): - raise TypeError(f"{self.classname()}: self.columns should be a list") + raise TypeError("self.columns should be a list") for c in self.columns: if c not in X.columns.values: - raise ValueError(f"{self.classname()}: variable " + c + " is not in X") + raise ValueError("variable " + c + " is not in X") def columns_set_or_check(self, X): """Function to check or set columns attribute. 
@@ -281,7 +278,7 @@ def columns_set_or_check(self, X): if not isinstance(X, pd.DataFrame): - raise TypeError(f"{self.classname()}: X should be a pd.DataFrame") + raise TypeError("X should be a pd.DataFrame") if self.columns is None: @@ -291,40 +288,6 @@ def columns_set_or_check(self, X): self.columns_check(X) - @staticmethod - def check_weights_column(X, weights_column): - """Helper method for validating weights column in dataframe - - Args: - X (pd.DataFrame): df containing weight column - weights_column (str): name of weight column - - """ - - if weights_column is not None: - - # check if given weight is in columns - if weights_column not in X.columns: - - raise ValueError( - f"weight col ({weights_column}) is not present in columns of data" - ) - - # check weight is numeric - elif not pd.api.types.is_numeric_dtype(X[weights_column]): - - raise ValueError("weight column must be numeric.") - - # check weight is positive - elif not (X[weights_column] < 0).sum() == 0: - - raise ValueError("weight column must be positive") - - # check weight non-null - elif not (X[weights_column].isnull()).sum() == 0: - - raise ValueError("weight column must be non-null") - class ReturnKeyDict(dict): """Dict class that implements __missing__ method to return the key if it is not present in the dict @@ -418,25 +381,25 @@ def __init__( if not type(item) is str: raise TypeError( - f"{self.classname()}: if new_column_name is a list, all elements must be strings but got {type(item)} in position {i}" + f"if new_column_name is a list, all elements must be strings but got {type(item)} in position {i}" ) elif not type(new_column_name) is str: raise TypeError( - f"{self.classname()}: unexpected type ({type(new_column_name)}) for new_column_name, must be str or list of strings" + f"unexpected type ({type(new_column_name)}) for new_column_name, must be str or list of strings" ) if not type(pd_method_name) is str: raise TypeError( - f"{self.classname()}: unexpected type ({type(pd_method_name)}) 
for pd_method_name, expecting str" + f"unexpected type ({type(pd_method_name)}) for pd_method_name, expecting str" ) if not type(pd_method_kwargs) is dict: raise TypeError( - f"{self.classname()}: pd_method_kwargs should be a dict but got type {type(pd_method_kwargs)}" + f"pd_method_kwargs should be a dict but got type {type(pd_method_kwargs)}" ) else: @@ -446,13 +409,13 @@ def __init__( if not type(k) is str: raise TypeError( - f"{self.classname()}: unexpected type ({type(k)}) for pd_method_kwargs key in position {i}, must be str" + f"unexpected type ({type(k)}) for pd_method_kwargs key in position {i}, must be str" ) if not type(drop_original) is bool: raise TypeError( - f"{self.classname()}: unexpected type ({type(drop_original)}) for drop_original, expecting bool" + f"unexpected type ({type(drop_original)}) for drop_original, expecting bool" ) self.new_column_name = new_column_name @@ -468,7 +431,7 @@ def __init__( except Exception as err: raise AttributeError( - f"""{self.classname()}: error accessing "{pd_method_name}" method on pd.DataFrame object - pd_method_name should be a pd.DataFrame method""" + f"""error accessing "{pd_method_name}" method on pd.DataFrame object - pd_method_name should be a pd.DataFrame method""" ) from err def transform(self, X): diff --git a/tubular/capping.py b/tubular/capping.py index 4906f5a5..d955a45a 100644 --- a/tubular/capping.py +++ b/tubular/capping.py @@ -67,14 +67,14 @@ def __init__( if capping_values is None and quantiles is None: raise ValueError( - f"{self.classname()}: both capping_values and quantiles are None, either supply capping values in the " + "both capping_values and quantiles are None, either supply capping values in the " "capping_values argument or supply quantiles that can be learnt in the fit method" ) if capping_values is not None and quantiles is not None: raise ValueError( - f"{self.classname()}: both capping_values and quantiles are not None, supply one or the other" + "both capping_values and 
quantiles are not None, supply one or the other" ) if capping_values is not None: @@ -98,7 +98,7 @@ def __init__( if quantile_value < 0 or quantile_value > 1: raise ValueError( - f"{self.classname()}: quantile values must be in the range [0, 1] but got {quantile_value} for key {k}" + f"quantile values must be in the range [0, 1] but got {quantile_value} for key {k}" ) self.capping_values = {} @@ -114,28 +114,26 @@ def check_capping_values_dict(self, capping_values_dict, dict_name): if type(capping_values_dict) is not dict: - raise TypeError( - f"{self.classname()}: {dict_name} should be dict of columns and capping values" - ) + raise TypeError(f"{dict_name} should be dict of columns and capping values") for k, cap_values in capping_values_dict.items(): if type(k) is not str: raise TypeError( - f"{self.classname()}: all keys in {dict_name} should be str, but got {type(k)}" + f"all keys in {dict_name} should be str, but got {type(k)}" ) if type(cap_values) is not list: raise TypeError( - f"{self.classname()}: each item in {dict_name} should be a list, but got {type(cap_values)} for key {k}" + f"each item in {dict_name} should be a list, but got {type(cap_values)} for key {k}" ) if len(cap_values) != 2: raise ValueError( - f"{self.classname()}: each item in {dict_name} should be length 2, but got {len(cap_values)} for key {k}" + f"each item in {dict_name} should be length 2, but got {len(cap_values)} for key {k}" ) for cap_value in cap_values: @@ -145,13 +143,13 @@ def check_capping_values_dict(self, capping_values_dict, dict_name): if type(cap_value) not in [int, float]: raise TypeError( - f"{self.classname()}: each item in {dict_name} lists must contain numeric values or None, got {type(cap_value)} for key {k}" + f"each item in {dict_name} lists must contain numeric values or None, got {type(cap_value)} for key {k}" ) if np.isnan(cap_value) or np.isinf(cap_value): raise ValueError( - f"{self.classname()}: item in {dict_name} lists contains numpy NaN or Inf values" + 
f"item in {dict_name} lists contains numpy NaN or Inf values" ) if all([cap_value is not None for cap_value in cap_values]): @@ -159,14 +157,12 @@ def check_capping_values_dict(self, capping_values_dict, dict_name): if cap_values[0] >= cap_values[1]: raise ValueError( - f"{self.classname()}: lower value is greater than or equal to upper value for key {k}" + f"lower value is greater than or equal to upper value for key {k}" ) if all([cap_value is None for cap_value in cap_values]): - raise ValueError( - f"{self.classname()}: both values are None for key {k}" - ) + raise ValueError(f"both values are None for key {k}") def fit(self, X, y=None): """Learn capping values from input data X. @@ -207,9 +203,7 @@ def fit(self, X, y=None): else: - warnings.warn( - f"{self.classname()}: quantiles not set so no fitting done in CappingTransformer" - ) + warnings.warn("quantiles not set so no fitting done in CappingTransformer") self._replacement_values = copy.deepcopy(self.capping_values) @@ -328,18 +322,16 @@ def weighted_quantile(self, values, quantiles, sample_weight=None): sample_weight = np.array(sample_weight) if np.isnan(sample_weight).sum() > 0: - raise ValueError(f"{self.classname()}: null values in sample weights") + raise ValueError("null values in sample weights") if np.isinf(sample_weight).sum() > 0: - raise ValueError(f"{self.classname()}: infinite values in sample weights") + raise ValueError("infinite values in sample weights") if (sample_weight < 0).sum() > 0: - raise ValueError(f"{self.classname()}: negative weights in sample weights") + raise ValueError("negative weights in sample weights") if sample_weight.sum() <= 0: - raise ValueError( - f"{self.classname()}: total sample weights are not greater than 0" - ) + raise ValueError("total sample weights are not greater than 0") values = np.array(values) quantiles = np.array(quantiles) @@ -387,13 +379,13 @@ def transform(self, X): if self.capping_values == {}: raise ValueError( - f"{self.classname()}: 
capping_values attribute is an empty dict - perhaps the fit method has not been run yet" + "capping_values attribute is an empty dict - perhaps the fit method has not been run yet" ) if self._replacement_values == {}: raise ValueError( - f"{self.classname()}: _replacement_values attribute is an empty dict - perhaps the fit method has not been run yet" + "_replacement_values attribute is an empty dict - perhaps the fit method has not been run yet" ) X = super().transform(X) @@ -409,7 +401,7 @@ def transform(self, X): ) raise TypeError( - f"{self.classname()}: The following columns are not numeric in X; {non_numeric_columns}" + f"The following columns are not numeric in X; {non_numeric_columns}" ) for col in self.columns: diff --git a/tubular/comparison.py b/tubular/comparison.py deleted file mode 100644 index 457fc0a4..00000000 --- a/tubular/comparison.py +++ /dev/null @@ -1,72 +0,0 @@ -from tubular.base import BaseTransformer -import pandas as pd - - -class EqualityChecker(BaseTransformer): - - """Transformer to check if two columns are equal. - - Parameters - ---------- - columns: list - List containing names of the two columns to check. - - new_col_name: string - string containing the name of the new column. - - drop_original: boolean = False - boolean representing dropping the input columns from X after checks. - - **kwargs: - Arbitrary keyword arguments passed onto BaseTransformer.init method. 
- - """ - - def __init__( - self, columns: list, new_col_name: str, drop_original: bool = False, **kwargs - ) -> None: - - super().__init__(columns=columns, **kwargs) - - if not (isinstance(columns, list)): - raise TypeError(f"{self.classname()}: columns should be list") - - if len(columns) != 2: - raise ValueError( - f"{self.classname()}: This transformer works with two columns only" - ) - - if not (isinstance(new_col_name, str)): - raise TypeError(f"{self.classname()}: new_col_name should be str") - - if not (isinstance(drop_original, bool)): - raise TypeError(f"{self.classname()}: drop_original should be bool") - - self.new_col_name = new_col_name - self.drop_original = drop_original - - def transform(self, X: pd.DataFrame) -> pd.DataFrame: - - """Create a column which is populated by the boolean - matching between two columns iterated over rows. - - Parameters - ---------- - X : pd.DataFrame - Data to apply mappings to. - - Returns - ------- - X : pd.DataFrame - Transformed input X with additional boolean column. 
- - """ - X = super().transform(X) - - X[self.new_col_name] = X[self.columns[0]] == X[self.columns[1]] - - if self.drop_original: - - X.drop(self.columns, axis=1, inplace=True) - - return X diff --git a/tubular/dates.py b/tubular/dates.py index 7aa225c7..43a29ba4 100644 --- a/tubular/dates.py +++ b/tubular/dates.py @@ -7,8 +7,6 @@ import numpy as np import pandas as pd -from typing import Union, List - from tubular.base import BaseTransformer @@ -68,21 +66,21 @@ def __init__( ): if not isinstance(column_lower, str): - raise TypeError(f"{self.classname()}: column_lower should be a str") + raise TypeError("column_lower should be a str") if not isinstance(column_upper, str): - raise TypeError(f"{self.classname()}: column_upper should be a str") + raise TypeError("column_upper should be a str") if not isinstance(new_column_name, str): - raise TypeError(f"{self.classname()}: new_column_name should be a str") + raise TypeError("new_column_name should be a str") if not isinstance(drop_cols, bool): - raise TypeError(f"{self.classname()}: drop_cols should be a bool") + raise TypeError("drop_cols should be a bool") if missing_replacement: if not type(missing_replacement) in [int, float, str]: raise TypeError( - f"{self.classname()}: if not None, missing_replacement should be an int, float or string" + "if not None, missing_replacement should be an int, float or string" ) super().__init__(columns=[column_lower, column_upper], **kwargs) @@ -115,7 +113,7 @@ def calculate_age(self, row): """ if not isinstance(row, pd.Series): - raise TypeError(f"{self.classname()}: row should be a pd.Series") + raise TypeError("row should be a pd.Series") if (pd.isnull(row[self.columns[0]])) or (pd.isnull(row[self.columns[1]])): return self.missing_replacement @@ -124,12 +122,12 @@ def calculate_age(self, row): if not type(row[self.columns[1]]) in [datetime.date, datetime.datetime]: raise TypeError( - f"{self.classname()}: upper column values should be datetime.datetime or datetime.date objects" 
+ "upper column values should be datetime.datetime or datetime.date objects" ) if not type(row[self.columns[0]]) in [datetime.date, datetime.datetime]: raise TypeError( - f"{self.classname()}: lower column values should be datetime.datetime or datetime.date objects" + "lower column values should be datetime.datetime or datetime.date objects" ) age = row[self.columns[1]].year - row[self.columns[0]].year @@ -208,11 +206,11 @@ def __init__( if not type(column_lower) is str: - raise TypeError(f"{self.classname()}: column_lower must be a str") + raise TypeError("column_lower must be a str") if not type(column_upper) is str: - raise TypeError(f"{self.classname()}: column_upper must be a str") + raise TypeError("column_upper must be a str") columns = [column_lower, column_upper] @@ -227,12 +225,12 @@ def __init__( if not type(units) is str: - raise TypeError(f"{self.classname()}: units must be a str") + raise TypeError("units must be a str") if units not in accepted_values_units: raise ValueError( - f"{self.classname()}: units must be one of {accepted_values_units}, got {units}" + f"units must be one of {accepted_values_units}, got {units}" ) self.units = units @@ -241,7 +239,7 @@ def __init__( if not type(new_column_name) is str: - raise TypeError(f"{self.classname()}: new_column_name must be a str") + raise TypeError("new_column_name must be a str") self.new_column_name = new_column_name @@ -301,17 +299,17 @@ def __init__(self, column, new_column_name, to_datetime_kwargs={}, **kwargs): if not type(column) is str: raise TypeError( - f"{self.classname()}: column should be a single str giving the column to transform to datetime" + "column should be a single str giving the column to transform to datetime" ) if not type(new_column_name) is str: - raise TypeError(f"{self.classname()}: new_column_name must be a str") + raise TypeError("new_column_name must be a str") if not type(to_datetime_kwargs) is dict: raise TypeError( - f"{self.classname()}: to_datetime_kwargs should be 
a dict but got type {type(to_datetime_kwargs)}" + f"to_datetime_kwargs should be a dict but got type {type(to_datetime_kwargs)}" ) else: @@ -321,7 +319,7 @@ def __init__(self, column, new_column_name, to_datetime_kwargs={}, **kwargs): if not type(k) is str: raise TypeError( - f"{self.classname()}: unexpected type ({type(k)}) for to_datetime_kwargs key in position {i}, must be str" + f"unexpected type ({type(k)}) for to_datetime_kwargs key in position {i}, must be str" ) self.to_datetime_kwargs = to_datetime_kwargs @@ -412,28 +410,26 @@ def __init__( if type(column) is not str: - raise TypeError( - f"{self.classname()}: column should be a str but got {type(column)}" - ) + raise TypeError(f"column should be a str but got {type(column)}") super().__init__(columns=column, **kwargs) if type(new_column_name) is not str: raise TypeError( - f"{self.classname()}: unexpected type ({type(new_column_name)}) for new_column_name, must be str" + f"unexpected type ({type(new_column_name)}) for new_column_name, must be str" ) if type(pd_method_name) is not str: raise TypeError( - f"{self.classname()}: unexpected type ({type(pd_method_name)}) for pd_method_name, expecting str" + f"unexpected type ({type(pd_method_name)}) for pd_method_name, expecting str" ) if type(pd_method_kwargs) is not dict: raise TypeError( - f"{self.classname()}: pd_method_kwargs should be a dict but got type {type(pd_method_kwargs)}" + f"pd_method_kwargs should be a dict but got type {type(pd_method_kwargs)}" ) else: @@ -443,7 +439,7 @@ def __init__( if not type(k) is str: raise TypeError( - f"{self.classname()}: unexpected type ({type(k)}) for pd_method_kwargs key in position {i}, must be str" + f"unexpected type ({type(k)}) for pd_method_kwargs key in position {i}, must be str" ) self.new_column_name = new_column_name @@ -458,7 +454,7 @@ def __init__( except Exception as err: raise AttributeError( - f"""{self.classname()}: error accessing "dt.{pd_method_name}" method on pd.Series object - pd_method_name 
should be a pd.Series.dt method""" + f"""error accessing "dt.{pd_method_name}" method on pd.Series object - pd_method_name should be a pd.Series.dt method""" ) from err if callable(getattr(ser.dt, pd_method_name)): @@ -582,22 +578,22 @@ def __init__( ): if type(column_lower) is not str: - raise TypeError(f"{self.classname()}: column_lower should be str") + raise TypeError("column_lower should be str") if type(column_between) is not str: - raise TypeError(f"{self.classname()}: column_between should be str") + raise TypeError("column_between should be str") if type(column_upper) is not str: - raise TypeError(f"{self.classname()}: column_upper should be str") + raise TypeError("column_upper should be str") if type(new_column_name) is not str: - raise TypeError(f"{self.classname()}: new_column_name should be str") + raise TypeError("new_column_name should be str") if type(lower_inclusive) is not bool: - raise TypeError(f"{self.classname()}: lower_inclusive should be a bool") + raise TypeError("lower_inclusive should be a bool") if type(upper_inclusive) is not bool: - raise TypeError(f"{self.classname()}: upper_inclusive should be a bool") + raise TypeError("upper_inclusive should be a bool") self.new_column_name = new_column_name self.lower_inclusive = lower_inclusive @@ -637,13 +633,13 @@ def transform(self, X): if not pd.api.types.is_datetime64_dtype(X[col]): raise TypeError( - f"{self.classname()}: {col} should be datetime64[ns] type but got {X[col].dtype}" + f"{col} should be datetime64[ns] type but got {X[col].dtype}" ) if not (X[self.columns[0]] <= X[self.columns[2]]).all(): warnings.warn( - f"{self.classname()}: not all {self.columns[2]} are greater than or equal to {self.columns[0]}" + f"not all {self.columns[2]} are greater than or equal to {self.columns[0]}" ) if self.lower_inclusive: @@ -665,464 +661,3 @@ def transform(self, X): X[self.new_column_name] = lower_comparison & upper_comparison return X - - -class DatetimeInfoExtractor(BaseTransformer): - 
"""Transformer to extract various features from datetime var - - Parameters - ---------- - columns : str or list - datetime columns to extract information from - - include : list of str, default = ["timeofday", "timeofmonth", "timeofyear", "dayofweek"] - Which datetime categorical information to extract - - datetime_mappings : dict, default = {} - Optional argument to define custom mappings for datetime values. - Keys of the dictionary must be contained in `include` - All possible values of each feature must be included in the mappings, - ie, a mapping for `dayofweek` must include all values 0-6; - datetime_mappings = {"dayofweek": {"week": [0, 1, 2, 3, 4], - "weekend": [5, 6]}} - The values for the mapping array must be iterable; - datetime_mappings = {"timeofday": {"am": range(0, 12), - "pm": range(12, 24)}} - The required ranges for each mapping are: - timeofday: 0-23 - timeofmonth: 1-31 - timeofyear: 1-12 - dayofweek: 0-6 - - If in include but no mappings provided default values will be used as follows: - timeofday_mapping = { - "night": range(0, 6), # Midnight - 6am - "morning": range(6, 12), # 6am - Noon - "afternoon": range(12, 18), # Noon - 6pm - "evening": range(18, 24), # 6pm - Midnight - } - timeofmonth_mapping = { - "start": range(0, 11), - "middle": range(11, 21), - "end": range(21, 32), - } - timeofyear_mapping = { - "spring": range(3, 6), # Mar, Apr, May - "summer": range(6, 9), # Jun, Jul, Aug - "autumn": range(9, 12), # Sep, Oct, Nov - "winter": [12, 1, 2], # Dec, Jan, Feb - } - dayofweek_mapping = { - "monday": [0], - "tuesday": [1], - "wednesday": [2], - "thursday": [3], - "friday": [4], - "saturday": [5], - "sunday": [6], - } - - - **kwargs - Arbitrary keyword arguments passed onto BaseTransformer.init method. 
- - Attributes - ---------- - include : list of str, default = ["timeofday", "timeofmonth", "timeofyear", "dayofweek"] - Which datetime categorical information to extract - - datetime_mappings : dict, default = {} - Optional argument to define custom mappings for datetime values. - - """ - - def __init__( - self, - columns, - include=["timeofday", "timeofmonth", "timeofyear", "dayofweek"], - datetime_mappings={}, - **kwargs, - ): - - if not type(include) is list: - raise TypeError(f"{self.classname()}: include should be List") - - if not type(datetime_mappings) is dict: - raise TypeError(f"{self.classname()}: datetime_mappings should be Dict") - - super().__init__(columns=columns, **kwargs) - - for var in include: - if var not in [ - "timeofday", - "timeofmonth", - "timeofyear", - "dayofweek", - ]: - raise ValueError( - f'{self.classname()}: elements in include should be in ["timeofday", "timeofmonth", "timeofyear", "dayofweek"]' - ) - - if datetime_mappings != {}: - for key, mapping in datetime_mappings.items(): - if not type(mapping) is dict: - raise TypeError( - f"{self.classname()}: values in datetime_mappings should be dict" - ) - if key not in include: - raise ValueError( - f"{self.classname()}: keys in datetime_mappings should be in include" - ) - - self.include = include - self.datetime_mappings = datetime_mappings - self.mappings_provided = self.datetime_mappings.keys() - - # Select correct mapping either from default or user input - - if ("timeofday" in include) and ("timeofday" in self.mappings_provided): - timeofday_mapping = self.datetime_mappings["timeofday"] - elif "timeofday" in include: # Choose default mapping - timeofday_mapping = { - "night": range(0, 6), # Midnight - 6am - "morning": range(6, 12), # 6am - Noon - "afternoon": range(12, 18), # Noon - 6pm - "evening": range(18, 24), # 6pm - Midnight - } - - if ("timeofmonth" in include) and ("timeofmonth" in self.mappings_provided): - timeofmonth_mapping = self.datetime_mappings["timeofmonth"] - 
elif "timeofmonth" in include: # Choose default mapping - timeofmonth_mapping = { - "start": range(0, 11), - "middle": range(11, 21), - "end": range(21, 32), - } - - if ("timeofyear" in include) and ("timeofyear" in self.mappings_provided): - timeofyear_mapping = self.datetime_mappings["timeofyear"] - elif "timeofyear" in include: # Choose default mapping - timeofyear_mapping = { - "spring": range(3, 6), # Mar, Apr, May - "summer": range(6, 9), # Jun, Jul, Aug - "autumn": range(9, 12), # Sep, Oct, Nov - "winter": [12, 1, 2], # Dec, Jan, Feb - } - - if ("dayofweek" in include) and ("dayofweek" in self.mappings_provided): - dayofweek_mapping = self.datetime_mappings["dayofweek"] - elif "dayofweek" in include: # Choose default mapping - dayofweek_mapping = { - "monday": [0], - "tuesday": [1], - "wednesday": [2], - "thursday": [3], - "friday": [4], - "saturday": [5], - "sunday": [6], - } - - # Invert dictionaries for quicker lookup - - if "timeofday" in include: - self.timeofday_mapping = { - vi: k for k, v in timeofday_mapping.items() for vi in v - } - if set(self.timeofday_mapping.keys()) != set(range(24)): - raise ValueError( - "{}: timeofday mapping dictionary should contain mapping for all hours between 0-23. {} are missing".format( - self.classname(), - set(range(24)) - set(self.timeofday_mapping.keys()), - ) - ) - # Check if all hours in dictionary - else: - self.timeofday_mapping = {} - - if "timeofmonth" in include: - self.timeofmonth_mapping = { - vi: k for k, v in timeofmonth_mapping.items() for vi in v - } - if set(self.timeofmonth_mapping.keys()) != set(range(32)): - raise ValueError( - "{}: timeofmonth mapping dictionary should contain mapping for all days between 1-31. 
{} are missing".format( - self.classname(), - set(range(1, 32)) - set(self.timeofmonth_mapping.keys()), - ) - ) - else: - self.timeofmonth_mapping = {} - - if "timeofyear" in include: - self.timeofyear_mapping = { - vi: k for k, v in timeofyear_mapping.items() for vi in v - } - if set(self.timeofyear_mapping.keys()) != set(range(1, 13)): - raise ValueError( - "{}: timeofyear mapping dictionary should contain mapping for all months between 1-12. {} are missing".format( - self.classname(), - set(range(1, 13)) - set(self.timeofyear_mapping.keys()), - ) - ) - else: - self.timeofyear_mapping = {} - - if "dayofweek" in include: - self.dayofweek_mapping = { - vi: k for k, v in dayofweek_mapping.items() for vi in v - } - if set(self.dayofweek_mapping.keys()) != set(range(7)): - raise ValueError( - "{}: dayofweek mapping dictionary should contain mapping for all days between 0-6. {} are missing".format( - self.classname(), - set(range(7)) - set(self.dayofweek_mapping.keys()), - ) - ) - else: - self.dayofweek_mapping = {} - - def _map_values(self, value, interval: str): - - """ - Method to apply mappings for a specified interval ("timeofday", "timeofmonth", "timeofyear" or "dayofweek") - from corresponding mapping attribute to a single value. 
- - Parameters - ---------- - interval : str - the time period to map "timeofday", "timeofmonth", "timeofyear" or "dayofweek" - - value : float or int - the value to be mapped - - - Returns - ------- - str : str - Mapped value - """ - - if not type(value) is float: - if not type(value) is int: - raise TypeError(f"{self.classname()}: value should be float or int") - - errors = { - "timeofday": "0-23", - "dayofweek": "0-6", - "timeofmonth": "1-31", - "timeofyear": "1-12", - } - ranges = { - "timeofday": (0, 24, 1), - "dayofweek": (0, 7, 1), - "timeofmonth": (1, 32, 1), - "timeofyear": (1, 13, 1), - } - mappings = { - "timeofday": self.timeofday_mapping, - "dayofweek": self.dayofweek_mapping, - "timeofmonth": self.timeofmonth_mapping, - "timeofyear": self.timeofyear_mapping, - } - - if not np.isnan(value): - if value not in np.arange(*ranges[interval]): - raise ValueError( - f"{self.classname()}: value for {interval} mapping in self._map_values should be an integer value in {errors[interval]}" - ) - - if np.isnan(value): - return np.nan - else: - return mappings[interval][value] - - def transform(self, X): - """Transform - Extracts new features from datetime variables - - Parameters - ---------- - X : pd.DataFrame - Data with columns to extract info from. - - Returns - ------- - X : pd.DataFrame - Transformed input X with added columns of extracted information. 
- """ - - X = super().transform(X) - - for col in self.columns: - if not X[col].dtype.name == "datetime64[ns]": - try: - X[col] = X[col].dt.tz_localize(None) - except AttributeError: - raise TypeError( - f"{self.classname()}: values in {col} should be datetime64[ns]" - ) - - for col in self.columns: - if "timeofday" in self.include: - X[col + "_timeofday"] = X[col].dt.hour.apply( - self._map_values, interval="timeofday" - ) - - if "timeofmonth" in self.include: - X[col + "_timeofmonth"] = X[col].dt.day.apply( - self._map_values, interval="timeofmonth" - ) - - if "timeofyear" in self.include: - X[col + "_timeofyear"] = X[col].dt.month.apply( - self._map_values, interval="timeofyear" - ) - - if "dayofweek" in self.include: - X[col + "_dayofweek"] = X[col].dt.weekday.apply( - self._map_values, interval="dayofweek" - ) - - return X - - -class DatetimeSinusoidCalculator(BaseTransformer): - - """ - Transformer to derive a feature in a dataframe by calculating the - sine or cosine of a datetime column in a given unit (e.g hour), with the option to scale - period of the sine or cosine to match the natural period of the unit (e.g. 24). - - Parameters - ---------- - columns : str or list - Columns to take the sine or cosine of. Must be a datetime[64] column. - - method : str or list - Argument to specify which function is to be calculated. Accepted values are 'sin', 'cos' or a list containing both. - - units : str - Which time unit the calculation is to be carried out on. Accepted values are 'year', 'month', - 'day', 'hour', 'minute', 'second', 'microsecond'. - - period : int or float, default = 2*np.pi - The period of the output in the units specified above. To leave the period of the sinusoid output as 2 pi, specify 2*np.pi (or leave as default) - - Attributes - ----------- - columns : str or list - Columns to take the sine or cosine of. - - method : str - The function to be calculated; either sin, cos or a list containing both. 
- - units : str - Which time unit the calculation is to be carried out on. Will take any of 'year', 'month', - 'day', 'hour', 'minute', 'second', 'microsecond'. - - period : str or float, default = 2*np.pi - The period of the output in the units specified above. - """ - - def __init__( - self, - columns: Union[str, List[str]], - method: Union[str, List[str]], - units: str, - period: Union[int, float] = 2 * np.pi, - ): - - super().__init__(columns, copy=True) - - if not isinstance(method, str) and not isinstance(method, list): - raise TypeError( - "{}: method must be a string or list but got {}".format( - self.classname(), type(method) - ) - ) - - if not isinstance(units, str): - raise TypeError( - "{}: units must be a string but got {}".format( - self.classname(), type(units) - ) - ) - - if (not isinstance(period, int)) and (not isinstance(period, float)): - raise TypeError( - "{}: period must be a int or float but got {}".format( - self.classname(), type(period) - ) - ) - - valid_method_list = ["sin", "cos"] - - if isinstance(method, str): - method_list = [method] - else: - method_list = method - - for method in method_list: - if method not in valid_method_list: - raise ValueError( - '{}: Invalid method {} supplied, should be "sin", "cos" or a list containing both'.format( - self.classname(), method - ) - ) - - valid_unit_list = [ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - ] - - if units not in valid_unit_list: - raise ValueError( - "{}: Invalid units {} supplied, should be in {}".format( - self.classname(), units, valid_unit_list - ) - ) - - self.method = method_list - self.units = units - self.period = period - - def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Transform - creates column containing sine or cosine of another datetime column. - - Which function is used is stored in the self.method attribute. - - Parameters - ---------- - X : pd.DataFrame - Data to transform. 
- - Returns - ------- - X : pd.DataFrame - Input X with additional columns added, these are named "_" - """ - - X = super().transform(X) - - for column in self.columns: - if not pd.api.types.is_datetime64_dtype(X[column]): - - raise TypeError( - f"{self.classname()} : {column} should be datetime64[ns] type but got {X[column].dtype}" - ) - - column_in_desired_unit = getattr(X[column].dt, self.units) - - for method in self.method: - - new_column_name = method + "_" + column - - X[new_column_name] = getattr(np, method)( - column_in_desired_unit * (2.0 * np.pi / self.period) - ) - - return X diff --git a/tubular/imputers.py b/tubular/imputers.py index 509a2449..49ebddb2 100644 --- a/tubular/imputers.py +++ b/tubular/imputers.py @@ -4,8 +4,6 @@ import pandas as pd import numpy as np -import warnings - from tubular.base import BaseTransformer @@ -69,9 +67,7 @@ def __init__(self, impute_value, columns, **kwargs): if columns is None: - raise ValueError( - f"{self.classname()}: columns must be specified in init for ArbitraryImputer" - ) + raise ValueError("columns must be specified in init for ArbitraryImputer") super().__init__(columns=columns, **kwargs) @@ -82,7 +78,7 @@ def __init__(self, impute_value, columns, **kwargs): ): raise ValueError( - f"{self.classname()}: impute_value should be a single value (int, float or str)" + "impute_value should be a single value (int, float or str)" ) self.impute_values_ = {} @@ -133,9 +129,6 @@ class MedianImputer(BaseImputer): Columns to impute, if the default of None is supplied all columns in X are used when the transform method is called. - weight: None or str, default=None - Column containing weights - **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. 
@@ -147,16 +140,10 @@ class MedianImputer(BaseImputer): """ - def __init__(self, columns=None, weight=None, **kwargs): + def __init__(self, columns=None, **kwargs): super().__init__(columns=columns, **kwargs) - if not isinstance(weight, str): - if weight is not None: - raise TypeError("weight should be str or None") - - self.weight = weight - def fit(self, X, y=None): """Calculate median values to impute with from X. @@ -174,36 +161,9 @@ def fit(self, X, y=None): self.impute_values_ = {} - if self.weight is not None: - - super().check_weights_column(X, self.weight) - - temp = X.copy() - - for c in self.columns: - - # filter out null rows so their weight doesn't influence calc - filtered = temp[temp[c].notnull()] - - # first sort df by column to be imputed (order of weight column shouldn't matter for median) - filtered.sort_values(c, inplace=True) - - # next calculate cumulative weight sums - cumsum = filtered[self.weight].cumsum() - - # find midpoint - cutoff = filtered[self.weight].sum() / 2.0 - - # find first value >= this point - median = filtered[c][cumsum >= cutoff].iloc[0] - - self.impute_values_[c] = median - - else: - - for c in self.columns: + for c in self.columns: - self.impute_values_[c] = X[c].median() + self.impute_values_[c] = X[c].median() return self @@ -217,9 +177,6 @@ class MeanImputer(BaseImputer): Columns to impute, if the default of None is supplied all columns in X are used when the transform method is called. - weights : None or str, default = None - Column containing weights. - **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. 
@@ -231,16 +188,10 @@ class MeanImputer(BaseImputer): """ - def __init__(self, columns=None, weight=None, **kwargs): + def __init__(self, columns=None, **kwargs): super().__init__(columns=columns, **kwargs) - if not isinstance(weight, str): - if weight is not None: - raise TypeError("weight should be str or None") - - self.weight = weight - def fit(self, X, y=None): """Calculate mean values to impute with from X. @@ -258,29 +209,9 @@ def fit(self, X, y=None): self.impute_values_ = {} - if self.weight is not None: - - super().check_weights_column(X, self.weight) - - for c in self.columns: - - # filter out null rows so they don't count towards total weight - filtered = X[X[c].notnull()] - - # calculate total weight and total of weighted col - total_weight = filtered[self.weight].sum() - total_weighted_col = filtered[c].mul(filtered[self.weight]).sum() - - # find weighted mean and add to dict - weighted_mean = total_weighted_col / total_weight - - self.impute_values_[c] = weighted_mean - - else: - - for c in self.columns: + for c in self.columns: - self.impute_values_[c] = X[c].mean() + self.impute_values_[c] = X[c].mean() return self @@ -288,18 +219,12 @@ def fit(self, X, y=None): class ModeImputer(BaseImputer): """Transformer to impute missing values with the mode of the supplied columns. - If mode is NaN, a warning will be raised. - Parameters ---------- columns : None or str or list, default = None Columns to impute, if the default of None is supplied all columns in X are used when the transform method is called. - weight : str - Name of weights columns to use if mode should be in terms of sum of weights - not count of rows. - **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. 
@@ -311,18 +236,10 @@ class ModeImputer(BaseImputer): """ - def __init__(self, columns=None, weight=None, **kwargs): + def __init__(self, columns=None, **kwargs): super().__init__(columns=columns, **kwargs) - if weight is not None: - - if not isinstance(weight, str): - - raise ValueError("ModeImputer: weight should be a string or None") - - self.weight = weight - def fit(self, X, y=None): """Calculate mode values to impute with from X. @@ -340,29 +257,9 @@ def fit(self, X, y=None): self.impute_values_ = {} - if self.weight is None: - - for c in self.columns: - - mode_value = X[c].mode(dropna=True) - - if len(mode_value) == 0: - - self.impute_values_[c] = np.nan - - warnings.warn(f"ModeImputer: The Mode of column {c} is NaN.") - - else: - - self.impute_values_[c] = mode_value[0] - - else: - - super().check_weights_column(X, self.weight) - - for c in self.columns: + for c in self.columns: - self.impute_values_[c] = X.groupby(c)[self.weight].sum().idxmax() + self.impute_values_[c] = X[c].mode()[0] return self @@ -404,7 +301,7 @@ def fit(self, X, y): if n_nulls > 0: - raise ValueError(f"{self.classname()}: y has {n_nulls} null values") + raise ValueError(f"y has {n_nulls} null values") self.impute_values_ = {} @@ -418,7 +315,7 @@ def fit(self, X, y): if c_nulls.sum() == 0: raise ValueError( - f"{self.classname()}: Column {c} has no missing values, cannot use this transformer." + f"Column {c} has no missing values, cannot use this transformer." 
) else: diff --git a/tubular/mapping.py b/tubular/mapping.py index f2ef5555..bc61b714 100644 --- a/tubular/mapping.py +++ b/tubular/mapping.py @@ -3,11 +3,8 @@ """ import pandas as pd -from pandas.api.types import is_categorical_dtype import numpy as np from collections import OrderedDict -import warnings - from tubular.base import BaseTransformer, ReturnKeyDict @@ -40,21 +37,21 @@ def __init__(self, mappings, **kwargs): if not len(mappings) > 0: - raise ValueError(f"{self.classname()}: mappings has no values") + raise ValueError("mappings has no values") for j in mappings.values(): if not isinstance(j, dict): raise ValueError( - f"{self.classname()}: values in mappings dictionary should be dictionaries" + "values in mappings dictionary should be dictionaries" ) self.mappings = mappings else: - raise ValueError(f"{self.classname()}: mappings must be a dictionary") + raise ValueError("mappings must be a dictionary") columns = list(mappings.keys()) @@ -128,7 +125,7 @@ class MappingTransformer(BaseMappingTransformer, BaseMappingTransformMixin): is not available in the mapping dict. This transformer inherits from BaseMappingTransformMixin as well as the BaseMappingTransformer - in order to access the standard pd.Series.map transform function. + in order to access the standard pd.Series.map transform function. Parameters ---------- @@ -160,86 +157,33 @@ def __init__(self, mappings, **kwargs): else: raise TypeError( - f"{self.classname()}: each item in mappings should be a dict but got type {type(v)} for key {k}" + f"each item in mappings should be a dict but got type {type(v)} for key {k}" ) BaseMappingTransformer.__init__(self, mappings=mappings, **kwargs) - def transform(self, X, suppress_dtype_warning=False): - """Transform the input data X according to the mappings in the mappings attribute dict. + def transform(self, X): + """Transform the input data X according to the mappings in the mappings attribute dict. 
This method calls the BaseMappingTransformMixin.transform. Note, this transform method is different to some of the transform methods in the nominal module, even though they also use the BaseMappingTransformMixin.transform method. Here, if a value does not exist in the mapping it is unchanged. - Due to the way pd.Series.map works, mappings can result in column dtypes changing, - sometimes unexpectedly. If the result of the mappings is a dtype that doesn't match - the original dtype, or the dtype of the values provided in the mapping a warning - will be raised. This normally results from an incomplete mapping being provided, - or a mix of dtypes causing pandas to default to the object dtype. - - For columns with a 'category' dtype the warning will not be raised. - Parameters ---------- X : pd.DataFrame Data with nominal columns to transform. - suppress_dtype_warning: Bool, default = False - Whether to suppress warnings about dtype changes - Returns ------- X : pd.DataFrame Transformed input X with levels mapped accoriding to mappings dict. """ - mapped_columns = self.mappings.keys() - original_dtypes = X[mapped_columns].dtypes - - for col in mapped_columns: - - values_to_be_mapped = set(self.mappings[col].keys()) - values_in_df = set(X[col].unique()) - - if len(values_to_be_mapped.intersection(values_in_df)) == 0: - - warnings.warn( - f"{self.classname()}: No values from mapping for {col} exist in dataframe." 
- ) - - if len(values_to_be_mapped.difference(values_in_df)) > 0: - - warnings.warn( - f"{self.classname()}: There are values in the mapping for {col} that are not present in the dataframe" - ) X = BaseMappingTransformMixin.transform(self, X) - mapped_dtypes = X[mapped_columns].dtypes - - if not suppress_dtype_warning: - - for col in mapped_columns: - - col_mappings = pd.Series(self.mappings[col]) - mapping_dtype = col_mappings.dtype - - if (mapped_dtypes[col] != mapping_dtype) and ( - mapped_dtypes[col] != original_dtypes[col] - ): - - # Confirm the initial and end dtypes are not categories - if not ( - is_categorical_dtype(original_dtypes[col]) - and is_categorical_dtype(mapped_dtypes[col]) - ): - - warnings.warn( - f"{self.classname()}: This mapping changes {col} dtype from {original_dtypes[col]} to {mapped_dtypes[col]}. This is often caused by having multiple dtypes in one column, or by not mapping all values." - ) - return X @@ -258,7 +202,7 @@ class CrossColumnMappingTransformer(BaseMappingTransformer): would replace the values in the adjustment column based off the values in column a using the mapping 1->'a', 3->'b' and also replace based off the values in column b using a mapping 'a'->1, 'b'->2. If more than one column is defined for this mapping, then this object must be an OrderedDict - to ensure reproducibility. + to ensure reproducibility. **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. 
@@ -282,14 +226,14 @@ def __init__(self, adjust_column, mappings, **kwargs): if not isinstance(adjust_column, str): - raise TypeError(f"{self.classname()}: adjust_column should be a string") + raise TypeError("adjust_column should be a string") if len(mappings) > 1: if not isinstance(mappings, OrderedDict): raise TypeError( - f"{self.classname()}: mappings should be an ordered dict for 'replace' mappings using multiple columns" + "mappings should be an ordered dict for 'replace' mappings using multiple columns" ) self.adjust_column = adjust_column @@ -315,9 +259,7 @@ def transform(self, X): if self.adjust_column not in X.columns.values: - raise ValueError( - f"{self.classname()}: variable {self.adjust_column} is not in X" - ) + raise ValueError("variable " + self.adjust_column + " is not in X") for i in self.columns: @@ -369,7 +311,7 @@ def __init__(self, adjust_column, mappings, **kwargs): if not isinstance(adjust_column, str): - raise TypeError(f"{self.classname()}: adjust_column should be a string") + raise TypeError("adjust_column should be a string") for j in mappings.values(): @@ -377,9 +319,7 @@ def __init__(self, adjust_column, mappings, **kwargs): if type(k) not in [int, float]: - raise TypeError( - f"{self.classname()}: mapping values must be numeric" - ) + raise TypeError("mapping values must be numeric") self.adjust_column = adjust_column @@ -404,14 +344,12 @@ def transform(self, X): if self.adjust_column not in X.columns.values: - raise ValueError( - f"{self.classname()}: variable {self.adjust_column} is not in X" - ) + raise ValueError("variable " + self.adjust_column + " is not in X") if not pd.api.types.is_numeric_dtype(X[self.adjust_column]): raise TypeError( - f"{self.classname()}: variable {self.adjust_column} must have numeric dtype." + "variable " + self.adjust_column + " must have numeric dtype." 
) for i in self.columns: @@ -466,7 +404,7 @@ def __init__(self, adjust_column, mappings, **kwargs): if not isinstance(adjust_column, str): - raise TypeError(f"{self.classname()}: adjust_column should be a string") + raise TypeError("adjust_column should be a string") for j in mappings.values(): @@ -474,9 +412,7 @@ def __init__(self, adjust_column, mappings, **kwargs): if type(k) not in [int, float]: - raise TypeError( - f"{self.classname()}: mapping values must be numeric" - ) + raise TypeError("mapping values must be numeric") self.adjust_column = adjust_column @@ -501,16 +437,12 @@ def transform(self, X): if self.adjust_column not in X.columns.values: - raise ValueError( - f"{self.classname()}: variable " + self.adjust_column + " is not in X" - ) + raise ValueError("variable " + self.adjust_column + " is not in X") if not pd.api.types.is_numeric_dtype(X[self.adjust_column]): raise TypeError( - f"{self.classname()}: variable " - + self.adjust_column - + " must have numeric dtype." + "variable " + self.adjust_column + " must have numeric dtype." ) for i in self.columns: diff --git a/tubular/misc.py b/tubular/misc.py index 26a153d8..2e22cc44 100644 --- a/tubular/misc.py +++ b/tubular/misc.py @@ -1,5 +1,4 @@ from tubular.base import BaseTransformer -import pandas as pd class SetValueTransformer(BaseTransformer): @@ -46,44 +45,3 @@ def transform(self, X): X[self.columns] = self.value return X - - -class SetColumnDtype(BaseTransformer): - """ - Transformer to set transform columns in a dataframe to a dtype - - Parameters - ---------- - columns : str or list - Columns to set dtype. Must be set or transform will not run. - - dtype : type or string - dtype object to set columns to or a string interpretable as one by pd.api.types.pandas_dtype - e.g. 
float or 'float' - """ - - def __init__(self, columns, dtype): - - super().__init__(columns, copy=True) - - self.__validate_dtype(dtype) - - self.dtype = dtype - - def transform(self, X): - - X = super().transform(X) - - X[self.columns] = X[self.columns].astype(self.dtype) - - return X - - def __validate_dtype(self, dtype: str): - """Check string is a valid dtype""" - - try: - pd.api.types.pandas_dtype(dtype) - except TypeError: - raise TypeError( - f"{self.classname()}: data type '{dtype}' not understood as a valid dtype" - ) diff --git a/tubular/nominal.py b/tubular/nominal.py index 96699903..d9169132 100644 --- a/tubular/nominal.py +++ b/tubular/nominal.py @@ -40,9 +40,7 @@ def columns_set_or_check(self, X): if not len(columns) > 0: - raise ValueError( - f"{self.classname()}: no object or category columns in X" - ) + raise ValueError("no object or category columns in X") self.columns = columns @@ -71,7 +69,7 @@ def check_mappable_rows(self, X): if mappable_rows < X.shape[0]: raise ValueError( - f"{self.classname()}: nulls would be introduced into column {c} from levels not present in mapping" + f"nulls would be introduced into column {c} from levels not present in mapping" ) @@ -115,7 +113,7 @@ def __init__(self, columns=None, start_encoding=0, **kwargs): if not isinstance(start_encoding, int): - raise ValueError(f"{self.classname()}: start_encoding should be an integer") + raise ValueError("start_encoding should be an integer") self.start_encoding = start_encoding @@ -207,7 +205,7 @@ def inverse_transform(self, X): if (X.shape[0] - mappable_rows) > 0: raise ValueError( - f"{self.classname()}: nulls introduced from levels not present in mapping for column: " + "nulls introduced from levels not present in mapping for column: " + c ) @@ -292,11 +290,11 @@ def __init__( if not isinstance(cut_off_percent, float): - raise ValueError(f"{self.classname()}: cut_off_percent must be a float") + raise ValueError("cut_off_percent must be a float") if not ((cut_off_percent > 
0) & (cut_off_percent < 1)): - raise ValueError(f"{self.classname()}: cut_off_percent must be > 0 and < 1") + raise ValueError("cut_off_percent must be > 0 and < 1") self.cut_off_percent = cut_off_percent @@ -304,9 +302,7 @@ def __init__( if not isinstance(weight, str): - raise ValueError( - f"{self.classname()}: weight should be a single column (str)" - ) + raise ValueError("weight should be a single column (str)") self.weight = weight @@ -314,7 +310,7 @@ def __init__( if not isinstance(record_rare_levels, bool): - raise ValueError(f"{self.classname()}: record_rare_levels must be a bool") + raise ValueError("record_rare_levels must be a bool") self.record_rare_levels = record_rare_levels @@ -346,14 +342,14 @@ def fit(self, X, y=None): if pd.Series(self.rare_level_name).dtype != X[c].dtypes: raise ValueError( - f"{self.classname()}: rare_level_name must be of the same type of the columns" + "rare_level_name must be of the same type of the columns" ) if self.weight is not None: if self.weight not in X.columns.values: - raise ValueError(f"{self.classname()}: weight {self.weight} not in X") + raise ValueError("weight " + self.weight + " not in X") self.mapping_ = {} @@ -482,10 +478,6 @@ class MeanResponseTransformer(BaseNominalTransformer, BaseMappingTransformMixin) weights_column : str or None Weights column to use when calculating the mean response. - prior : int, default = 0 - Regularisation parameter, can be thought of roughly as the size a category should be in order for - its statistics to be considered reliable (hence default value of 0 means no regularisation). - **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. 
@@ -500,53 +492,18 @@ class MeanResponseTransformer(BaseNominalTransformer, BaseMappingTransformMixin) """ - def __init__(self, columns=None, weights_column=None, prior=0, **kwargs): + def __init__(self, columns=None, weights_column=None, **kwargs): if weights_column is not None: if type(weights_column) is not str: - raise TypeError(f"{self.classname()}: weights_column should be a str") - - if type(prior) is not int: - - raise TypeError("prior should be a int") - - if not prior >= 0: - raise ValueError("prior should be positive int") + raise TypeError("weights_column should be a str") self.weights_column = weights_column - self.prior = prior - # TODO: set default prior to None and refactor to only use prior regularisation when it is set? BaseNominalTransformer.__init__(self, columns=columns, **kwargs) - def _prior_regularisation(self, target_means, cat_freq): - """Regularise encoding values by pushing encodings of infrequent categories towards the global mean. If prior is zero this will return target_means unaltered. - - Parameters - ---------- - target_means : pd.Series - Series containing group means for levels of column in data - - cat_freq : str - Series containing group sizes for levels of column in data - - Returns - ------- - regularised : pd.Series - Series of regularised encoding values - """ - - self.check_is_fitted(["global_mean"]) - - regularised = ( - target_means.multiply(cat_freq, axis="index") - + self.global_mean * self.prior - ).divide(cat_freq + self.prior, axis="index") - - return regularised - def fit(self, X, y): """Identify mapping of categorical levels to mean response values. 
@@ -572,59 +529,31 @@ def fit(self, X, y): if self.weights_column not in X.columns.values: - raise ValueError( - f"{self.classname()}: weights column {self.weights_column} not in X" - ) + raise ValueError(f"weights column {self.weights_column} not in X") response_null_count = y.isnull().sum() if response_null_count > 0: - raise ValueError( - f"{self.classname()}: y has {response_null_count} null values" - ) + raise ValueError(f"y has {response_null_count} null values") X_y = self._combine_X_y(X, y) response_column = "_temporary_response" - if self.weights_column is None: - - self.global_mean = X_y[response_column].mean() - - else: - - X_y["weighted_response"] = X_y[response_column].multiply( - X_y[self.weights_column] - ) - - self.global_mean = ( - X_y["weighted_response"].sum() / X_y[self.weights_column].sum() - ) - for c in self.columns: if self.weights_column is None: - group_means = X_y.groupby(c)[response_column].mean() - - group_counts = X_y.groupby(c)[response_column].size() - - self.mappings[c] = self._prior_regularisation( - group_means, group_counts - ).to_dict() + self.mappings[c] = X_y.groupby([c])[response_column].mean().to_dict() else: groupby_sum = X_y.groupby([c])[ - ["weighted_response", self.weights_column] + [response_column, self.weights_column] ].sum() - group_weight = groupby_sum[self.weights_column] - - group_means = groupby_sum["weighted_response"] / group_weight - - self.mappings[c] = self._prior_regularisation( - group_means, group_weight + self.mappings[c] = ( + groupby_sum[response_column] / groupby_sum[self.weights_column] ).to_dict() return self @@ -693,7 +622,7 @@ def __init__(self, columns=None, weights_column=None, **kwargs): if type(weights_column) is not str: - raise TypeError(f"{self.classname()}: weights_column should be a str") + raise TypeError("weights_column should be a str") self.weights_column = weights_column @@ -724,17 +653,13 @@ def fit(self, X, y): if self.weights_column not in X.columns.values: - raise ValueError( - 
f"{self.classname()}: weights column {self.weights_column} not in X" - ) + raise ValueError(f"weights column {self.weights_column} not in X") response_null_count = y.isnull().sum() if response_null_count > 0: - raise ValueError( - f"{self.classname()}: y has {response_null_count} null values" - ) + raise ValueError(f"y has {response_null_count} null values") X_y = self._combine_X_y(X, y) response_column = "_temporary_response" @@ -889,10 +814,7 @@ def fit(self, X, y=None): if X[c].isnull().sum() > 0: - raise ValueError( - f"{self.classname()}: column %s has nulls - replace before proceeding" - % c - ) + raise ValueError("column %s has nulls - replace before proceeding" % c) # Check each field has less than 100 categories/levels for c in self.columns: @@ -902,7 +824,7 @@ def fit(self, X, y=None): if len(levels) > 100: raise ValueError( - f"{self.classname()}: column %s has over 100 unique values - consider another type of encoding" + "column %s has over 100 unique values - consider another type of encoding" % c ) @@ -937,10 +859,7 @@ def transform(self, X): if X[c].isnull().sum() > 0: - raise ValueError( - f"{self.classname()}: column %s has nulls - replace before proceeding" - % c - ) + raise ValueError("column %s has nulls - replace before proceeding" % c) X = BaseNominalTransformer.transform(self, X) @@ -981,8 +900,7 @@ def transform(self, X): if len(unseen_levels) > 0: warnings.warn( - f"{self.classname()}: column %s has unseen categories: %s" - % (c, unseen_levels) + "column %s has unseen categories: %s" % (c, unseen_levels) ) # Drop original columns diff --git a/tubular/numeric.py b/tubular/numeric.py index a1f32028..2638942c 100644 --- a/tubular/numeric.py +++ b/tubular/numeric.py @@ -10,9 +10,8 @@ StandardScaler, PolynomialFeatures, ) -from sklearn.decomposition import PCA -from tubular.base import BaseTransformer, DataFrameMethodTransformer +from tubular.base import BaseTransformer class LogTransformer(BaseTransformer): @@ -65,11 +64,9 @@ def __init__( if 
base is not None: if not isinstance(base, (int, float)): - raise ValueError(f"{self.classname()}: base should be numeric or None") + raise ValueError("base should be numeric or None") if not base > 0: - raise ValueError( - f"{self.classname()}: base should be strictly positive" - ) + raise ValueError("base should be strictly positive") self.base = base self.add_1 = add_1 @@ -108,7 +105,7 @@ def transform(self, X): ) raise TypeError( - f"{self.classname()}: The following columns are not numeric in X; {non_numeric_columns}" + f"The following columns are not numeric in X; {non_numeric_columns}" ) new_column_names = [f"{column}_{self.suffix}" for column in self.columns] @@ -118,7 +115,7 @@ def transform(self, X): if (X[self.columns] <= -1).sum().sum() > 0: raise ValueError( - f"{self.classname()}: values less than or equal to 0 in columns (after adding 1), make greater than 0 before using transform" + "values less than or equal to 0 in columns (after adding 1), make greater than 0 before using transform" ) if self.base is None: @@ -134,7 +131,7 @@ def transform(self, X): if (X[self.columns] <= 0).sum().sum() > 0: raise ValueError( - f"{self.classname()}: values less than or equal to 0 in columns, make greater than 0 before using transform" + "values less than or equal to 0 in columns, make greater than 0 before using transform" ) if self.base is None: @@ -179,17 +176,17 @@ def __init__(self, column, new_column_name, cut_kwargs={}, **kwargs): if not type(column) is str: raise TypeError( - f"{self.classname()}: column arg (name of column) should be a single str giving the column to discretise" + "column arg (name of column) should be a single str giving the column to discretise" ) if not type(new_column_name) is str: - raise TypeError(f"{self.classname()}: new_column_name must be a str") + raise TypeError("new_column_name must be a str") if not type(cut_kwargs) is dict: raise TypeError( - f"{self.classname()}: cut_kwargs should be a dict but got type {type(cut_kwargs)}" 
+ f"cut_kwargs should be a dict but got type {type(cut_kwargs)}" ) else: @@ -199,7 +196,7 @@ def __init__(self, column, new_column_name, cut_kwargs={}, **kwargs): if not type(k) is str: raise TypeError( - f"{self.classname()}: unexpected type ({type(k)}) for cut_kwargs key in position {i}, must be str" + f"unexpected type ({type(k)}) for cut_kwargs key in position {i}, must be str" ) self.cut_kwargs = cut_kwargs @@ -226,7 +223,7 @@ def transform(self, X): if not pd.api.types.is_numeric_dtype(X[self.columns[0]]): raise TypeError( - f"{self.classname()}: {self.columns[0]} should be a numeric dtype but got {X[self.columns[0]].dtype}" + f"{self.columns[0]} should be a numeric dtype but got {X[self.columns[0]].dtype}" ) X[self.new_column_name] = pd.cut(X[self.columns[0]], **self.cut_kwargs) @@ -234,123 +231,6 @@ def transform(self, X): return X -class TwoColumnOperatorTransformer(DataFrameMethodTransformer): - - """ - This transformer applies a pandas.DataFrame method to two columns (add, sub, mul, div, mod, pow). - - Transformer assigns the output of the method to a new column. The method will be applied - in the form (column 1)operator(column 2), so order matters (if the method does not commute). It is possible to - supply other key word arguments to the transform method, which will be passed to the pandas.DataFrame method being called. - - Parameters - ---------- - pd_method_name : str - The name of the pandas.DataFrame method to be called. - - column1_name : str - The name of the 1st column in the operation. - - column2_name : str - The name of the 2nd column in the operation. - - new_column_name : str - The name of the new column that the output is assigned to. - - pd_method_kwargs : dict, default = {'axis':0} - Dictionary of method kwargs to be passed to pandas.DataFrame method. Must contain an entry for axis, set to either 1 or 0. - - **kwargs : - Arbitrary keyword arguments passed onto BaseTransformer.__init__(). 
- - Attributes - ---------- - pd_method_name : str - The name of the pandas.DataFrame method to be called. - - columns : list - list containing two string items: [column1_name, column2_name] The first will be operated upon by the - chosen pandas method using the second. - - column2_name : str - The name of the 2nd column in the operation. - - new_column_name : str - The name of the new column that the output is assigned to. - - pd_method_kwargs : dict - Dictionary of method kwargs to be passed to pandas.DataFrame method. - - """ - - def __init__( - self, - pd_method_name, - columns, - new_column_name, - pd_method_kwargs={"axis": 0}, - **kwargs, - ): - """ - Performs input checks not done in either DataFrameMethodTransformer.__init__ or BaseTransformer.__init__ - """ - - if "axis" not in pd_method_kwargs.keys(): - raise ValueError( - f'{self.classname()}: pd_method_kwargs must contain an entry "axis" set to 0 or 1' - ) - - if pd_method_kwargs["axis"] not in [0, 1]: - raise ValueError( - f"{self.classname()}: pd_method_kwargs 'axis' must be 0 or 1" - ) - - if not type(columns) is list: - if len(columns) != 2: - raise ValueError( - f"{self.classname()}: columns must be a list containing two column names but got {columns}" - ) - - self.column1_name = columns[0] - self.column2_name = columns[1] - - # call DataFrameMethodTransformer.__init__ - # This class will inherit all the below attributes from DataFrameMethodTransformer - super().__init__( - new_column_name=new_column_name, - pd_method_name=pd_method_name, - columns=columns, - pd_method_kwargs=pd_method_kwargs, - **kwargs, - ) - - def transform(self, X): - """ - Transform input data by applying the chosen method to the two specified columns - - Args: - X (pd.DataFrame): Data to transform. - - Returns: - pd.DataFrame: Input X with an additional column. 
- """ - # call BaseTransformer.transform - X = super(DataFrameMethodTransformer, self).transform(X) - - is_numeric = X[self.columns].apply(pd.api.types.is_numeric_dtype, axis=0) - - if not is_numeric.all(): - raise TypeError( - f"{self.classname()}: input columns in X must contain only numeric values" - ) - - X[self.new_column_name] = getattr(X[[self.column1_name]], self.pd_method_name)( - X[self.column2_name], **self.pd_method_kwargs - ) - - return X - - class ScalingTransformer(BaseTransformer): """Transformer to perform scaling of numeric columns. @@ -379,7 +259,7 @@ def __init__(self, columns, scaler_type, scaler_kwargs={}, **kwargs): if not type(scaler_kwargs) is dict: raise TypeError( - f"{self.classname()}: scaler_kwargs should be a dict but got type {type(scaler_kwargs)}" + f"scaler_kwargs should be a dict but got type {type(scaler_kwargs)}" ) else: @@ -389,16 +269,14 @@ def __init__(self, columns, scaler_type, scaler_kwargs={}, **kwargs): if not type(k) is str: raise TypeError( - f"{self.classname()}: unexpected type ({type(k)}) for scaler_kwargs key in position {i}, must be str" + f"unexpected type ({type(k)}) for scaler_kwargs key in position {i}, must be str" ) allowed_scaler_values = ["min_max", "max_abs", "standard"] if scaler_type not in allowed_scaler_values: - raise ValueError( - f"{self.classname()}: scaler_type should be one of; {allowed_scaler_values}" - ) + raise ValueError(f"scaler_type should be one of; {allowed_scaler_values}") if scaler_type == "min_max": @@ -440,7 +318,7 @@ def check_numeric_columns(self, X): ) raise TypeError( - f"{self.classname()}: The following columns are not numeric in X; {non_numeric_columns}" + f"The following columns are not numeric in X; {non_numeric_columns}" ) return X @@ -541,31 +419,23 @@ def __init__(self, columns, min_degree=2, max_degree=2, **kwargs): if len(columns) < 2: raise ValueError( - f"{self.classname()}: number of columns must be equal or greater than 2, got {str(len(columns))} column." 
+ f"number of columns must be equal or greater than 2, got {str(len(columns))} column." ) if type(min_degree) is int: if min_degree < 2: raise ValueError( - f"{self.classname()}: min_degree must be equal or greater than 2, got {str(min_degree)}" + f"min_degree must be equal or greater than 2, got {str(min_degree)}" ) else: self.min_degree = min_degree else: raise TypeError( - f"{self.classname()}: unexpected type ({type(min_degree)}) for min_degree, must be int" + f"unexpected type ({type(min_degree)}) for min_degree, must be int" ) if type(max_degree) is int: if min_degree > max_degree: - raise ValueError( - f"{self.classname()}: max_degree must be equal or greater than min_degree" - ) - else: - self.max_degree = max_degree - if max_degree > len(columns): - raise ValueError( - f"{self.classname()}: max_degree must be equal or lower than number of columns" - ) + raise ValueError("max_degree must be equal or greater than min_degree") else: self.max_degree = max_degree if max_degree > len(columns): @@ -576,7 +446,7 @@ def __init__(self, columns, min_degree=2, max_degree=2, **kwargs): self.max_degree = max_degree else: raise TypeError( - f"{self.classname()}: unexpected type ({type(max_degree)}) for max_degree, must be int" + f"unexpected type ({type(max_degree)}) for max_degree, must be int" ) self.nb_features_to_interact = len(self.columns) @@ -642,234 +512,3 @@ def transform(self, X): ].product(axis=1, skipna=False) return X - - -class PCATransformer(BaseTransformer): - """Transformer that generates variables using Principal component analysis (PCA). - Linear dimensionality reduction using Singular Value Decomposition of the - data to project it to a lower dimensional space. - - It is based on sklearn class sklearn.decomposition.PCA - - Parameters - ---------- - columns : None or list or str - Columns to apply the transformer to. If a str is passed this is put into a list. Value passed - in columns is saved in the columns attribute on the object. 
Note this has no default value so - the user has to specify the columns when initialising the transformer. When the user forget to set columns, - all columns would be picked up when super transform runs. - n_components : int, float or 'mle', default=None - Number of components to keep. - if n_components is not set all components are kept:: - n_components == min(n_samples, n_features) - If ``n_components == 'mle'`` and ``svd_solver == 'full'``, Minka's - MLE is used to guess the dimension. Use of ``n_components == 'mle'`` - will interpret ``svd_solver == 'auto'`` as ``svd_solver == 'full'``. - If ``0 < n_components < 1`` and ``svd_solver == 'full'``, select the - number of components such that the amount of variance that needs to be - explained is greater than the percentage specified by n_components. - If ``svd_solver == 'arpack'``, the number of components must be - strictly less than the minimum of n_features and n_samples. - Hence, the None case results in:: - n_components == min(n_samples, n_features) - 1 svd_solver='auto', tol=0.0, n_oversamples=10, random_state=None - svd_solver : {'auto', 'full', 'arpack', 'randomized'}, default='auto' - If auto : - The solver is selected by a default policy based on `X.shape` and - `n_components`: if the input data is larger than 500x500 and the - number of components to extract is lower than 80% of the smallest - dimension of the data, then the more efficient 'randomized' - method is enabled. Otherwise the exact full SVD is computed and - optionally truncated afterwards. - If full : - run exact full SVD calling the standard LAPACK solver via - `scipy.linalg.svd` and select the components by postprocessing - If arpack : - run SVD truncated to n_components calling ARPACK solver via - `scipy.sparse.linalg.svds`. It requires strictly - 0 < n_components < min(X.shape) - If randomized : - run randomized SVD by the method of Halko et al. - .. 
sklearn versionadded:: 0.18.0 - - random_state : int, RandomState instance or None, default=None - Used when the 'arpack' or 'randomized' solvers are used. Pass an int - for reproducible results across multiple function calls. - .. sklearn versionadded:: 0.18.0 - pca_column_prefix : str, prefix added to each the n components features generated. Default is "pca_" - example: if n_components = 3, new columns would be 'pca_0','pca_1','pca_2'. - - Attributes - ---------- - - pca : PCA class from sklearn.decomposition - n_components_ : int - The estimated number of components. When n_components is set - to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this - number is estimated from input data. Otherwise it equals the parameter - n_components, or the lesser value of n_features and n_samples - if n_components is None. - feature_names_out: list or None - list of feature name representing the new dimensions. - - - """ - - def __init__( - self, - columns, - n_components=2, - svd_solver="auto", - random_state=None, - pca_column_prefix="pca_", - **kwargs, - ): - - super().__init__(columns=columns, **kwargs) - - if type(n_components) is int: - if n_components < 1: - raise ValueError( - f"{self.classname()}:n_components must be strictly positive got {str(n_components)}" - ) - else: - self.n_components = n_components - elif type(n_components) is float: - if 0 < n_components < 1: - self.n_components = n_components - else: - raise ValueError( - f"{self.classname()}:n_components must be strictly positive and must be of type int when greater than or equal to 1. Got {str(n_components)}" - ) - - else: - if n_components == "mle": - self.n_components = n_components - else: - raise TypeError( - f"{self.classname()}:unexpected type {type(n_components)} for n_components, must be int, float (0-1) or equal to 'mle'." 
- ) - - if type(svd_solver) is str: - if svd_solver not in ["auto", "full", "arpack", "randomized"]: - raise ValueError( - f"{self.classname()}:svd_solver {svd_solver} is unknown. Please select among 'auto', 'full', 'arpack', 'randomized'." - ) - else: - self.svd_solver = svd_solver - else: - raise TypeError( - f"{self.classname()}:unexpected type {type(svd_solver)} for svd_solver, must be str" - ) - - if type(random_state) is int: - self.random_state = random_state - else: - if random_state is None: - self.random_state = random_state - else: - raise TypeError( - f"{self.classname()}:unexpected type {type(random_state)} for random_state, must be int or None." - ) - - if (svd_solver == "arpack") and (n_components == "mle"): - raise ValueError( - f"{self.classname()}: n_components='mle' cannot be a string with svd_solver='arpack'" - ) - if (svd_solver in ["randomized", "arpack"]) and (type(n_components) is float): - raise TypeError( - f"{self.classname()}: n_components {n_components} cannot be a float with svd_solver='{svd_solver}'" - ) - - if type(pca_column_prefix) is str: - self.pca_column_prefix = pca_column_prefix - else: - raise TypeError( - f"{self.classname()}:unexpected type {type(pca_column_prefix)} for pca_column_prefix, must be str" - ) - - self.pca = PCA( - n_components=self.n_components, - svd_solver=self.svd_solver, - random_state=self.random_state, - ) - - self.pca_column_prefix = pca_column_prefix - self.feature_names_out = None - self.n_components_ = None - - def check_numeric_columns(self, X): - """Method to check all columns (specicifed in self.columns) in X are all numeric. - - Parameters - ---------- - X : pd.DataFrame - Data containing columns to check. 
- - """ - - numeric_column_types = X[self.columns].apply( - pd.api.types.is_numeric_dtype, axis=0 - ) - - if not numeric_column_types.all(): - - non_numeric_columns = list( - numeric_column_types.loc[~numeric_column_types].index - ) - - raise TypeError( - f"{self.classname()}: The following columns are not numeric in X; {non_numeric_columns}" - ) - - return X - - def fit(self, X, y=None): - """Fit PCA to input data. - - Parameters - ---------- - X : pd.DataFrame - Dataframe with columns to learn scaling values from. - - y : None - Required for pipeline. - - """ - - super().fit(X, y) - - X = self.check_numeric_columns(X) - - if self.n_components != "mle": - if 0 < self.n_components <= min(X[self.columns].shape): - pass - else: - raise ValueError( - f"""{self.classname()}: n_components {self.n_components} must be between 1 and min(n_samples {X[self.columns].shape[0]}, n_features {X[self.columns].shape[1]}) is {min(X[self.columns].shape)} with svd_solver '{self.svd_solver}'""" - ) - - self.pca.fit(X[self.columns]) - self.n_components_ = self.pca.n_components_ - self.feature_names_out = [ - self.pca_column_prefix + str(i) for i in range(self.n_components_) - ] - - return self - - def transform(self, X): - """Generate from input pandas DataFrame (X) PCA features and add this column or columns in X. - Parameters - ---------- - X : pd.DataFrame - Data to transform. - Returns - ------- - X : pd.DataFrame - Input X with additional column or columns (self.interaction_colname) added. These contain the output of - running the product pandas DataFrame method on identified combinations. 
- """ - X = super().transform(X) - X = self.check_numeric_columns(X) - X[self.feature_names_out] = self.pca.transform(X[self.columns]) - - return X diff --git a/tubular/strings.py b/tubular/strings.py index 3f86b4ab..0a752671 100644 --- a/tubular/strings.py +++ b/tubular/strings.py @@ -60,7 +60,7 @@ def __init__( if len(columns) > 1: raise ValueError( - f"{self.classname()}: columns arg should contain only 1 column name but got {len(columns)}" + f"columns arg should contain only 1 column name but got {len(columns)}" ) super().__init__(columns=columns, **kwargs) @@ -68,19 +68,19 @@ def __init__( if type(new_column_name) is not str: raise TypeError( - f"{self.classname()}: unexpected type ({type(new_column_name)}) for new_column_name, must be str" + f"unexpected type ({type(new_column_name)}) for new_column_name, must be str" ) if type(pd_method_name) is not str: raise TypeError( - f"{self.classname()}: unexpected type ({type(pd_method_name)}) for pd_method_name, expecting str" + f"unexpected type ({type(pd_method_name)}) for pd_method_name, expecting str" ) if type(pd_method_kwargs) is not dict: raise TypeError( - f"{self.classname()}: pd_method_kwargs should be a dict but got type {type(pd_method_kwargs)}" + f"pd_method_kwargs should be a dict but got type {type(pd_method_kwargs)}" ) else: @@ -90,7 +90,7 @@ def __init__( if not type(k) is str: raise TypeError( - f"{self.classname()}: unexpected type ({type(k)}) for pd_method_kwargs key in position {i}, must be str" + f"unexpected type ({type(k)}) for pd_method_kwargs key in position {i}, must be str" ) self.new_column_name = new_column_name @@ -105,7 +105,7 @@ def __init__( except Exception as err: raise AttributeError( - f"""{self.classname()}: error accessing "str.{pd_method_name}" method on pd.Series object - pd_method_name should be a pd.Series.str method""" + f"""error accessing "str.{pd_method_name}" method on pd.Series object - pd_method_name should be a pd.Series.str method""" ) from err def transform(self, 
X): @@ -135,56 +135,3 @@ def transform(self, X): ) return X - - -class StringConcatenator(BaseTransformer): - """ - Transformer to combine data from specified columns, of mixed datatypes, into a new column containing one string. - - Parameters - ---------- - columns : str or list of str - Columns to concatenate. - new_column : str, default = "new_column" - New column name - separator : str, default = " " - Separator for the new string value - """ - - def __init__(self, columns, new_column="new_column", separator=" "): - - super().__init__(columns=columns, copy=True) - - if not isinstance(new_column, str): - raise TypeError(f"{self.classname()}: new_column should be a str") - - self.new_column = new_column - - if not isinstance(separator, str): - raise TypeError(f"{self.classname()}: The separator should be a str") - - self.separator = separator - - def transform(self, X): - """ - Combine data from specified columns, of mixed datatypes, into a new column containing one string. - - Parameters - ---------- - X : df - Data to concatenate values on. - - Returns - ------- - X : df - Returns a dataframe with concatenated values. - - """ - - X = super().transform(X) - - X[self.new_column] = ( - X[self.columns].astype(str).apply(lambda x: self.separator.join(x), axis=1) - ) - - return X