diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index 5d3e8c44f..afbbb9e29 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -6,6 +6,20 @@ The CHANGELOG for the current development version is available at [https://github.com/rasbt/mlxtend/blob/master/docs/sources/CHANGELOG.md](https://github.com/rasbt/mlxtend/blob/master/docs/sources/CHANGELOG.md). --- +### Version 0.23.2 (TBD) + +##### Downloads + +- [Source code (zip)](https://github.com/rasbt/mlxtend/archive/v0.23.2.zip) + +- [Source code (tar.gz)](https://github.com/rasbt/mlxtend/archive/v0.23.2.tar.gz) + +##### New Features and Enhancements + +- Integrated scikit-learn's `set_output` method into `TransactionEncoder` ([#1087](https://github.com/rasbt/mlxtend/issues/1087) via [it176131](https://github.com/it176131)) + + + ### Version 0.23.2 (TBD) diff --git a/docs/sources/user_guide/preprocessing/TransactionEncoder.ipynb b/docs/sources/user_guide/preprocessing/TransactionEncoder.ipynb index fe84cee9d..01e2db775 100644 --- a/docs/sources/user_guide/preprocessing/TransactionEncoder.ipynb +++ b/docs/sources/user_guide/preprocessing/TransactionEncoder.ipynb @@ -89,7 +89,7 @@ " [False, False, True, True, True, True],\n", " [False, False, True, False, True, True],\n", " [False, False, True, False, True, False],\n", - " [ True, True, False, False, False, False]], dtype=bool)" + " [ True, True, False, False, False, False]])" ] }, "execution_count": 2, @@ -141,7 +141,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "After fitting, the unique column names that correspond to the data array shown above can be accessed via the `columns_` attribute:" + "After fitting, the unique column names that correspond to the data array shown above can be accessed via the `columns_` attribute, or the `get_feature_names_out` method:" ] }, { @@ -161,19 +161,71 @@ } ], "source": [ - "te.columns_" + "te.columns_ # list of strings" + ] + }, + { + "cell_type": "code", + "execution_count": 
5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice'],\n", + " dtype=object)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "te.get_feature_names_out() # numpy.array of strings (objects)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "For our convenience, we can turn theencoded array into a pandas `DataFrame`:" + "If we desire, we can turn the one-hot encoded array back into a transaction list of lists via the `inverse_transform` function:" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['Apple', 'Beer', 'Chicken', 'Rice'],\n", + " ['Apple', 'Beer', 'Rice'],\n", + " ['Apple', 'Beer'],\n", + " ['Apple', 'Bananas']]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "first4 = te_ary[:4]\n", + "te.inverse_transform(first4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For our convenience, we can set the default output to a pandas `DataFrame` with the `set_output` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -294,46 +346,15 @@ "7 True True False False False False" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import pandas as pd\n", - "\n", - "pd.DataFrame(te_ary, columns=te.columns_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If we desire, we can turn the one-hot encoded array back into a transaction list of lists via the `inverse_transform` function:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[['Apple', 'Beer', 'Chicken', 'Rice'],\n", - " ['Apple', 'Beer', 'Rice'],\n", - " 
['Apple', 'Beer'],\n", - " ['Apple', 'Bananas']]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "first4 = te_ary[:4]\n", - "te.inverse_transform(first4)" + "te = TransactionEncoder().set_output(transform=\"pandas\")\n", + "te_df = te.fit(dataset).transform(dataset)\n", + "te_df" ] }, { @@ -346,7 +367,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", @@ -545,13 +568,6 @@ "with open('../../api_modules/mlxtend.preprocessing/TransactionEncoder.md', 'r') as f:\n", " print(f.read())" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -571,7 +587,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.11.7" }, "toc": { "nav_menu": {}, diff --git a/mlxtend/preprocessing/tests/test_transactionencoder.py b/mlxtend/preprocessing/tests/test_transactionencoder.py index de5f14c5c..7a7d0d771 100644 --- a/mlxtend/preprocessing/tests/test_transactionencoder.py +++ b/mlxtend/preprocessing/tests/test_transactionencoder.py @@ -5,6 +5,7 @@ # License: BSD 3 clause import numpy as np +import pandas as pd from scipy.sparse import csr_matrix from sklearn.base import clone @@ -91,3 +92,27 @@ def test_cloning(): trans = oht2.fit_transform(dataset) np.testing.assert_array_equal(expect, trans) + + +def test_get_feature_names_out(): + """Assert TransactionEncoder has attribute get_feature_names_out.""" + oht = TransactionEncoder() + assert hasattr(oht, "get_feature_names_out") + oht.fit(dataset) + np.testing.assert_array_equal(oht.get_feature_names_out(), oht.columns_) + + +def test_set_output(): + """Assert TransactionEncoder has attribute set_output. 
+
+    When transform="pandas", the transformed output of
+    TransactionEncoder should be a pandas.DataFrame with the correct
+    column names and the values should match those of the original
+    numpy.array.
+    """
+    oht = TransactionEncoder()
+    assert hasattr(oht, "set_output")
+    oht = oht.set_output(transform="pandas")
+    out = oht.fit_transform(dataset)
+    assert isinstance(out, pd.DataFrame)
+    np.testing.assert_array_equal(out.columns, oht.columns_)
diff --git a/mlxtend/preprocessing/transactionencoder.py b/mlxtend/preprocessing/transactionencoder.py
index bfb8a0035..55c4dffc6 100644
--- a/mlxtend/preprocessing/transactionencoder.py
+++ b/mlxtend/preprocessing/transactionencoder.py
@@ -7,6 +7,7 @@
 import numpy as np
 from scipy.sparse import csr_matrix
 from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import _check_feature_names_in, check_is_fitted
 
 
 class TransactionEncoder(BaseEstimator, TransformerMixin):
@@ -181,3 +182,31 @@ def inverse_transform(self, array):
     def fit_transform(self, X, sparse=False):
         """Fit a TransactionEncoder encoder and transform a dataset."""
         return self.fit(X).transform(X, sparse=sparse)
+
+    def get_feature_names_out(self, input_features=None):
+        """Return the names of the transformed output columns.
+
+        Defining this method on a `TransformerMixin` subclass exposes
+        scikit-learn's set_output API, so the transformed output can be
+        set to a `pandas.DataFrame` by default.
+
+        See https://scikit-learn.org/stable/developers/develop.html#developer-api-set-output
+        for more details.
+
+        Parameters
+        ----------
+        input_features : array-like of str or None (default: None)
+            Accepted for compatibility with callers that pass it
+            (e.g. `Pipeline.get_feature_names_out`); ignored, since
+            the output names are taken from the fitted `columns_`.
+
+        Returns
+        -------
+        feature_names_out : numpy.ndarray of str objects
+            The fitted `columns_` as an array.
+
+        """
+        check_is_fitted(self, attributes="columns_")
+        # input_features is deliberately not forwarded: the output columns
+        # are named after `columns_`, not after the input's features.
+        return _check_feature_names_in(estimator=self, input_features=self.columns_)