From 967b10a5599231631199eb8c506d46e4cab6b78c Mon Sep 17 00:00:00 2001 From: xdssio Date: Mon, 29 Nov 2021 14:50:13 +0100 Subject: [PATCH 01/11] cleaning --- packages/vaex-ml/vaex/ml/vowpalwabbit.py | 149 +++++++++++++++++++++++ tests/ml/vowpalwabbit_test.py | 50 ++++++++ 2 files changed, 199 insertions(+) create mode 100644 packages/vaex-ml/vaex/ml/vowpalwabbit.py create mode 100644 tests/ml/vowpalwabbit_test.py diff --git a/packages/vaex-ml/vaex/ml/vowpalwabbit.py b/packages/vaex-ml/vaex/ml/vowpalwabbit.py new file mode 100644 index 0000000000..023ef05a86 --- /dev/null +++ b/packages/vaex-ml/vaex/ml/vowpalwabbit.py @@ -0,0 +1,149 @@ +import base64 +import tempfile + +import numpy as np +import pandas as pd +import traitlets +import vaex.serialize +from sklearn.utils import shuffle +from vowpalwabbit.DFtoVW import DFtoVW +from vowpalwabbit.pyvw import vw + +from . import generate +from . import state + + +@vaex.serialize.register +@generate.register +class VowpalWabbitModel(state.HasState): + '''The Vowpal Wabbit algorithm. + + This class provides an interface to the Vowpal Wabbit package. + + Vowpal Wabbit provides fast, efficient, and flexible online machine learning + techniques for reinforcement learning, supervised learning, and more. + It is influenced by an ecosystem of community contributions, academic research, and proven algorithms. + Microsoft Research is a major contributor to Vowpal Wabbit. + + For more information, please visit https://vowpalwabbit.org/index.html. + + Example: + + >>> import vaex.ml + >>> import vaex.ml.vowpalwabbit + >>> df = vaex.ml.datasets.load_iris() + >>> df['class_'] = df['class_']+1 # Vowpal Wabbit classification is an int stareting from 1. + >>> features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width'] + >>> df_train, df_test = df.ml.train_test_split() + >>> params = { 'oaa': '3', 'P': 1} + >>> booster = vaex.ml.vowpalwabbit.VowpalWabbitModel(features=features, target='class_', epochs=100, params=params) + >>> booster.fit(df_train) + >>> df_train = booster.transform(df_train) + >>> df_train.head(3) + # sepal_width petal_length sepal_length petal_width class_ vowpalwabbit_prediction + 0 3 4.5 5.4 1.5 2 2 + 1 3.4 1.6 4.8 0.2 1 1 + 2 3.1 4.9 6.9 1.5 2 2 + >>> df_test = booster.transform(df_test) + >>> df_test.head(3) + # sepal_width petal_length sepal_length petal_width class_ vowpalwabbit_prediction + 0 3 4.2 5.9 1.5 2 2 + 1 3 4.6 6.1 1.4 2 2 + 2 2.9 4.6 6.6 1.3 2 2 + ''' + snake_name = 'vowpalwabbit_model' + features = traitlets.List(traitlets.Unicode(), help='List of features to use when fitting the Vowpal Wabbit.') + target = traitlets.Unicode(allow_none=False, help='The name of the target column.') + passes = traitlets.CInt(help='Number of iterations.') + params = traitlets.Dict(default_value={}, help='parameters to be passed on the to the Vowpal Wabbit model.') + prediction_name = traitlets.Unicode(default_value='vowpalwabbit_prediction', + help='The name of the virtual column housing the predictions.') + + def __call__(self, *args): + data2d = np.array(args).T + return self.predict(data2d) + + def transform(self, df): + '''Transform a DataFrame such that it contains the predictions of the + Vowpal Wabbit in form of a virtual column. + + :param df: A vaex DataFrame. + + :return copy: A shallow copy of the DataFrame that includes the Vowpal Wabbit prediction as a virtual column. + :rtype: DataFrame + ''' + copy = df.copy() + lazy_function = copy.add_function('vowpalwabbit_prediction_function', self, unique=True) + expression = lazy_function(*self.features) + copy.add_virtual_column(self.prediction_name, expression, unique=False) + return copy + + def fit(self, df, passes=1, chunk_size=500, partial_fit=False): + """Fit the VowpalWabbitModel to the DataFrame. + :param df: A vaex DataFrame containing the features and target on which to train the model. + :param int passes: Number of passes over the data + :param int chunk_size: Size of chunks to iterate + """ + passes = passes or self.passes + params = {k: v for k, v in self.params.items() if v is not None} + target = self.target + features = self.features + model = self.model if (hasattr(self, 'model') and self.model is not None and partial_fit) else vw(**self.params) + for n in range(passes): + for _, _, X in df.to_pandas_df(chunk_size=chunk_size): + if n > 1: + X = shuffle(X) + for ex in DFtoVW.from_colnames(df=X, y=target, x=features).convert_df(): + model.learn(ex) + self.model = model + return self + + def predict(self, df, **kwargs): + '''Get an in-memory numpy array with the predictions of the VowpalWabbitModel on a vaex DataFrame. + This method accepts the key word arguments of the predict method from VowpalWabbit. + + :param df: A vaex DataFrame. + + :returns: A in-memory numpy array containing the VowpalWabbitModel predictions. + :rtype: numpy.array + ''' + if self.model is None: + raise RuntimeError("model is not fitted") + X = pd.DataFrame(df, columns=self.features) if isinstance(df, np.ndarray) else df[ + self.features].to_pandas_df() + X[self.target] = 1 # DFtoVW.from_colnames issue - will be ignored in predictions + examples = DFtoVW.from_colnames(df=X, y=self.target, x=self.features).convert_df() + return np.array([self.model.predict(ex) for ex in examples]) + + def _encode_vw(self): + if self.model is None: + return None + if isinstance(self.model, bytes): + return self.model + filename = tempfile.mktemp() + self.model.save(filename) + with open(filename, 'rb') as f: + model_data = f.read() + return base64.encodebytes(model_data).decode('ascii') + + def _decode_vw(self, encoding): + if encoding is None: + return vw(**self.params) + if isinstance(encoding, str): + model_data = base64.decodebytes(encoding.encode('ascii')) + openfilename = tempfile.mktemp() + with open(openfilename, 'wb') as f: + f.write(model_data) + params = self.params.copy() + params['i'] = openfilename + return vw(**params) + else: + return encoding + + def state_get(self): + return dict(model_state=self._encode_vw(), + substate=super(VowpalWabbitModel, self).state_get()) + + def state_set(self, state, trusted=True): + super(VowpalWabbitModel, self).state_set(state['substate']) + self.model = self._decode_vw(state['model_state']) diff --git a/tests/ml/vowpalwabbit_test.py b/tests/ml/vowpalwabbit_test.py new file mode 100644 index 0000000000..2885a51ae3 --- /dev/null +++ b/tests/ml/vowpalwabbit_test.py @@ -0,0 +1,50 @@ +import sys + +import pytest + +pytest.importorskip("vowpalwabbit") +from sklearn.metrics import accuracy_score +import vaex.ml.vowpalwabbit +import vaex.ml.datasets + +params = {'oaa': '3', 'P': 1, 'enable_logging': True} + + +@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher") +def test_vowpalwabbit(df_iris): + ds = df_iris + + ds['class_'] = ds['class_'] + 1 # VW classification starts from 1 + ds['x'] = ds.sepal_length * 1 + ds['y'] = ds.sepal_width * 1 + ds['w'] = ds.petal_length * 1 + ds['z'] = ds.petal_width * 1 + ds_train, ds_test = ds.ml.train_test_split(test_size=0.2, verbose=False) + features = ['x', 'y', 'z', 'w'] + + params = {'oaa': '3', 'P': 1, 'link': 'logistic', 'enable_logging': True} + model = vaex.ml.vowpalwabbit.VowpalWabbitModel( + params=params, + features=features, + target='class_') + model.fit(ds_train) + assert 0 < accuracy_score(ds_test.col.class_.values, model.predict(ds_test)) + + ds_train = model.transform(ds_train) # this will add the lightgbm_prediction column + state = ds_train.state_get() + ds_test.state_set(state) + + +@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher") +def test_vowpalwabbit_serialize(tmpdir, df_iris): + ds = df_iris + ds['class_'] = ds['class_'] + 1 # VW classification starts from 1 + features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] + model = vaex.ml.vowpalwabbit.VowpalWabbitModel( + params=params, + features=features, + target='class_') + model.fit(ds) + pl = vaex.ml.Pipeline([model]) + pl.save(str(tmpdir.join('test.json'))) + pl.load(str(tmpdir.join('test.json'))) From fd489ce22f3be77ae35909dd8fbcc532bbe48115 Mon Sep 17 00:00:00 2001 From: xdssio Date: Mon, 29 Nov 2021 14:59:26 +0100 Subject: [PATCH 02/11] cleaning --- tests/ml/vowpalwabbit_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ml/vowpalwabbit_test.py b/tests/ml/vowpalwabbit_test.py index 2885a51ae3..877d527ce1 100644 --- a/tests/ml/vowpalwabbit_test.py +++ b/tests/ml/vowpalwabbit_test.py @@ -27,10 +27,10 @@ def test_vowpalwabbit(df_iris): params=params, features=features, target='class_') - model.fit(ds_train) + model.fit(ds_train, passes=5) assert 0 < accuracy_score(ds_test.col.class_.values, model.predict(ds_test)) - ds_train = model.transform(ds_train) # this will add the lightgbm_prediction column + ds_train = model.transform(ds_train) # this will add the vw column state = ds_train.state_get() ds_test.state_set(state) From 320c9cadb3f9858a9367d79c112c74fddeb18426 Mon Sep 17 00:00:00 2001 From: xdssio <37710064+xdssio@users.noreply.github.com> Date: Sun, 12 Dec 2021 19:10:42 +0100 Subject: [PATCH 03/11] Update packages/vaex-ml/vaex/ml/vowpalwabbit.py pip8 clean-up Co-authored-by: Jovan Veljanoski --- packages/vaex-ml/vaex/ml/vowpalwabbit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/vaex-ml/vaex/ml/vowpalwabbit.py b/packages/vaex-ml/vaex/ml/vowpalwabbit.py index 023ef05a86..4e0ba9047c 100644 --- a/packages/vaex-ml/vaex/ml/vowpalwabbit.py +++ b/packages/vaex-ml/vaex/ml/vowpalwabbit.py @@ -35,7 +35,7 @@ class VowpalWabbitModel(state.HasState): >>> df['class_'] = df['class_']+1 # Vowpal Wabbit classification is an int stareting from 1. >>> features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width'] >>> df_train, df_test = df.ml.train_test_split() - >>> params = { 'oaa': '3', 'P': 1} + >>> params = {'oaa': '3', 'P': 1} >>> booster = vaex.ml.vowpalwabbit.VowpalWabbitModel(features=features, target='class_', epochs=100, params=params) >>> booster.fit(df_train) >>> df_train = booster.transform(df_train) From 0f15cafa3178aeb3e696ef89e0f861835984cb8e Mon Sep 17 00:00:00 2001 From: xdssio <37710064+xdssio@users.noreply.github.com> Date: Sun, 12 Dec 2021 19:11:03 +0100 Subject: [PATCH 04/11] Update packages/vaex-ml/vaex/ml/vowpalwabbit.py rename-fix Co-authored-by: Jovan Veljanoski --- packages/vaex-ml/vaex/ml/vowpalwabbit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/vaex-ml/vaex/ml/vowpalwabbit.py b/packages/vaex-ml/vaex/ml/vowpalwabbit.py index 4e0ba9047c..232cdda9d1 100644 --- a/packages/vaex-ml/vaex/ml/vowpalwabbit.py +++ b/packages/vaex-ml/vaex/ml/vowpalwabbit.py @@ -37,7 +37,7 @@ class VowpalWabbitModel(state.HasState): >>> df_train, df_test = df.ml.train_test_split() >>> params = {'oaa': '3', 'P': 1} >>> booster = vaex.ml.vowpalwabbit.VowpalWabbitModel(features=features, target='class_', epochs=100, params=params) - >>> booster.fit(df_train) + >>> vw_model.fit(df_train) >>> df_train = booster.transform(df_train) >>> df_train.head(3) # sepal_width petal_length sepal_length petal_width class_ vowpalwabbit_prediction From 7c3dfc5027e9e9b6a2eb59433610f63bee2b5a6b Mon Sep 17 00:00:00 2001 From: xdssio <37710064+xdssio@users.noreply.github.com> Date: Sun, 12 Dec 2021 19:11:16 +0100 Subject: [PATCH 05/11] Update packages/vaex-ml/vaex/ml/vowpalwabbit.py Co-authored-by: Jovan Veljanoski --- packages/vaex-ml/vaex/ml/vowpalwabbit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/vaex-ml/vaex/ml/vowpalwabbit.py b/packages/vaex-ml/vaex/ml/vowpalwabbit.py index 232cdda9d1..2b5a1fadc2 100644 --- a/packages/vaex-ml/vaex/ml/vowpalwabbit.py +++ b/packages/vaex-ml/vaex/ml/vowpalwabbit.py @@ -38,7 +38,7 @@ class VowpalWabbitModel(state.HasState): >>> params = {'oaa': '3', 'P': 1} >>> booster = vaex.ml.vowpalwabbit.VowpalWabbitModel(features=features, target='class_', epochs=100, params=params) >>> vw_model.fit(df_train) - >>> df_train = booster.transform(df_train) + >>> df_train = vw_model.transform(df_train) >>> df_train.head(3) # sepal_width petal_length sepal_length petal_width class_ vowpalwabbit_prediction 0 3 4.5 5.4 1.5 2 2 From a7efd24d3dd322d4782e5e71a81e07d01be31da2 Mon Sep 17 00:00:00 2001 From: xdssio <37710064+xdssio@users.noreply.github.com> Date: Sun, 12 Dec 2021 19:11:24 +0100 Subject: [PATCH 06/11] Update packages/vaex-ml/vaex/ml/vowpalwabbit.py Co-authored-by: Jovan Veljanoski --- packages/vaex-ml/vaex/ml/vowpalwabbit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/vaex-ml/vaex/ml/vowpalwabbit.py b/packages/vaex-ml/vaex/ml/vowpalwabbit.py index 2b5a1fadc2..d95dca2a3b 100644 --- a/packages/vaex-ml/vaex/ml/vowpalwabbit.py +++ b/packages/vaex-ml/vaex/ml/vowpalwabbit.py @@ -44,7 +44,7 @@ class VowpalWabbitModel(state.HasState): 0 3 4.5 5.4 1.5 2 2 1 3.4 1.6 4.8 0.2 1 1 2 3.1 4.9 6.9 1.5 2 2 - >>> df_test = booster.transform(df_test) + >>> df_test = vw_model.transform(df_test) >>> df_test.head(3) # sepal_width petal_length sepal_length petal_width class_ vowpalwabbit_prediction 0 3 4.2 5.9 1.5 2 2 From ffb174f058b21ebfba9735cf9aa1fcfdf2f199fb Mon Sep 17 00:00:00 2001 From: xdssio <37710064+xdssio@users.noreply.github.com> Date: Sun, 12 Dec 2021 19:11:37 +0100 Subject: [PATCH 07/11] Update packages/vaex-ml/vaex/ml/vowpalwabbit.py Co-authored-by: Jovan Veljanoski --- packages/vaex-ml/vaex/ml/vowpalwabbit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/vaex-ml/vaex/ml/vowpalwabbit.py b/packages/vaex-ml/vaex/ml/vowpalwabbit.py index d95dca2a3b..7da969e581 100644 --- a/packages/vaex-ml/vaex/ml/vowpalwabbit.py +++ b/packages/vaex-ml/vaex/ml/vowpalwabbit.py @@ -32,7 +32,7 @@ class VowpalWabbitModel(state.HasState): >>> import vaex.ml >>> import vaex.ml.vowpalwabbit >>> df = vaex.ml.datasets.load_iris() - >>> df['class_'] = df['class_']+1 # Vowpal Wabbit classification is an int stareting from 1. + >>> df['class_'] = df['class_']+1 # Vowpal Wabbit classification target should be an int starting from 1. >>> features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width'] >>> df_train, df_test = df.ml.train_test_split() >>> params = {'oaa': '3', 'P': 1} From fe9aceea74b904f868fcd846217f47d591a0052d Mon Sep 17 00:00:00 2001 From: xdssio Date: Wed, 22 Dec 2021 16:13:38 +0100 Subject: [PATCH 08/11] implement rename a map --- packages/vaex-core/vaex/dataframe.py | 14 ++++++++++++++ tests/rename_test.py | 12 ++++++++++++ 2 files changed, 26 insertions(+) diff --git a/packages/vaex-core/vaex/dataframe.py b/packages/vaex-core/vaex/dataframe.py index eebdb56904..d2fc814892 100644 --- a/packages/vaex-core/vaex/dataframe.py +++ b/packages/vaex-core/vaex/dataframe.py @@ -3664,6 +3664,20 @@ def add_virtual_column(self, name, expression, unique=False): self._save_assign_expression(valid_name) self.signal_column_changed.emit(self, valid_name, "add") + def renames(self, names, unique=False): + """Renames a column or variable, and rewrite expressions such that they refer to the new name""" + columns = self.get_column_names() + ret = [] + for name, new_name in names.items(): + if name == new_name: + continue + if name not in columns: + continue + new_name = vaex.utils.find_valid_name(new_name, used=None if not unique else self.get_column_names(hidden=True)) + self._rename(name, new_name, rename_meta_data=True) + ret.append(new_name) + return ret + def rename(self, name, new_name, unique=False): """Renames a column or variable, and rewrite expressions such that they refer to the new name""" if name == new_name: diff --git a/tests/rename_test.py b/tests/rename_test.py index 066369f7e0..70c4a060f9 100644 --- a/tests/rename_test.py +++ b/tests/rename_test.py @@ -1,5 +1,17 @@ from common import * + +def test_renames(df_local): + ds = df_local + new_columns = ds.renames({'x': 'x1', 'y': 'y1'}) + assert new_columns == ['x1', 'y1'] + current_columns = ds.get_column_names() + for column in new_columns: + assert column in current_columns + for column in ['x', 'y']: + assert column not in current_columns + + def test_rename(ds_filtered): ds = ds_filtered ds['r'] = ds.x From a0f01e22b046c1721d8e34e71daf77c52c1bf1cb Mon Sep 17 00:00:00 2001 From: xdssio <37710064+xdssio@users.noreply.github.com> Date: Wed, 22 Dec 2021 16:31:43 +0100 Subject: [PATCH 09/11] Delete vowpalwabbit_test.py unrelated --- tests/ml/vowpalwabbit_test.py | 50 ----------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 tests/ml/vowpalwabbit_test.py diff --git a/tests/ml/vowpalwabbit_test.py b/tests/ml/vowpalwabbit_test.py deleted file mode 100644 index 877d527ce1..0000000000 --- a/tests/ml/vowpalwabbit_test.py +++ /dev/null @@ -1,50 +0,0 @@ -import sys - -import pytest - -pytest.importorskip("vowpalwabbit") -from sklearn.metrics import accuracy_score -import vaex.ml.vowpalwabbit -import vaex.ml.datasets - -params = {'oaa': '3', 'P': 1, 'enable_logging': True} - - -@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher") -def test_vowpalwabbit(df_iris): - ds = df_iris - - ds['class_'] = ds['class_'] + 1 # VW classification starts from 1 - ds['x'] = ds.sepal_length * 1 - ds['y'] = ds.sepal_width * 1 - ds['w'] = ds.petal_length * 1 - ds['z'] = ds.petal_width * 1 - ds_train, ds_test = ds.ml.train_test_split(test_size=0.2, verbose=False) - features = ['x', 'y', 'z', 'w'] - - params = {'oaa': '3', 'P': 1, 'link': 'logistic', 'enable_logging': True} - model = vaex.ml.vowpalwabbit.VowpalWabbitModel( - params=params, - features=features, - target='class_') - model.fit(ds_train, passes=5) - assert 0 < accuracy_score(ds_test.col.class_.values, model.predict(ds_test)) - - ds_train = model.transform(ds_train) # this will add the vw column - state = ds_train.state_get() - ds_test.state_set(state) - - -@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher") -def test_vowpalwabbit_serialize(tmpdir, df_iris): - ds = df_iris - ds['class_'] = ds['class_'] + 1 # VW classification starts from 1 - features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] - model = vaex.ml.vowpalwabbit.VowpalWabbitModel( - params=params, - features=features, - target='class_') - model.fit(ds) - pl = vaex.ml.Pipeline([model]) - pl.save(str(tmpdir.join('test.json'))) - pl.load(str(tmpdir.join('test.json'))) From 7fa267a547067baab7a896d77820cbabb925449b Mon Sep 17 00:00:00 2001 From: xdssio <37710064+xdssio@users.noreply.github.com> Date: Wed, 22 Dec 2021 16:32:00 +0100 Subject: [PATCH 10/11] Delete vowpalwabbit.py unrelated --- packages/vaex-ml/vaex/ml/vowpalwabbit.py | 149 ----------------------- 1 file changed, 149 deletions(-) delete mode 100644 packages/vaex-ml/vaex/ml/vowpalwabbit.py diff --git a/packages/vaex-ml/vaex/ml/vowpalwabbit.py b/packages/vaex-ml/vaex/ml/vowpalwabbit.py deleted file mode 100644 index 7da969e581..0000000000 --- a/packages/vaex-ml/vaex/ml/vowpalwabbit.py +++ /dev/null @@ -1,149 +0,0 @@ -import base64 -import tempfile - -import numpy as np -import pandas as pd -import traitlets -import vaex.serialize -from sklearn.utils import shuffle -from vowpalwabbit.DFtoVW import DFtoVW -from vowpalwabbit.pyvw import vw - -from . import generate -from . import state - - -@vaex.serialize.register -@generate.register -class VowpalWabbitModel(state.HasState): - '''The Vowpal Wabbit algorithm. - - This class provides an interface to the Vowpal Wabbit package. - - Vowpal Wabbit provides fast, efficient, and flexible online machine learning - techniques for reinforcement learning, supervised learning, and more. - It is influenced by an ecosystem of community contributions, academic research, and proven algorithms. - Microsoft Research is a major contributor to Vowpal Wabbit. - - For more information, please visit https://vowpalwabbit.org/index.html. - - Example: - - >>> import vaex.ml - >>> import vaex.ml.vowpalwabbit - >>> df = vaex.ml.datasets.load_iris() - >>> df['class_'] = df['class_']+1 # Vowpal Wabbit classification target should be an int starting from 1. - >>> features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width'] - >>> df_train, df_test = df.ml.train_test_split() - >>> params = {'oaa': '3', 'P': 1} - >>> booster = vaex.ml.vowpalwabbit.VowpalWabbitModel(features=features, target='class_', epochs=100, params=params) - >>> vw_model.fit(df_train) - >>> df_train = vw_model.transform(df_train) - >>> df_train.head(3) - # sepal_width petal_length sepal_length petal_width class_ vowpalwabbit_prediction - 0 3 4.5 5.4 1.5 2 2 - 1 3.4 1.6 4.8 0.2 1 1 - 2 3.1 4.9 6.9 1.5 2 2 - >>> df_test = vw_model.transform(df_test) - >>> df_test.head(3) - # sepal_width petal_length sepal_length petal_width class_ vowpalwabbit_prediction - 0 3 4.2 5.9 1.5 2 2 - 1 3 4.6 6.1 1.4 2 2 - 2 2.9 4.6 6.6 1.3 2 2 - ''' - snake_name = 'vowpalwabbit_model' - features = traitlets.List(traitlets.Unicode(), help='List of features to use when fitting the Vowpal Wabbit.') - target = traitlets.Unicode(allow_none=False, help='The name of the target column.') - passes = traitlets.CInt(help='Number of iterations.') - params = traitlets.Dict(default_value={}, help='parameters to be passed on the to the Vowpal Wabbit model.') - prediction_name = traitlets.Unicode(default_value='vowpalwabbit_prediction', - help='The name of the virtual column housing the predictions.') - - def __call__(self, *args): - data2d = np.array(args).T - return self.predict(data2d) - - def transform(self, df): - '''Transform a DataFrame such that it contains the predictions of the - Vowpal Wabbit in form of a virtual column. - - :param df: A vaex DataFrame. - - :return copy: A shallow copy of the DataFrame that includes the Vowpal Wabbit prediction as a virtual column. - :rtype: DataFrame - ''' - copy = df.copy() - lazy_function = copy.add_function('vowpalwabbit_prediction_function', self, unique=True) - expression = lazy_function(*self.features) - copy.add_virtual_column(self.prediction_name, expression, unique=False) - return copy - - def fit(self, df, passes=1, chunk_size=500, partial_fit=False): - """Fit the VowpalWabbitModel to the DataFrame. - :param df: A vaex DataFrame containing the features and target on which to train the model. - :param int passes: Number of passes over the data - :param int chunk_size: Size of chunks to iterate - """ - passes = passes or self.passes - params = {k: v for k, v in self.params.items() if v is not None} - target = self.target - features = self.features - model = self.model if (hasattr(self, 'model') and self.model is not None and partial_fit) else vw(**self.params) - for n in range(passes): - for _, _, X in df.to_pandas_df(chunk_size=chunk_size): - if n > 1: - X = shuffle(X) - for ex in DFtoVW.from_colnames(df=X, y=target, x=features).convert_df(): - model.learn(ex) - self.model = model - return self - - def predict(self, df, **kwargs): - '''Get an in-memory numpy array with the predictions of the VowpalWabbitModel on a vaex DataFrame. - This method accepts the key word arguments of the predict method from VowpalWabbit. - - :param df: A vaex DataFrame. - - :returns: A in-memory numpy array containing the VowpalWabbitModel predictions. - :rtype: numpy.array - ''' - if self.model is None: - raise RuntimeError("model is not fitted") - X = pd.DataFrame(df, columns=self.features) if isinstance(df, np.ndarray) else df[ - self.features].to_pandas_df() - X[self.target] = 1 # DFtoVW.from_colnames issue - will be ignored in predictions - examples = DFtoVW.from_colnames(df=X, y=self.target, x=self.features).convert_df() - return np.array([self.model.predict(ex) for ex in examples]) - - def _encode_vw(self): - if self.model is None: - return None - if isinstance(self.model, bytes): - return self.model - filename = tempfile.mktemp() - self.model.save(filename) - with open(filename, 'rb') as f: - model_data = f.read() - return base64.encodebytes(model_data).decode('ascii') - - def _decode_vw(self, encoding): - if encoding is None: - return vw(**self.params) - if isinstance(encoding, str): - model_data = base64.decodebytes(encoding.encode('ascii')) - openfilename = tempfile.mktemp() - with open(openfilename, 'wb') as f: - f.write(model_data) - params = self.params.copy() - params['i'] = openfilename - return vw(**params) - else: - return encoding - - def state_get(self): - return dict(model_state=self._encode_vw(), - substate=super(VowpalWabbitModel, self).state_get()) - - def state_set(self, state, trusted=True): - super(VowpalWabbitModel, self).state_set(state['substate']) - self.model = self._decode_vw(state['model_state']) From 768dd65bd9c76232f8beccc157b0cdc2ee3fd629 Mon Sep 17 00:00:00 2001 From: xdssio Date: Tue, 4 Jan 2022 13:40:45 +0100 Subject: [PATCH 11/11] rename accepts a dict --- packages/vaex-core/vaex/dataframe.py | 6 ++++-- tests/rename_test.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/packages/vaex-core/vaex/dataframe.py b/packages/vaex-core/vaex/dataframe.py index d2fc814892..4b7de7666b 100644 --- a/packages/vaex-core/vaex/dataframe.py +++ b/packages/vaex-core/vaex/dataframe.py @@ -3664,7 +3664,7 @@ def add_virtual_column(self, name, expression, unique=False): self._save_assign_expression(valid_name) self.signal_column_changed.emit(self, valid_name, "add") - def renames(self, names, unique=False): + def _renames(self, names, unique=False): """Renames a column or variable, and rewrite expressions such that they refer to the new name""" columns = self.get_column_names() ret = [] @@ -3678,8 +3678,10 @@ def renames(self, names, unique=False): ret.append(new_name) return ret - def rename(self, name, new_name, unique=False): + def rename(self, name, new_name=None, unique=False): """Renames a column or variable, and rewrite expressions such that they refer to the new name""" + if isinstance(name, dict): + return self._renames(name, unique=unique) if name == new_name: return new_name = vaex.utils.find_valid_name(new_name, used=None if not unique else self.get_column_names(hidden=True)) diff --git a/tests/rename_test.py b/tests/rename_test.py index 70c4a060f9..4ccec6aa3a 100644 --- a/tests/rename_test.py +++ b/tests/rename_test.py @@ -3,7 +3,7 @@ def test_renames(df_local): ds = df_local - new_columns = ds.renames({'x': 'x1', 'y': 'y1'}) + new_columns = ds.rename({'x': 'x1', 'y': 'y1'}) assert new_columns == ['x1', 'y1'] current_columns = ds.get_column_names() for column in new_columns: