Merge pull request #517 from Kimoby/refactor-hp-repo-automl
Refactor of the AutoML Loop, Trainer, and HyperparamsRepos
guillaume-chevalier authored Mar 30, 2022
2 parents c5627e6 + 98d68f7 commit af96f79
Showing 110 changed files with 10,389 additions and 11,207 deletions.
2 changes: 1 addition & 1 deletion .github/pull_request_template.md
@@ -45,7 +45,7 @@ Things to check each time you contribute:
- [ ] Your local Git username is set to your GitHub username, and your local Git email is set to your GitHub email. This is important to avoid breaking the cla-bot and for your contributions to be linked to your profile. More info: https://github.com/settings/emails
- [ ] Argument's dimensions and types are specified for new steps (important), with examples in docstrings when needed.
- [ ] Class names and argument / API variables are very clear: there is no possible ambiguity. They also respect the existing code style (avoid duplicating words for the same concept) and are intuitive.
- [ ] Use typing like `variable: Typing = ...` as much as possible. Also use typing for function arguments and return values like `def my_func(self, my_list: Dict[int, List[str]]) -> OrderedDict[int, str]:`.
- [ ] Use typing like `variable: Typing = ...` as much as possible. Also use typing for function arguments and return values like `def my_func(self, my_list: Dict[int, List[str]]) -> 'OrderedDict[int, str]':`.
- [ ] Classes are documented: their behavior is explained beyond just the title of the class. You may even use the description written in your pull request above to fill some docstrings accurately.
- [ ] If a numpy array is used, it is important to remember that these arrays are a special type that must be documented accordingly, and that numpy array should not be abused. This is because Neuraxle is a library that is not only limited to transforming numpy arrays. To this effect, numpy steps should probably be located in the existing numpy python files as much as possible, and not be all over the place. The same applies to Pandas DataFrames.
- [ ] Code coverage is above 90% for the added code for the unit tests.
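The typing checklist item above, with its quoted 'OrderedDict[int, str]' return annotation, can be read as the following minimal sketch (names are illustrative, not from this commit):

    from collections import OrderedDict
    from typing import Dict, List

    class Example:
        threshold: float = 0.5  # a typed attribute with a default value

        def my_func(self, my_list: Dict[int, List[str]]) -> 'OrderedDict[int, str]':
            # Quoting the return annotation avoids a runtime TypeError on
            # Python < 3.9, where collections.OrderedDict is not subscriptable.
            return OrderedDict((k, ', '.join(v)) for k, v in sorted(my_list.items()))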
1 change: 0 additions & 1 deletion .github/workflows/license_checker_v2.py
@@ -63,7 +63,6 @@ def is_license_in_list(license, license_list):
library_license_dict[library_name] = library_license
print(f"{library_name}: {library_license}")
# First checks if it's in refused_licenses, then if it's in accepted_licenses, else add in the maybe list
# TODO : Should use regex instead?

if is_license_in_list(library_license, args.forbidden_licenses):
refused_libraries.append(library_name)
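The removed TODO asked whether regex matching would be more robust than list membership; a minimal sketch of that idea, as a hypothetical replacement for is_license_in_list (not part of this commit):

    import re
    from typing import List

    def is_license_in_list(license: str, license_list: List[str]) -> bool:
        # Case-insensitive whole-word match, so e.g. 'BSD' matches
        # 'BSD License' without also matching substrings of longer tokens.
        return any(
            re.search(r'\b' + re.escape(known) + r'\b', license, re.IGNORECASE)
            for known in license_list
        )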
9 changes: 8 additions & 1 deletion .gitignore
@@ -46,6 +46,7 @@ coverage.xml
*.cover
.hypothesis/
.pytest_cache/
prof/

# Translations
*.mo
@@ -82,6 +83,7 @@ celerybeat-schedule
*.sage.py

# Environments
venv
.env
.venv
env/
@@ -106,11 +108,15 @@ venv.bak/
# IDEs
.idea
.vscode
.style.yapf
vandelay-py.js
appmap.yml
tmp

# Other
.DS_Store
___*

todo.txt
**cache/**
**caching/**
cache/**
@@ -119,4 +125,5 @@ testing/examples/cache/**
testing/cache/**
testing/cache/*
cov.xml
profile.sh

10 changes: 4 additions & 6 deletions README.rst
@@ -61,7 +61,7 @@ For example, you can build a time series processing pipeline as such:
.. code:: python
p = Pipeline([
TrainOnly(DataShuffler()),
TrainOnlyWrapper(DataShuffler()),
WindowTimeSeries(),
MiniBatchSequentialPipeline([
Tensorflow2ModelStep(
@@ -113,7 +113,7 @@ You can also tune your hyperparameters using AutoML algorithms such as the TPE:
use_linear_forgetting_weights=False,
number_recent_trial_at_full_weights=25
),
validation_splitter=ValidationSplitter(test_size=0.20),
validation_splitter=ValidationSplitter(validation_size=0.20),
scoring_callback=ScoringCallback(accuracy_score, higher_score_is_better=True),
callbacks=[
MetricCallback(f1_score, higher_score_is_better=True),
@@ -122,17 +122,15 @@ You can also tune your hyperparameters using AutoML algorithms such as the TPE:
],
n_trials=7,
epochs=10,
hyperparams_repository=HyperparamsJSONRepository(cache_folder='cache'),
refit_trial=True,
refit_best_trial=True,
)
# Load data, and launch AutoML loop !
X_train, y_train, X_test, y_test = generate_classification_data()
auto_ml = auto_ml.fit(X_train, y_train)
# Get the model from the best trial, and make predictions using predict.
best_pipeline = auto_ml.get_best_model()
y_pred = best_pipeline.predict(X_test)
y_pred = auto_ml.predict(X_test)
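To persist trials to disk rather than keeping them in memory, the refactored repository class used later in this diff can be passed to AutoML; a minimal sketch (other arguments as above), assuming the import paths shown in the kata example below:

    from neuraxle.metaopt.auto_ml import AutoML, ValidationSplitter
    from neuraxle.metaopt.callbacks import ScoringCallback
    from neuraxle.metaopt.data.json_repo import HyperparamsOnDiskRepository
    from sklearn.metrics import accuracy_score

    auto_ml = AutoML(
        pipeline=pipeline,  # the pipeline defined above
        validation_splitter=ValidationSplitter(validation_size=0.20),
        scoring_callback=ScoringCallback(accuracy_score, higher_score_is_better=True),
        n_trials=7,
        epochs=10,
        refit_best_trial=True,
        # Trials are persisted as JSON files under the given cache folder:
        hyperparams_repository=HyperparamsOnDiskRepository(cache_folder='cache'),
    )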
--------------
3 changes: 2 additions & 1 deletion coverage.sh
@@ -1,4 +1,5 @@
#!/usr/bin/env bash
./flake8.sh
pytest --cov-report html --cov-report xml:cov.xml --cov=neuraxle testing
pytest -n 7 --cov-report html --cov-report xml:cov.xml --cov=neuraxle testing
# pytest --cov-report html --cov=neuraxle testing; open htmlcov/index.html

33 changes: 13 additions & 20 deletions examples/Handler Methods.ipynb
@@ -54,7 +54,6 @@
"* Edit the [DataContainer](https://www.neuraxle.org/stable/api/neuraxle.data_container.html#neuraxle.data_container.DataContainer)\n",
"* Call a method on a step\n",
"* Mini-Batching (see [MiniBatchSequentialPipeline](https://www.neuraxle.org/stable/api/neuraxle.pipeline.html#neuraxle.pipeline.MiniBatchSequentialPipeline))\n",
"* Caching (see [neuraxle.checkpoint](https://www.neuraxle.org/stable/api/neuraxle.checkpoints.html) package)\n",
"* etc.\n",
"\n",
"### [HandleOnlyMixin](https://www.neuraxle.org/stable/api/neuraxle.base.html#neuraxle.base.HandleOnlyMixin)\n",
@@ -97,14 +96,13 @@
"def _transform_data_container(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:\n",
" output_data_container: ListDataContainer = ListDataContainer.empty()\n",
"\n",
" for current_id, di, eo in data_container:\n",
" for _id, di, eo in data_container:\n",
" output: DataContainer = self.wrapped.handle_transform(\n",
" DataContainer(summary_id=data_container.summary_id, current_ids=None, data_inputs=di, expected_outputs=eo),\n",
" DataContainer(data_inputs=di, expected_outputs=eo),\n",
" context\n",
" )\n",
"\n",
" output_data_container.append(current_id, output.data_inputs, output.expected_outputs)\n",
" output_data_container.summary_id = data_container.summary_id\n",
" output_data_container.append(_id, output.data_inputs, output.expected_outputs)\n",
"\n",
" return output_data_container"
]
@@ -160,10 +158,10 @@
"\n",
"\n",
"class OutputTransformerWrapper(ForceHandleOnlyMixin, MetaStepMixin, BaseStep):\n",
" def __init__(self, wrapped, cache_folder_when_no_handle=None):\n",
" def __init__(self, wrapped):\n",
" BaseStep.__init__(self)\n",
" MetaStepMixin.__init__(self, wrapped)\n",
" ForceHandleOnlyMixin.__init__(self, cache_folder_when_no_handle)"
" ForceHandleOnlyMixin.__init__(self)"
]
},
{
@@ -185,8 +183,7 @@
" new_expected_outputs_data_container = self.wrapped.handle_transform(\n",
" DataContainer(\n",
" data_inputs=data_container.expected_outputs, \n",
" current_ids=data_container.current_ids, \n",
" expected_outputs=None\n",
" ids=data_container.ids, \n",
" ), \n",
" context\n",
" )\n",
@@ -214,8 +211,7 @@
" self.wrapped = self.wrapped.handle_fit(\n",
" DataContainer(\n",
" data_inputs=data_container.expected_outputs, \n",
" current_ids=data_container.current_ids, \n",
" expected_outputs=None),\n",
" ids=data_container.ids),\n",
" context\n",
" )\n",
"\n",
@@ -242,8 +238,7 @@
" self.wrapped, new_expected_outputs_data_container = self.wrapped.handle_fit_transform(\n",
" DataContainer(\n",
" data_inputs=data_container.expected_outputs, \n",
" current_ids=data_container.current_ids,\n",
" expected_outputs=None\n",
" ids=data_container.ids\n",
" ),\n",
" context\n",
" )\n",
@@ -270,15 +265,14 @@
"\n",
"\n",
"class OutputTransformerWrapper(ForceHandleOnlyMixin, MetaStepMixin, BaseStep):\n",
" def __init__(self, wrapped, cache_folder_when_no_handle=None):\n",
" def __init__(self, wrapped):\n",
" BaseStep.__init__(self)\n",
" MetaStepMixin.__init__(self, wrapped)\n",
" ForceHandleOnlyMixin.__init__(self, cache_folder_when_no_handle)\n",
" ForceHandleOnlyMixin.__init__(self)\n",
"\n",
" def _transform_data_container(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:\n",
" new_expected_outputs_data_container = self.wrapped.handle_transform(\n",
" DataContainer(data_inputs=data_container.expected_outputs, current_ids=data_container.current_ids,\n",
" expected_outputs=None),\n",
" DataContainer(data_inputs=data_container.expected_outputs, ids=data_container.ids),\n",
" context\n",
" )\n",
" data_container.set_expected_outputs(new_expected_outputs_data_container.data_inputs)\n",
@@ -287,16 +281,15 @@
"\n",
" def _fit_data_container(self, data_container: DataContainer, context: ExecutionContext) -> (BaseStep, DataContainer):\n",
" self.wrapped = self.wrapped.handle_fit(\n",
" DataContainer(data_inputs=data_container.expected_outputs, current_ids=data_container.current_ids,\n",
" expected_outputs=None),\n",
" DataContainer(data_inputs=data_container.expected_outputs, ids=data_container.ids),\n",
" context\n",
" )\n",
"\n",
" return self, data_container\n",
"\n",
" def _fit_transform_data_container(self, data_container: DataContainer, context: ExecutionContext) -> (BaseStep, DataContainer):\n",
" self.wrapped, new_expected_outputs_data_container = self.wrapped.handle_fit_transform(\n",
" DataContainer(data_inputs=data_container.expected_outputs, current_ids=data_container.current_ids, expected_outputs=None),\n",
" DataContainer(data_inputs=data_container.expected_outputs, ids=data_container.ids),\n",
" context\n",
" )\n",
" data_container.set_expected_outputs(new_expected_outputs_data_container.data_inputs)\n",
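The cells above reflect this commit's DataContainer refactor: current_ids becomes ids, and the summary_id / explicit expected_outputs=None plumbing disappears. A minimal sketch of the updated constructor and iteration, with illustrative data:

    from neuraxle.data_container import DataContainer

    dc = DataContainer(
        ids=[0, 1],  # formerly the current_ids argument
        data_inputs=[[0.0, 1.0], [2.0, 3.0]],
        expected_outputs=[0, 1],
    )
    # Iterating a DataContainer yields (id, data_input, expected_output) triples:
    for _id, di, eo in dc:
        print(_id, di, eo)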
1 change: 0 additions & 1 deletion examples/Hyperparams And Distributions.ipynb
@@ -305,7 +305,6 @@
"\n",
"hd = ScipyDistributionWrapper(\n",
" scipy_distribution=randint(low=0, high=10),\n",
" is_continuous=False,\n",
" null_default_value=0\n",
")"
]
18 changes: 9 additions & 9 deletions examples/Introduction to Automatic Hyperparameter Tuning.ipynb

Large diffs are not rendered by default.

9 changes: 3 additions & 6 deletions examples/Introduction to Time Series Processing.ipynb
@@ -657,22 +657,19 @@
"metadata": {},
"outputs": [],
"source": [
"from neuraxle.metaopt.auto_ml import AutoML, InMemoryHyperparamsRepository, ValidationSplitter, \\\n",
" RandomSearchHyperparameterSelectionStrategy\n",
"#from neuraxle.metaopt.tpe import TreeParzenEstimatorSelectionStrategy\n",
"#from neuraxle.metaopt.auto_ml import HyperparamsJSONRepository\n",
"from neuraxle.metaopt.auto_ml import AutoML, ValidationSplitter\n",
"from neuraxle.metaopt.validation import RandomSearchSampler\n",
"from neuraxle.metaopt.callbacks import ScoringCallback\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"\n",
"auto_ml = AutoML(\n",
" pipeline=pipeline,\n",
" hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),\n",
" hyperparams_optimizer=RandomSearchSampler(),\n",
" validation_splitter=ValidationSplitter(test_size=0.20),\n",
" scoring_callback=ScoringCallback(accuracy_score, higher_score_is_better=True),\n",
" n_trials=10,\n",
" epochs=1,\n",
" hyperparams_repository=InMemoryHyperparamsRepository(cache_folder=cache_folder),\n",
" refit_trial=True,\n",
" # callbacks=[MetricCallbacks(...)]\n",
")"
50 changes: 24 additions & 26 deletions examples/auto_ml/plot_automl_loop_clean_kata.py
@@ -5,7 +5,7 @@
This demonstrates how you can build an AutoML loop that finds the best possible sklearn classifier.
It also shows you how to add hyperparams to sklearn steps using SKLearnWrapper.
This example has been derived and simplified from the following repository: https://github.com/Neuraxio/Kata-Clean-Machine-Learning-From-Dirty-Code
Here, 2D data is fitted, whereas in the original example 3D (sequential / time series) data is preprocessed and then fitted with the same models.
..
Copyright 2019, Neuraxio Inc.
@@ -25,28 +25,29 @@
"""
import shutil

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

from neuraxle.hyperparams.distributions import Choice, RandInt, Boolean, LogUniform
from neuraxle.base import ExecutionContext as CX
from neuraxle.hyperparams.distributions import (Boolean, Choice, LogUniform,
RandInt)
from neuraxle.hyperparams.space import HyperparameterSpace
from neuraxle.metaopt.auto_ml import AutoML, RandomSearchHyperparameterSelectionStrategy, ValidationSplitter, \
HyperparamsJSONRepository
from neuraxle.metaopt.auto_ml import (AutoML, RandomSearchSampler,
ValidationSplitter)
from neuraxle.metaopt.callbacks import ScoringCallback
from neuraxle.metaopt.data.json_repo import HyperparamsOnDiskRepository
from neuraxle.pipeline import Pipeline
from neuraxle.steps.flow import ChooseOneStepOf
from neuraxle.steps.numpy import NumpyRavel
from neuraxle.steps.output_handlers import OutputTransformerWrapper
from neuraxle.steps.sklearn import SKLearnWrapper
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier


def main():
def main(tmpdir: str):
# Define classification models, and hyperparams.
# See also HyperparameterSpace documentation : https://www.neuraxle.org/stable/api/neuraxle.hyperparams.space.html#neuraxle.hyperparams.space.HyperparameterSpace

decision_tree_classifier = SKLearnWrapper(
DecisionTreeClassifier(),
@@ -97,7 +98,7 @@ def main():
]).set_name('RandomForestClassifier')

# Define a classification pipeline that lets the AutoML loop choose one of the classifier.
# See also ChooseOneStepOf documentation : https://www.neuraxle.org/stable/api/neuraxle.steps.flow.html#neuraxle.steps.flow.ChooseOneStepOf
# See also ChooseOneStepOf documentation: https://www.neuraxle.org/stable/api/neuraxle.steps.flow.html#neuraxle.steps.flow.ChooseOneStepOf

pipeline = Pipeline([
ChooseOneStepOf([
@@ -110,17 +111,17 @@
])

# Create the AutoML loop object.
# See also AutoML documentation : https://www.neuraxle.org/stable/api/neuraxle.metaopt.auto_ml.html#neuraxle.metaopt.auto_ml.AutoML
# See also AutoML documentation: https://www.neuraxle.org/stable/api/neuraxle.metaopt.auto_ml.html#neuraxle.metaopt.auto_ml.AutoML

auto_ml = AutoML(
pipeline=pipeline,
hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
validation_splitter=ValidationSplitter(test_size=0.20),
hyperparams_optimizer=RandomSearchSampler(),
validation_splitter=ValidationSplitter(validation_size=0.20),
scoring_callback=ScoringCallback(accuracy_score, higher_score_is_better=True),
n_trials=7,
epochs=1,
hyperparams_repository=HyperparamsJSONRepository(cache_folder='cache'),
refit_trial=True,
hyperparams_repository=HyperparamsOnDiskRepository(cache_folder=tmpdir),
refit_best_trial=True,
continue_loop_on_error=False
)

@@ -129,16 +130,13 @@
X_train, y_train, X_test, y_test = generate_classification_data()
auto_ml = auto_ml.fit(X_train, y_train)

# Get the model from the best trial, and make predictions using predict.
# See also predict documentation : https://www.neuraxle.org/stable/api/neuraxle.base.html#neuraxle.base.BaseStep.predict

best_pipeline = auto_ml.get_best_model()
y_pred = best_pipeline.predict(X_test)
# Get the model from the best trial, and make predictions using predict, as per the `refit_best_trial=True` argument to AutoML.
y_pred = auto_ml.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test accuracy score:", accuracy)

shutil.rmtree('cache')
shutil.rmtree(tmpdir)


def generate_classification_data():
@@ -163,4 +161,4 @@


if __name__ == '__main__':
main()
main(CX.get_new_cache_folder())