diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 1a2f5ab..402a66f 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: [3.6, 3.7, 3.8]
+ python-version: [3.8, 3.9]
steps:
- uses: actions/checkout@v2
@@ -23,7 +23,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install -r requirements.txt
+ pip install .
- name: Lint with flake8
run: |
pip install flake8
@@ -33,5 +33,5 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
- pip install pytest
+ pip install -r tests/requirements.txt
pytest
diff --git a/.gitignore b/.gitignore
index 1f764ff..3e30f77 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ laptop_env/
worker*/
*.dirlock
*.lock
+notes/
# Byte-compiled / optimized / DLL files
__pycache__/
diff --git a/README.md b/README.md
index abea51c..0ae6ef9 100644
--- a/README.md
+++ b/README.md
@@ -39,16 +39,14 @@ from sklearn.datasets import make_blobs
from incremental_trees.models.classification.streaming_rfc import StreamingRFC
# Generate some data in memory
-x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40,
- centers=2, cluster_std=100)
+x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40, centers=2, cluster_std=100)
srfc = StreamingRFC(n_estimators_per_chunk=3,
max_n_estimators=np.inf,
spf_n_fits=30, # Number of calls to .partial_fit()
spf_sample_prop=0.3) # Number of rows to sample each on .partial_fit()
-srfc.fit(x, y,
- sample_weight=np.ones_like(y)) # Optional, gets sampled along with the data
+srfc.fit(x, y, sample_weight=np.ones_like(y)) # Optional, gets sampled along with the data
# Should be n_estimators_per_chunk * spf_n_fits
print(len(srfc.estimators_))
@@ -96,7 +94,7 @@ For example, this can be used to feed .partial_fit() sequentially (although belo
````python
import numpy as np
from sklearn.datasets import make_blobs
-from incremental_trees.trees import StreamingRFC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC
srfc = StreamingRFC(n_estimators_per_chunk=20,
max_n_estimators=np.inf,
@@ -110,11 +108,11 @@ x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40,
n_chunks = 30
chunk_size = int(2e3)
for i in range(n_chunks):
- sample_idx = np.random.randint(0, x.shape[0], chunk_size)
- # Call .partial_fit(), specifying expected classes, also supports other .fit args such as sample_weight
- srfc.partial_fit(x[sample_idx, :], y[sample_idx],
- classes=np.unique(y))
-
+ sample_idx = np.random.randint(0, x.shape[0], chunk_size)
+ # Call .partial_fit(), specifying the expected classes. Other .fit args, such as sample_weight, are also supported.
+ srfc.partial_fit(x[sample_idx, :], y[sample_idx],
+ classes=np.unique(y))
+
# Should be n_chunks * n_estimators_per_chunk
print(len(srfc.estimators_))
print(srfc.score(x, y))
@@ -126,17 +124,17 @@ There are a couple of different model setups worth considering. No idea which wo
#### "Incremental forest"
For the number of chunks/fits, sample rows from X, then fit a number of single trees (with different column subsets), eg.
````python
-srfc = StreamingRFC(n_estimators_per_chunk=10,
- max_features='sqrt')
+srfc = StreamingRFC(n_estimators_per_chunk=10, max_features='sqrt')
````
#### "Incremental decision trees"
Single (or few) decision trees per data subset, with all features.
````python
-srfc = StreamingRFC(n_estimators_per_chunk=1,
- max_features=x.shape[1])
+srfc = StreamingRFC(n_estimators_per_chunk=1, max_features=x.shape[1])
````
# Version history
+## v0.6.0
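+ - Update to work with scikit-learn==1.2, dask==2022.12, dask-glm==0.2.0, dask-ml==2022.5.27. Supports Python 3.8 and 3.9 (3.6 and 3.7 are no longer tested). The `min_impurity_split` parameter has been removed from all models.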
## v0.5.1
- Add support for passing fit args/kwargs via `.fit` (specifically, `sample_weight`)
## v0.5.0
diff --git a/example_dask.py b/example_dask.py
deleted file mode 100644
index 6903f1a..0000000
--- a/example_dask.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import dask as dd
-import dask_ml.cluster
-import dask_ml.datasets
-import numpy as np
-from dask.distributed import Client, LocalCluster
-from dask_ml.wrappers import Incremental
-
-from incremental_trees.trees import StreamingRFC
-
-
-def run_on_blobs():
- x, y = dask_ml.datasets.make_blobs(n_samples=1e8,
- chunks=1e5,
- random_state=0,
- centers=3)
-
- x = dd.dataframe.from_array(x)
- y = dd.dataframe.from_array(y)
-
- print(f"Rows: {x.shape[0].compute()}")
-
- ests_per_chunk = 4
- chunks = len(x.divisions)
-
- srfc = Incremental(StreamingRFC(n_estimators_per_chunk=ests_per_chunk,
- max_n_estimators=np.inf,
- verbose=1,
- n_jobs=4))
- srfc.fit(x, y,
- classes=y.unique().compute())
-
-
-# Create, connect, and run on local cluster.
-with LocalCluster(processes=False,
- n_workers=2,
- threads_per_worker=2,
- scheduler_port=8080,
- diagnostics_port=8081) as cluster, Client(cluster) as client:
- print(client)
- run_on_blobs()
diff --git a/example_fit.py b/example_fit.py
deleted file mode 100644
index bb26fbb..0000000
--- a/example_fit.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import numpy as np
-from sklearn.datasets import make_blobs
-
-from incremental_trees.models.classification.streaming_rfc import StreamingRFC
-
-if __name__ == "__main__":
- # Generate some data in memory
- x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40,
- centers=2, cluster_std=100)
-
- srfc = StreamingRFC(n_estimators_per_chunk=3,
- max_n_estimators=np.inf,
- spf_n_fits=30, # Number of calls to .partial_fit()
- spf_sample_prop=0.3) # Number of rows to sample each on .partial_fit()
-
- srfc.fit(x, y,
- sample_weight=np.ones_like(y)) # Optional
-
- # Should be n_estimators_per_chunk * spf_n_fits
- print(len(srfc.estimators_))
- print(srfc.score(x, y))
diff --git a/incremental_trees/__init__.py b/incremental_trees/__init__.py
index 93b60a1..ef7eb44 100644
--- a/incremental_trees/__init__.py
+++ b/incremental_trees/__init__.py
@@ -1 +1 @@
-__version__ = '0.5.1'
+__version__ = '0.6.0'
diff --git a/incremental_trees/models/classification/streaming_extc.py b/incremental_trees/models/classification/streaming_extc.py
index 9f82686..4ec891e 100644
--- a/incremental_trees/models/classification/streaming_extc.py
+++ b/incremental_trees/models/classification/streaming_extc.py
@@ -1,3 +1,5 @@
+from typing import Optional, Dict, Union
+
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import ExtraTreeClassifier
@@ -10,52 +12,54 @@ class StreamingEXTC(ClassifierAdditions, ClassifierOverloads, ExtraTreesClassifi
"""Overload sklearn.ensemble.ExtraTreesClassifier to add partial fit method and new params."""
def __init__(self,
+ criterion: str = "gini",
+ max_depth: Optional[int] = None,
+ min_samples_split: int = 2,
+ min_samples_leaf: int = 1,
+ min_weight_fraction_leaf: float = 0.0,
+ max_features: float = 1.0,
+ max_leaf_nodes: Optional[int] = None,
+ min_impurity_decrease: float = 0.0,
+ bootstrap: bool = False,
+ oob_score: bool = False,
+ n_jobs: Optional[int] = None,
+ random_state: Optional[int] = None,
+ verbose: int = 0,
+ warm_start: bool = True,
+ class_weight: Optional[Union[str, Dict]] = None,
+ ccp_alpha: float = 0.0,
+ max_samples: Optional[float] = None,
n_estimators_per_chunk: int = 1,
- n_estimators: bool = None,
- max_n_estimators=np.inf,
- criterion="gini",
- max_depth=None,
- min_samples_split=2,
- min_samples_leaf=1,
- min_weight_fraction_leaf=0.,
- max_features="auto",
- max_leaf_nodes=None,
- min_impurity_decrease=0.,
- min_impurity_split=None,
- bootstrap=False,
- oob_score=False,
- n_jobs=None,
- random_state=None,
- verbose=0,
- warm_start=True,
- class_weight=None,
+ max_n_estimators: float = np.inf,
dask_feeding: bool = True,
- spf_n_fits=100,
- spf_sample_prop: float = 0.1):
+ spf_n_fits: int = 100,
+ spf_sample_prop: float = 0.1
+ ):
super(ExtraTreesClassifier, self).__init__(
- base_estimator=ExtraTreeClassifier(),
+ estimator=ExtraTreeClassifier(),
n_estimators=n_estimators_per_chunk,
estimator_params=("criterion", "max_depth", "min_samples_split",
"min_samples_leaf", "min_weight_fraction_leaf",
"max_features", "max_leaf_nodes",
- "min_impurity_decrease", "min_impurity_split",
- "random_state"),
+ "min_impurity_decrease",
+ "random_state", "ccp_alpha"),
bootstrap=bootstrap,
oob_score=oob_score,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose,
warm_start=warm_start,
- class_weight=class_weight)
+ class_weight=class_weight,
+ max_samples=max_samples
+ )
self.max_n_estimators: int = None
self._fit_estimators: int = 0
self.classes_: np.array = None # NB: Needs to be array, not list.
self.n_classes_: int = None
-
self._fit_estimators = 0
self.max_n_estimators = max_n_estimators
- self.n_estimators_per_chunk = n_estimators
+ self.n_estimators_per_chunk = n_estimators_per_chunk
self.criterion = criterion
self.max_depth = max_depth
self.min_samples_split = min_samples_split
@@ -64,7 +68,8 @@ def __init__(self,
self.max_features = max_features
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_decrease = min_impurity_decrease
- self.min_impurity_split = min_impurity_split
+ self.ccp_alpha = ccp_alpha
+ self.max_samples = max_samples
# Set additional params.
self.set_params(n_estimators_per_chunk=n_estimators_per_chunk,
diff --git a/incremental_trees/models/classification/streaming_rfc.py b/incremental_trees/models/classification/streaming_rfc.py
index 54199b3..902cc2b 100644
--- a/incremental_trees/models/classification/streaming_rfc.py
+++ b/incremental_trees/models/classification/streaming_rfc.py
@@ -1,4 +1,4 @@
-import warnings
+from typing import Optional, Union, Dict, List
import numpy as np
from sklearn.ensemble import RandomForestClassifier
@@ -15,27 +15,28 @@ class StreamingRFC(ClassifierAdditions, ClassifierOverloads, RandomForestClassif
"""
def __init__(self,
- bootstrap=True,
- class_weight=None,
- criterion='gini',
- max_depth=None,
- max_features='auto',
- max_leaf_nodes=None,
- min_impurity_decrease=0.0,
- min_impurity_split=None,
- min_samples_leaf=1,
- min_samples_split=2,
- min_weight_fraction_leaf=0.0,
- n_estimators_per_chunk: int = 1,
- n_jobs=None,
- oob_score=False,
- random_state=None,
- verbose=0,
+ criterion: str = 'gini',
+ max_depth: Optional[int] = None,
+ min_samples_split: int = 2,
+ min_samples_leaf: int = 1,
+ min_weight_fraction_leaf: float = 0.0,
+ max_features: Optional[str] = 'sqrt',
+ max_leaf_nodes: Optional[int] = None,
+ min_impurity_decrease: float = 0.0,
+ bootstrap: bool = True,
+ oob_score: bool = False,
+ n_jobs: Optional[int] = None,
+ random_state: Optional[int] = None,
+ verbose: int = 0,
warm_start: bool = True,
+ class_weight: Optional[Union[str, Dict, List[Dict]]] = None,
+ ccp_alpha: float = 0.0,
+ max_samples: Optional[int] = None,
dask_feeding: bool = True,
- max_n_estimators=10,
- spf_n_fits=100,
- spf_sample_prop=0.1) -> None:
+ n_estimators_per_chunk: int = 1,
+ max_n_estimators: int = 10,
+ spf_n_fits: int = 100,
+ spf_sample_prop: float = 0.1) -> None:
"""
:param bootstrap:
:param class_weight:
@@ -44,7 +45,6 @@ def __init__(self,
:param max_features:
:param max_leaf_nodes:
:param min_impurity_decrease:
- :param min_impurity_split:
:param min_samples_leaf:
:param min_samples_split:
:param min_weight_fraction_leaf:
@@ -73,7 +73,6 @@ def __init__(self,
max_features=max_features,
max_leaf_nodes=max_leaf_nodes,
min_impurity_decrease=min_impurity_decrease,
- min_impurity_split=min_impurity_split,
min_samples_leaf=min_samples_leaf,
min_samples_split=min_samples_split,
min_weight_fraction_leaf=min_weight_fraction_leaf,
@@ -89,4 +88,7 @@ def __init__(self,
max_n_estimators=max_n_estimators,
verb=0,
spf_n_fits=spf_n_fits,
- spf_sample_prop=spf_sample_prop)
+ spf_sample_prop=spf_sample_prop,
+ ccp_alpha=ccp_alpha,
+ max_samples=max_samples
+ )
diff --git a/incremental_trees/models/regression/streaming_extr.py b/incremental_trees/models/regression/streaming_extr.py
index 6d565d2..3a05509 100644
--- a/incremental_trees/models/regression/streaming_extr.py
+++ b/incremental_trees/models/regression/streaming_extr.py
@@ -1,3 +1,5 @@
+from typing import Optional, Union
+
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import ExtraTreeRegressor
@@ -8,45 +10,46 @@
class StreamingEXTR(RegressorAdditions, RegressorOverloads, ExtraTreesRegressor):
def __init__(self,
+ criterion: str = "squared_error",
+ max_depth: Optional[int] = None,
+ min_samples_split: int = 2,
+ min_samples_leaf: int = 1,
+ min_weight_fraction_leaf: float = 0.0,
+ max_features: Union[str, float] = 1.0,
+ max_leaf_nodes: Optional[int] = None,
+ min_impurity_decrease: float = 0.0,
+ bootstrap: bool = False,
+ oob_score: bool = False,
+ n_jobs: Optional[int] = None,
+ random_state: Optional[int] = None,
+ verbose: int = 0,
+ warm_start: bool = True,
+ ccp_alpha: float = 0.0,
+ max_samples: Optional[float] = None,
+ max_n_estimators: int = np.inf,
n_estimators_per_chunk: int = 1,
- n_estimators='warn',
- max_n_estimators=np.inf,
- criterion="mse",
- max_depth=None,
- min_samples_split=2,
- min_samples_leaf=1,
- min_weight_fraction_leaf=0.,
- max_features="auto",
- max_leaf_nodes=None,
- min_impurity_decrease=0.,
- min_impurity_split=None,
- bootstrap=False,
- oob_score=False,
- n_jobs=None,
- random_state=None,
- verbose=0,
- warm_start=True,
dask_feeding: bool = True,
spf_n_fits: int = 100,
spf_sample_prop: float = 0.1):
super(ExtraTreesRegressor, self).__init__(
- base_estimator=ExtraTreeRegressor(),
+ estimator=ExtraTreeRegressor(),
n_estimators=n_estimators_per_chunk,
estimator_params=("criterion", "max_depth", "min_samples_split",
"min_samples_leaf", "min_weight_fraction_leaf",
"max_features", "max_leaf_nodes",
- "min_impurity_decrease", "min_impurity_split",
- "random_state"),
+ "min_impurity_decrease", "random_state", "ccp_alpha"),
bootstrap=bootstrap,
oob_score=oob_score,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose,
- warm_start=warm_start)
+ warm_start=warm_start,
+ max_samples=max_samples
+ )
self._fit_estimators = 0
self.max_n_estimators = max_n_estimators
- self.n_estimators_per_chunk = n_estimators
+ self.n_estimators_per_chunk = n_estimators_per_chunk
self.criterion = criterion
self.max_depth = max_depth
self.min_samples_split = min_samples_split
@@ -55,7 +58,8 @@ def __init__(self,
self.max_features = max_features
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_decrease = min_impurity_decrease
- self.min_impurity_split = min_impurity_split
+ self.ccp_alpha = ccp_alpha
+ self.max_samples = max_samples
# Set additional params.
self.set_params(n_estimators_per_chunk=n_estimators_per_chunk,
diff --git a/incremental_trees/models/regression/streaming_rfr.py b/incremental_trees/models/regression/streaming_rfr.py
index a028c93..bdc811a 100644
--- a/incremental_trees/models/regression/streaming_rfr.py
+++ b/incremental_trees/models/regression/streaming_rfr.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
@@ -9,35 +11,32 @@ class StreamingRFR(RegressorAdditions, RegressorOverloads, RandomForestRegressor
"""Overload sklearn.ensemble.RandomForestClassifier to add partial fit method and new params."""
def __init__(self,
- n_estimators='warn',
- criterion="mse",
- max_depth=None,
- min_samples_split=2,
- min_samples_leaf=1,
- min_weight_fraction_leaf=0.,
- max_features="auto",
- max_leaf_nodes=None,
- min_impurity_decrease=0.,
- min_impurity_split=None,
- bootstrap=True,
- oob_score=False,
- n_jobs=None,
- random_state=None,
- verbose=0,
+ criterion: str = "squared_error",
+ max_depth: Optional[int] = None,
+ min_samples_split: int = 2,
+ min_samples_leaf: float = 1,
+ min_weight_fraction_leaf: float = 0.0,
+ max_features: Optional[float] = 1.0,
+ max_leaf_nodes: Optional[int] = None,
+ min_impurity_decrease: float = 0.0,
+ bootstrap: bool = True,
+ oob_score: bool = False,
+ n_jobs: Optional[int] = None,
+ random_state: Optional[int] = None,
+ verbose: int = 0,
n_estimators_per_chunk: int = 1,
warm_start: bool = True,
dask_feeding: bool = True,
- max_n_estimators=10,
- spf_n_fits=100,
- spf_sample_prop=0.1):
+ max_n_estimators: int = 10,
+ spf_n_fits: int = 100,
+ spf_sample_prop: float = 0.1):
super(RandomForestRegressor, self).__init__(
- base_estimator=DecisionTreeRegressor(),
+ estimator=DecisionTreeRegressor(),
n_estimators=n_estimators_per_chunk,
estimator_params=("criterion", "max_depth", "min_samples_split",
"min_samples_leaf", "min_weight_fraction_leaf",
"max_features", "max_leaf_nodes",
- "min_impurity_decrease", "min_impurity_split",
- "random_state"),
+ "min_impurity_decrease", "random_state"),
bootstrap=bootstrap,
oob_score=oob_score,
n_jobs=n_jobs,
@@ -47,7 +46,7 @@ def __init__(self,
self._fit_estimators = 0
self.max_n_estimators = max_n_estimators
- self.n_estimators_per_chunk = n_estimators
+ self.n_estimators_per_chunk = n_estimators_per_chunk
self.criterion = criterion
self.max_depth = max_depth
self.min_samples_split = min_samples_split
@@ -56,7 +55,6 @@ def __init__(self,
self.max_features = max_features
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_decrease = min_impurity_decrease
- self.min_impurity_split = min_impurity_split
# Set additional params.
self.set_params(n_estimators_per_chunk=n_estimators_per_chunk,
diff --git a/incremental_trees/trees.py b/incremental_trees/trees.py
deleted file mode 100644
index 4d29adf..0000000
--- a/incremental_trees/trees.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import numpy as np
-
-from incremental_trees.models.classification.streaming_extc import StreamingEXTC
-from incremental_trees.models.classification.streaming_rfc import StreamingRFC
-from incremental_trees.models.regression.streaming_extr import StreamingEXTR
-from incremental_trees.models.regression.streaming_rfr import StreamingRFR
-
-
-def bunch_of_examples():
- from sklearn.datasets import make_blobs, make_regression
-
- x, y = make_regression(n_samples=int(2e5),
- random_state=0,
- n_features=40)
-
- srfr = StreamingRFR(n_estimators_per_chunk=5,
- spf_n_fits=10,
- dask_feeding=False,
- verbose=0,
- n_jobs=2)
-
- srfr.fit(x, y)
-
- # Fit 10 regressors
- for _ in range(10):
- x, y = make_regression(n_samples=int(2e5),
- random_state=0,
- n_features=40)
-
- srfr = StreamingRFR(n_estimators_per_chunk=5,
- max_n_estimators=100,
- verbose=0,
- n_jobs=5)
-
- chunk_size = int(2e3)
- for _ in range(20):
- sample_idx = np.random.randint(0, x.shape[0], chunk_size)
- srfr.partial_fit(x[sample_idx], y[sample_idx],
- classes=np.unique(y))
-
- print(f"SRFR: {srfr.score(x, y)}")
-
- sext = StreamingEXTR(n_estimators_per_chunk=5,
- max_n_estimators=100,
- verbose=0,
- n_jobs=5)
-
- for _ in range(20):
- sample_idx = np.random.randint(0, x.shape[0], chunk_size)
- sext.partial_fit(x[sample_idx], y[sample_idx],
- classes=np.unique(y))
-
- print(f"SEXTR: {sext.score(x, y)}")
-
- # Fit 10 classifiers
- for _ in range(10):
- x, y = make_blobs(n_samples=int(2e5),
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- srfc = StreamingRFC(n_estimators_per_chunk=5,
- max_n_estimators=100,
- verbose=0,
- n_jobs=5)
-
- chunk_size = int(2e3)
- for _ in range(20):
- sample_idx = np.random.randint(0, x.shape[0], chunk_size)
- srfc.partial_fit(x[sample_idx], y[sample_idx],
- classes=np.unique(y))
-
- print(f"SRFC: {srfc.score(x, y)}")
-
- sext = StreamingEXTC(n_estimators_per_chunk=5,
- max_n_estimators=100,
- verbose=0,
- n_jobs=5)
-
- for _ in range(20):
- sample_idx = np.random.randint(0, x.shape[0], chunk_size)
- sext.partial_fit(x[sample_idx], y[sample_idx],
- classes=np.unique(y))
-
- print(f"SEXTC: {sext.score(x, y)}")
-
-
-if __name__ == '__main__':
- bunch_of_examples()
diff --git a/notes/InconsistentClasses.ipynb b/notes/InconsistentClasses.ipynb
deleted file mode 100644
index dd59dd7..0000000
--- a/notes/InconsistentClasses.ipynb
+++ /dev/null
@@ -1,240 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Change dir to repo root if running from repo (rather than pip installed)\n",
- "# (Assuming running from [repo]/notes/)\n",
- "import os\n",
- "os.chdir('../')\n",
- "\n",
- "%load_ext autoreload\n",
- "%autoreload 2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " a | \n",
- " b | \n",
- " c | \n",
- " target | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " 4 | \n",
- " 4 | \n",
- " 4 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " 4 | \n",
- " 4 | \n",
- " 4 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " 5 | \n",
- " 5 | \n",
- " 5 | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " 5 | \n",
- " 5 | \n",
- " 5 | \n",
- " 3 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " a b c target\n",
- "0 1 1 1 1\n",
- "1 2 2 2 1\n",
- "2 1 1 1 1\n",
- "3 2 2 2 1\n",
- "4 3 3 3 2\n",
- "5 4 4 4 2\n",
- "6 3 3 3 2\n",
- "7 4 4 4 2\n",
- "8 5 5 5 3\n",
- "9 5 5 5 3"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "data= pd.DataFrame({'a': (1, 2, 3, 4, 5), \n",
- " 'b': (1, 2, 3, 4, 5),\n",
- " 'c': (1, 2, 3, 4, 5),\n",
- " 'target': (1, 1, 2, 2, 3)})\n",
- "\n",
- "data = pd.concat((data, data), \n",
- " axis=0).sort_values('target').reset_index(drop=True)\n",
- "\n",
- "x = data[[c for c in data if c != 'target']]\n",
- "y = data['target']\n",
- "\n",
- "data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "from incremental_trees.trees import StreamingRFC\n",
- "\n",
- "srfc = StreamingRFC()\n",
- "srfc.partial_fit(x[0:7], y[0:7], # No 3s\n",
- " classes=y.unique())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "srfc.partial_fit(x, y)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/mnt/s/OneDrive/Matlab/dask tests/IncrementalTrees/incremental_trees/trees.py:196: RuntimeWarning: invalid value encountered in true_divide\n",
- " norm_prob = preds / counts\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "array([1, 1, 1, 1, 2, 2, 2, 2, 3, 3])"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "srfc.predict(x)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.7"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/notes/PerformanceComparisons.ipynb b/notes/PerformanceComparisons.ipynb
deleted file mode 100644
index 2c61669..0000000
--- a/notes/PerformanceComparisons.ipynb
+++ /dev/null
@@ -1,860 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Performamce comparison\n",
- "\n",
- "In memory, no dask."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Change dir to repo root if running from repo (rather than pip installed)\n",
- "# (Assuming running from [repo]/notes/)\n",
- "import os\n",
- "os.chdir('../')\n",
- "\n",
- "%load_ext autoreload\n",
- "%autoreload 2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "import matplotlib.pyplot as plt\n",
- "import numpy as np\n",
- "\n",
- "from typing import Tuple\n",
- "\n",
- "from incremental_trees.trees import StreamingRFC\n",
- "\n",
- "from sklearn.ensemble import RandomForestClassifier\n",
- "from sklearn.datasets import make_blobs\n",
- "from sklearn.model_selection import train_test_split\n",
- "from sklearn.metrics import roc_auc_score"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Settings\n",
- "MAX_ESTIMATORS = 120 # Lower to run faster"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Synthetic data\n",
- "\n",
- "20000 samples, 2 classes, 40 features."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "x, y = make_blobs(n_samples=20000,\n",
- " centers=2,\n",
- " cluster_std=100,\n",
- " n_features=40,\n",
- " random_state=0)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Default params"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Standard random forest"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "def score(mod, \n",
- " train: Tuple[np.array, np.array],\n",
- " test: Tuple[np.array, np.array],\n",
- " pr=False) -> Tuple[float, float]:\n",
- " \"\"\"\n",
- " Return ROC auc on x_train and x_test (from caller) on mod. Print if requested.\n",
- " \"\"\"\n",
- " y_pred_train_proba = mod.predict_proba(train[0])[:, 1]\n",
- " y_pred_test_proba = mod.predict_proba(test[0])[:, 1]\n",
- "\n",
- " roc_train = roc_auc_score(train[1], y_pred_train_proba)\n",
- " roc_test = roc_auc_score(test[1], y_pred_test_proba)\n",
- " if pr:\n",
- " print(f\"n_ests: {len(rfc.estimators_)}\")\n",
- " print(f'Train AUC: {roc_train}')\n",
- " print(f'Test AUC: {roc_test}')\n",
- " \n",
- " return roc_train, roc_test\n",
- "\n",
- "\n",
- "def inc_fit(x: np.array, y: np.array,\n",
- " rfc=None,\n",
- " steps=np.arange(1, 101, 2),\n",
- " sample: int=1):\n",
- " \"\"\"\n",
- " Fit a random forest model with an increasing number of estimators.\n",
- " \n",
- " Uses .fit with warm_start=True.\n",
- " \n",
- " :param rfc: RFC model to test. Default = None (use example with default RFC params).\n",
- " If model is supplied, the .n_estimators param will be ignored and managed here.\n",
- " :param steps: Range to iterate over. Sets total number of estimators that will be fit in model\n",
- " after each iteration. Should be range with constant step size.\n",
- " :param sample: Proportion of randomly sampled training data to use on each partial_fit call.\n",
- " If sample = 1, all training data is used on each interation,\n",
- " so should behave as standard random forest. Default = 1 (100%).\n",
- " \"\"\"\n",
- " \n",
- " x_train, x_test, y_train, y_test = train_test_split(x, y, \n",
- " test_size=0.25,\n",
- " random_state=1)\n",
- " \n",
- " if rfc is None:\n",
- " rfc = RandomForestClassifier(warm_start=True)\n",
- " \n",
- " train_scores = []\n",
- " test_scores = []\n",
- " for s in steps:\n",
- " # Fit model with these n ests\n",
- " rfc.set_params(n_estimators=s)\n",
- " rfc.fit(x_train, y_train)\n",
- " \n",
- " tr_score, te_score = score(rfc, \n",
- " train=(x_train, y_train),\n",
- " test=(x_test, y_test),\n",
- " pr=False)\n",
- " train_scores.append(tr_score)\n",
- " test_scores.append(te_score)\n",
- " \n",
- " return train_scores, test_scores\n",
- "\n",
- "\n",
- "def plot_auc(steps, train_scores, test_scores):\n",
- " \"\"\"\n",
- " Plot the train and test auc scores vs total number of model estimators\n",
- " \"\"\"\n",
- " \n",
- " fig = plt.figure(figsize=(4, 4))\n",
- " plt.plot(steps, train_scores)\n",
- " plt.plot(steps, test_scores)\n",
- " plt.xlabel('n_estimators')\n",
- " plt.ylabel('auc')\n",
- " plt.legend(['train', 'test'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: user 18.2 s, sys: 219 ms, total: 18.4 s\n",
- "Wall time: 18.5 s\n",
- "With 119: 1.0 | 0.6327917799469568\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "steps = np.arange(1, MAX_ESTIMATORS, 2)\n",
- "rfc = RandomForestClassifier(warm_start=True)\n",
- "\n",
- "%time train_scores, test_scores = inc_fit(x, y, rfc=rfc, steps=steps)\n",
- "print(f\"With {len(rfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
- "plot_auc(steps, train_scores, test_scores)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Streaming random forest"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "def inc_partial_fit(x: np.ndarray, y:np.ndarray,\n",
- " srfc=None,\n",
- " steps=np.arange(1, 101, 2),\n",
- " sample: int=0.1,\n",
- " **kwargs) -> None:\n",
- " \n",
- " \"\"\"\n",
- " Fit increasing number of estimators using .partial_fit on a subsample of the training data.\n",
- " \n",
- " StreamingRFC.n_estimators: Number of estimators that will be fit in each step. Set from first\n",
- " difference in range (ie. range[1]-range[0])\n",
- " StreamingRFC.max_n_estimators: Limit on number of estimators than will be fit in model. Should >\n",
- " range[-1].\n",
- " \n",
- " :param srfc: StreamingRFC model to test. Default = None (use example with default RFC params).\n",
- " If model is supplied, the .n_estimators param should match the constant range\n",
- " step size.\n",
- " :param steps: Range to iterate over. Sets total number of estimators that will be fit in model\n",
- " after each iteration. Should be range with constant step size.\n",
- " :param sample: Proportion of randomly sampled training data to use on each partial_fit call.\n",
- " If sample = 1, all training data is used on each interation,\n",
- " so should behave as standard random forest. Default = 0.1 (10%)\n",
- " \"\"\"\n",
- " \n",
- " x_train, x_test, y_train, y_test = train_test_split(x, y, \n",
- " test_size=0.25,\n",
- " random_state=1)\n",
- " n_train = x_train.shape[0]\n",
- " \n",
- " if srfc is None:\n",
- " srfc = StreamingRFC(n_estimators_per_chunk=np.diff(steps)[0],\n",
- " max_n_estimators=np.max(steps),\n",
- " **kwargs)\n",
- " \n",
- " train_scores = []\n",
- " test_scores = []\n",
- " for s in steps:\n",
- " \n",
- " use_idx = np.arange(0, n_train)[np.random.randint(low=0, \n",
- " high=n_train, \n",
- " size=int(n_train * sample))]\n",
- " \n",
- " # Fit model with these n ests\n",
- " srfc.partial_fit(x_train[use_idx, :], y_train[use_idx],\n",
- " classes=np.unique(y))\n",
- " \n",
- " tr_score, te_score = score(srfc,\n",
- " train=(x_train, y_train),\n",
- " test=(x_test, y_test),\n",
- " pr=False)\n",
- " train_scores.append(tr_score)\n",
- " test_scores.append(te_score)\n",
- " \n",
- " return train_scores, test_scores"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### As normal random forest\n",
- "1 estimator per full subset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: user 32.2 s, sys: 78.1 ms, total: 32.3 s\n",
- "Wall time: 32.7 s\n",
- "With 119: 0.9999999999999999 | 0.6331369808306709\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "steps = np.arange(1, MAX_ESTIMATORS, 1)\n",
- "srfc = StreamingRFC(n_estimators=1,\n",
- " max_n_estimators=np.max(steps))\n",
- "\n",
- "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=1)\n",
- "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
- "plot_auc(steps, train_scores, test_scores)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Partial random forest\n",
- "1 estimator per 10 % subset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: user 20.1 s, sys: 15.6 ms, total: 20.1 s\n",
- "Wall time: 20.5 s\n",
- "With 119: 0.8225306784087263 | 0.6294571314102564\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "steps = np.arange(1, MAX_ESTIMATORS, 1)\n",
- "srfc = StreamingRFC(n_estimators_per_chunk=1,\n",
- " max_n_estimators=np.max(steps))\n",
- "\n",
- "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=0.1)\n",
- "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
- "plot_auc(steps, train_scores, test_scores)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Partial random forest\n",
- "3 estimators per 10 % subset, but /3 fewer steps"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: user 7.44 s, sys: 31.2 ms, total: 7.47 s\n",
- "Wall time: 7.56 s\n",
- "With 120: 0.8246353545629453 | 0.6307688947683706\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "steps = np.arange(1, MAX_ESTIMATORS, 3)\n",
- "srfc = StreamingRFC(n_estimators_per_chunk=3,\n",
- " max_n_estimators=np.max(steps))\n",
- "\n",
- "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=0.1)\n",
- "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
- "plot_auc(steps, train_scores, test_scores)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Forest of partial decision trees\n",
- "1 estimator per 10 % subset with all features"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: user 22.3 s, sys: 31.2 ms, total: 22.3 s\n",
- "Wall time: 22.6 s\n",
- "With 119: 0.8138937870631217 | 0.6356310672155322\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "steps = np.arange(1, MAX_ESTIMATORS, 1)\n",
- "srfc = StreamingRFC(n_estimators_per_chunk=1,\n",
- " max_n_estimators=np.max(steps),\n",
- " max_features=x.shape[1])\n",
- "\n",
- "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=0.1)\n",
- "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
- "plot_auc(steps, train_scores, test_scores)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Optimised parameters\n",
- "\n",
- "Using a better set of parameters for this dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n",
- "[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 5.1s\n",
- "[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 5.4s\n",
- "[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 6.1s\n",
- "[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 6.6s\n",
- "[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 7.8s\n",
- "[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 9.1s\n",
- "[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 9.9s\n",
- "[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 11.9s\n",
- "[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 12.8s\n",
- "[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 13.7s\n",
- "[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 14.9s\n",
- "[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 16.1s\n",
- "[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 17.2s\n",
- "[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 18.5s\n",
- "[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 20.4s\n",
- "[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 22.0s\n",
- "[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 23.2s\n",
- "[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 25.5s\n",
- "[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 28.5s\n",
- "[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 30.6s finished\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0, 'min_impurity_split': None, 'min_samples_leaf': 60, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 30, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}\n",
- "n_ests: 119\n",
- "Train AUC: 0.8107342572755221\n",
- "Test AUC: 0.6393736367965102\n"
- ]
- }
- ],
- "source": [
- "from sklearn.model_selection import RandomizedSearchCV as RCV\n",
- "\n",
- "grid = RCV(RandomForestClassifier(n_estimators=30, \n",
- " n_jobs=-1),\n",
- " param_distributions={'min_samples_leaf': [1, 2, 10, 30, 60, 120, 240, 480],\n",
- " 'min_samples_split': [2, 10, 30, 60, 120, 240, 480],\n",
- " 'min_impurity_decrease': [0, 0.05, 0.1, 0.2, 0.3]},\n",
- " cv=3,\n",
- " n_iter=100,\n",
- " verbose=10,\n",
- " n_jobs=-1)\n",
- "\n",
- "x_train, x_test, y_train, y_test = train_test_split(x, y, \n",
- " test_size=0.25,\n",
- " random_state=1)\n",
- "\n",
- "grid.fit(x_train, y_train)\n",
- "print(grid.best_estimator_.get_params(deep=True))\n",
- "\n",
- "tr_score, te_score = score(grid,\n",
- " train=(x_train, y_train),\n",
- " test=(x_test, y_test),\n",
- " pr=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'bootstrap': True,\n",
- " 'class_weight': None,\n",
- " 'criterion': 'gini',\n",
- " 'max_depth': None,\n",
- " 'max_features': 'auto',\n",
- " 'max_leaf_nodes': None,\n",
- " 'min_impurity_decrease': 0,\n",
- " 'min_impurity_split': None,\n",
- " 'min_samples_leaf': 60,\n",
- " 'min_samples_split': 2,\n",
- " 'min_weight_fraction_leaf': 0.0,\n",
- " 'oob_score': False,\n",
- " 'random_state': None,\n",
- " 'verbose': 0}"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "params = grid.best_estimator_.get_params()\n",
- "params.pop('warm_start', None)\n",
- "params.pop('n_jobs', None)\n",
- "params.pop('n_estimators', None)\n",
- "params"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Standard random forest"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: user 5.44 s, sys: 46.9 ms, total: 5.48 s\n",
- "Wall time: 5.55 s\n",
- "With 111: 0.8330460502886542 | 0.649373182395347\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "steps = np.arange(1, MAX_ESTIMATORS, 10)\n",
- "\n",
- "rfc = RandomForestClassifier(warm_start=True,\n",
- " **params)\n",
- "\n",
- "%time train_scores, test_scores = inc_fit(x, y, rfc=rfc, steps=steps)\n",
- "print(f\"With {len(rfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
- "plot_auc(steps, train_scores, test_scores)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### As normal random forest\n",
- "1 estimator per full subset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: user 24.4 s, sys: 0 ns, total: 24.4 s\n",
- "Wall time: 24.6 s\n",
- "With 119: 0.82665000402489 | 0.6500849442174572\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "steps = np.arange(1, MAX_ESTIMATORS, 1)\n",
- "srfc = StreamingRFC(n_estimators_per_chunk=1,\n",
- " max_n_estimators=np.max(steps),\n",
- " **params)\n",
- "\n",
- "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=1)\n",
- "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
- "plot_auc(steps, train_scores, test_scores)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Partial random forest\n",
- "1 estimator per 10 % subset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: user 16.4 s, sys: 15.6 ms, total: 16.4 s\n",
- "Wall time: 16.7 s\n",
- "With 119: 0.6795318199557177 | 0.6549408766486443\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "steps = np.arange(1, MAX_ESTIMATORS, 1)\n",
- "srfc = StreamingRFC(n_estimators=1,\n",
- " max_n_estimators=np.max(steps),\n",
- " **params)\n",
- "\n",
- "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=0.1)\n",
- "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
- "plot_auc(steps, train_scores, test_scores)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Partial random forest\n",
- "3 estimators per 10 % subset, but /3 fewer steps"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: user 2.19 s, sys: 0 ns, total: 2.19 s\n",
- "Wall time: 2.2 s\n",
- "With 40: 0.6577470048702592 | 0.6314094564082084\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "steps = np.arange(1, MAX_ESTIMATORS, 3)\n",
- "srfc = StreamingRFC(n_estimators=3,\n",
- " max_n_estimators=np.max(steps),\n",
- " **params)\n",
- "\n",
- "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=0.1)\n",
- "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
- "plot_auc(steps, train_scores, test_scores)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Forest of partial decision trees\n",
- "1 estimator per 10 % subset with all features"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'auto'"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "params.pop('max_features', None)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: user 18.7 s, sys: 46.9 ms, total: 18.8 s\n",
- "Wall time: 19.1 s\n",
- "With 119: 0.6836834655810746 | 0.6486341405033996\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "steps = np.arange(1, MAX_ESTIMATORS, 1)\n",
- "srfc = StreamingRFC(n_estimators=1,\n",
- " max_n_estimators=np.max(steps),\n",
- " max_features=x.shape[1],\n",
- " **params)\n",
- "\n",
- "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=0.1)\n",
- "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
- "plot_auc(steps, train_scores, test_scores)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.7"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/notes/PerformanceComparisonsDask.ipynb b/notes/PerformanceComparisonsDask.ipynb
deleted file mode 100644
index a72abe7..0000000
--- a/notes/PerformanceComparisonsDask.ipynb
+++ /dev/null
@@ -1,683 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Performance comparisons\n",
- "\n",
- "In memory and out of memory, using dask."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Change dir to repo root if running from repo (rather than pip installed)\n",
- "# (Assuming running from [repo]/notes/)\n",
- "import os\n",
- "os.chdir('../')\n",
- "\n",
- "%load_ext autoreload\n",
- "%autoreload 2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "import matplotlib.pyplot as plt\n",
- "import numpy as np\n",
- "\n",
- "from typing import Tuple\n",
- "\n",
- "from incremental_trees.trees import StreamingRFC\n",
- "\n",
- "from sklearn.ensemble import RandomForestClassifier\n",
- "from sklearn.datasets import make_blobs\n",
- "from sklearn.model_selection import train_test_split\n",
- "from sklearn.metrics import roc_auc_score\n",
- "\n",
- "import dask_ml\n",
- "import dask_ml.datasets\n",
- "from dask_ml.wrappers import Incremental\n",
- "from dask.distributed import Client, LocalCluster\n",
- "from dask_ml.model_selection import train_test_split as dask_tts\n",
- "\n",
- "import dask as dd\n",
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Settings\n",
- "MAX_ESTIMATORS = 60 # Lower to run faster"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n"
- ]
- }
- ],
- "source": [
- "# Prepare dask cluster\n",
- "cluster = LocalCluster(processes=False,\n",
- " n_workers=2,\n",
- " threads_per_worker=2,\n",
- " scheduler_port=8383,\n",
- " diagnostics_port=8484)\n",
- "client = Client(cluster)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Synthetic data, in memory\n",
- "\n",
- "Compare increasing estimators with RandomForest (using warm_start) against Incremental StreamingRFC (dask handles .partial_fit).\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n"
- ]
- }
- ],
- "source": [
- "x, y = dask_ml.datasets.make_blobs(n_samples=1e5,\n",
- " chunks=1e4,\n",
- " random_state=0,\n",
- " n_features=40,\n",
- " centers=2,\n",
- " cluster_std=100)\n",
- "\n",
- "x_dd = dd.dataframe.from_array(x, \n",
- " chunksize=1e4)\n",
- "y_dd = dd.dataframe.from_array(y,\n",
- " chunksize=1e4)\n",
- "\n",
- "x_pd = pd.DataFrame(x.persist().compute())\n",
- "y_pd = pd.DataFrame(y.persist().compute())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "30.517654418945312"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "x_pd.memory_usage(deep=True).sum() / 1024 /1024"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Standard random forest"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n"
- ]
- }
- ],
- "source": [
- "def score(mod, \n",
- " train: Tuple[np.array, np.array],\n",
- " test: Tuple[np.array, np.array],\n",
- " pr=False) -> Tuple[float, float]:\n",
- " \"\"\"\n",
- " Return ROC auc on x_train and x_test (from caller) on mod. Print if requested.\n",
- " \"\"\"\n",
- " y_pred_train_proba = mod.predict_proba(train[0])[:, 1]\n",
- " y_pred_test_proba = mod.predict_proba(test[0])[:, 1]\n",
- "\n",
- " roc_train = roc_auc_score(train[1], y_pred_train_proba)\n",
- " roc_test = roc_auc_score(test[1], y_pred_test_proba)\n",
- " if pr:\n",
- " print(f\"n_ests: {len(rfc.estimators_)}\")\n",
- " print(f'Train AUC: {roc_train}')\n",
- " print(f'Test AUC: {roc_test}')\n",
- " \n",
- " return roc_train, roc_test\n",
- "\n",
- "\n",
- "def score_dask(mod, \n",
- " train: Tuple[np.array, np.array],\n",
- " test: Tuple[np.array, np.array],\n",
- " pr=False) -> Tuple[float, float]:\n",
- " \"\"\"\n",
- " Score model using available dask metric (accuracy)\n",
- " \"\"\"\n",
- " roc_train = mod.score(train[0], train[1])\n",
- " roc_test = mod.score(test[0], test[1])\n",
- " if pr:\n",
- " print(f\"n_ests: {len(rfc.estimators_)}\")\n",
- " print(f'Train AUC: {roc_train}')\n",
- " print(f'Test AUC: {roc_test}')\n",
- " \n",
- " return roc_train, roc_test\n",
- "\n",
- "\n",
- "def multiple_fit(x: np.array, y: np.array,\n",
- " steps=np.arange(1, 101, 2),\n",
- " sample: int=1):\n",
- " \"\"\"\n",
- " Fit a random forest model with an increasing number of estimators.\n",
- " \n",
- " This version doesn't use warm start and refits the model from scratch each iteration.\n",
- " This is for the sake of comparing timings to dask function below.\n",
- " \n",
- " :param steps: Range to iterate over. Sets total number of estimators that will be fit in model\n",
- " after each iteration. Should be range with constant step size.\n",
- " :param sample: Proportion of randomly sampled training data to use on each partial_fit call.\n",
- " If sample = 1, all training data is used on each interation,\n",
- " so should behave as standard random forest. Default = 1 (100%).\n",
- " \"\"\"\n",
- " \n",
- " x_train, x_test, y_train, y_test = train_test_split(x, y, \n",
- " test_size=0.25,\n",
- " random_state=1)\n",
- " \n",
- " train_scores = []\n",
- " test_scores = []\n",
- " for s in steps:\n",
- " \n",
- " # Fit full model on each iteration\n",
- " rfc = RandomForestClassifier(warm_start=False)\n",
- " \n",
- " # Fit model with these n ests\n",
- " rfc.set_params(n_estimators=s)\n",
- " rfc.fit(x_train, y_train)\n",
- " \n",
- " tr_score, te_score = score(rfc, \n",
- " train=(x_train, y_train),\n",
- " test=(x_test, y_test),\n",
- " pr=False)\n",
- " \n",
- " train_scores.append(tr_score)\n",
- " test_scores.append(te_score)\n",
- " \n",
- " return rfc, train_scores, test_scores\n",
- "\n",
- "\n",
- "def plot_auc(steps, train_scores, test_scores):\n",
- " \"\"\"\n",
- " Plot the train and test auc scores vs total number of model estimators\n",
- " \"\"\"\n",
- " \n",
- " fig = plt.figure(figsize=(4, 4))\n",
- " plt.plot(steps, train_scores)\n",
- " plt.plot(steps, test_scores)\n",
- " plt.xlabel('n_estimators')\n",
- " plt.ylabel('auc')\n",
- " plt.legend(['train', 'test'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "steps = np.arange(1, MAX_ESTIMATORS, 4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: user 3min 30s, sys: 1.02 s, total: 3min 31s\n",
- "Wall time: 3min 37s\n",
- "With 57: 1.0 | 0.6337612567122703\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "rfc = RandomForestClassifier(warm_start=True)\n",
- "\n",
- "%time rfc, train_scores, test_scores = multiple_fit(x_pd.values, y_pd.values.squeeze(), steps=steps)\n",
- "\n",
- "print(f\"With {len(rfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
- "plot_auc(steps, train_scores, test_scores)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Single incremental forest specs"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "((7500, 7500, 7500, 7500, 7500, 7500, 7500, 7500, 7500, 7500), (40,))"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "x_train, x_test, y_train, y_test = dask_tts(x, y, \n",
- " test_size=0.25)\n",
- "\n",
- "x_train.chunks"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Incremental forest\n",
- "1 estimator per subset, 10 % per chunk, 1 pass through data.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "distributed.comm.tcp - WARNING - Could not set timeout on TCP stream: [Errno 92] Protocol not available\n",
- "/mnt/s/OneDrive/Matlab/dask tests/IncrementalTrees/incremental_trees/trees.py:199: RuntimeWarning: invalid value encountered in true_divide\n",
- " norm_prob = preds / counts\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "n_ests: 57\n",
- "Train AUC: 0.6658850266643547\n",
- "Test AUC: 0.566714711139625\n"
- ]
- }
- ],
- "source": [
- "srfc = Incremental(StreamingRFC(n_estimators_per_chunk=1,\n",
- " max_n_estimators=np.inf))\n",
- "\n",
- "srfc.fit(x_train, y_train,\n",
- " classes=[0, 1])\n",
- "\n",
- "tr_score, te_score = score(srfc, \n",
- " train=(x_train, y_train),\n",
- " test=(x_test, y_test),\n",
- " pr=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Incremental forest\n",
- "20 estimators per subset (different features), 10 % per chunk, 1 pass through data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "n_ests: 57\n",
- "Train AUC: 0.8403617676637958\n",
- "Test AUC: 0.6507955222895951\n"
- ]
- }
- ],
- "source": [
- "srfc = Incremental(StreamingRFC(n_estimators_per_chunk=20,\n",
- " max_n_estimators=np.inf))\n",
- "\n",
- "srfc.fit(x_train, y_train,\n",
- " classes=[0, 1])\n",
- "\n",
- "tr_score, te_score = score(srfc, \n",
- " train=(x_train, y_train),\n",
- " test=(x_test, y_test),\n",
- " pr=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Forest of partial decision trees\n",
- "1 estimator per subset with all features, 10 % per chunk, 1 pass through data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "n_ests: 57\n",
- "Train AUC: 0.6702321483770426\n",
- "Test AUC: 0.5732644847212355\n"
- ]
- }
- ],
- "source": [
- "srfc = Incremental(StreamingRFC(n_estimators_per_chunk=1,\n",
- " max_n_estimators=np.max(steps),\n",
- " max_features=x.shape[1]))\n",
- "\n",
- "srfc.fit(x_train, y_train,\n",
- " classes=[0, 1])\n",
- "\n",
- "tr_score, te_score = score(srfc, \n",
- " train=(x_train, y_train),\n",
- " test=(x_test, y_test),\n",
- " pr=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Forest of partial decision trees\n",
- "20 estimator per subset with all features, 10 % per chunk, 1 pass through data.\n",
- "\n",
- "Extra estimators shouldn't help here?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "n_ests: 57\n",
- "Train AUC: 0.7542754850739607\n",
- "Test AUC: 0.6273808721369764\n"
- ]
- }
- ],
- "source": [
- "srfc = Incremental(StreamingRFC(n_estimators_per_chunk=20,\n",
- " max_n_estimators=np.max(steps),\n",
- " max_features=x.shape[1]))\n",
- "\n",
- "srfc.fit(x_train, y_train,\n",
- " classes=[0, 1])\n",
- "\n",
- "tr_score, te_score = score(srfc, \n",
- " train=(x_train, y_train),\n",
- " test=(x_test, y_test),\n",
- " pr=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### n estimators per chunk vs performance\n",
- "\n",
- "Effect of increasing estimators per subset (with different set ups)\n",
- "\n",
- "Function here add Incremental to supplied model, and uses .fit to refit the full model in each iteration.\n",
- "\n",
- "The other functions (above and in PerformanceComparisons.ipynb) do incremental fits using warm start (either directly or via .partial_fit). \n",
- "\n",
- "This means the timing information cannot be directly compared!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [],
- "source": [
- "def multiple_dask_fit(x: np.ndarray, y:np.ndarray,\n",
- " steps=np.arange(1, 101, 2),\n",
- " **kwargs) -> None:\n",
- " \n",
- " \"\"\"\n",
- " Fit increasing number of estimators using .partial_fit on a subsample of the training data.\n",
- " Uses Dask by adding Incremental to model and calling fit. This refits the whole model one each\n",
- " iteration, so will be slower than the other test functions. Timing this function can only be compared\n",
- " to other calls of this function.\n",
- " \n",
- " The data passed to the Random forest fit by partial_fit is handled by dask and is sequential batches\n",
- " of data, rather than random samples (as used by inc_partial_fit in PerformanceComparisons.ipynb).\n",
- " \n",
- " StreamingRFC.n_estimators: Number of estimators that will be fit in each step. Set from first\n",
- " difference in range (ie. range[1]-range[0])\n",
- " StreamingRFC.max_n_estimators: Limit on number of estimators than will be fit in model. Should >\n",
- " range[-1].\n",
- " \n",
- " :param steps: Range to iterate over. Sets total number of estimators that will be fit in model\n",
- " after each iteration. Should be range with constant step size.\n",
- " \"\"\"\n",
- " \n",
- " \n",
- " x_train, x_test, y_train, y_test = dask_tts(x, y, \n",
- " test_size=0.25)\n",
- " \n",
- " n_train = x_train.shape[0]\n",
- " \n",
- " train_scores = []\n",
- " test_scores = []\n",
- " for s in steps:\n",
- " \n",
- " # Create fresh model each iteration\n",
- " srfc_ = StreamingRFC(n_estimators_per_chunk=s,\n",
- " max_n_estimators=np.inf,\n",
- " **kwargs)\n",
- " \n",
- " \n",
- " # Add Incremental\n",
- " srfc_ = Incremental(srfc_)\n",
- " \n",
- " # Fit model with these n ests\n",
- " # From scratch each time\n",
- " srfc_.fit(x_train, y_train,\n",
- " classes=[0, 1])\n",
- " \n",
- " tr_score, te_score = score(srfc_,\n",
- " train=(x_train, y_train),\n",
- " test=(x_test, y_test),\n",
- " pr=False)\n",
- " train_scores.append(tr_score)\n",
- " test_scores.append(te_score)\n",
- " \n",
- " return srfc_, train_scores, test_scores"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Incremental forest\n",
- "*range* estimators per subset (different features), 10 % per chunk, 1 pass through data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "ename": "RuntimeError",
- "evalue": "Cannot clone object StreamingRFC(bootstrap=True, class_weight=None, criterion='gini',\n max_depth=None, max_features='auto', max_leaf_nodes=None,\n max_n_estimators=inf, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=1,\n n_estimators_per_chunk=1, n_jobs=None, oob_score=False,\n random_state=None, verbose=0, warm_start=True), as the constructor either does not set or modifies parameter n_estimators",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n",
- "\u001b[0;32m\u001b[0m in \u001b[0;36mmultiple_dask_fit\u001b[0;34m(x, y, steps, **kwargs)\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0;31m# From scratch each time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 44\u001b[0m srfc_.fit(x_train, y_train,\n\u001b[0;32m---> 45\u001b[0;31m classes=[0, 1])\n\u001b[0m\u001b[1;32m 46\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m tr_score, te_score = score(srfc_,\n",
- "\u001b[0;32m/mnt/s/OneDrive/Matlab/dask tests/IncrementalTrees/pc_env_linux/lib/python3.6/site-packages/dask_ml/wrappers.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_kwargs)\u001b[0m\n\u001b[1;32m 461\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 462\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 463\u001b[0;31m \u001b[0mestimator\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbase\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 464\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fit_for_estimator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 465\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/mnt/s/OneDrive/Matlab/dask tests/IncrementalTrees/pc_env_linux/lib/python3.6/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36mclone\u001b[0;34m(estimator, safe)\u001b[0m\n\u001b[1;32m 71\u001b[0m raise RuntimeError('Cannot clone object %s, as the constructor '\n\u001b[1;32m 72\u001b[0m \u001b[0;34m'either does not set or modifies parameter %s'\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m (estimator, name))\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mnew_object\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mRuntimeError\u001b[0m: Cannot clone object StreamingRFC(bootstrap=True, class_weight=None, criterion='gini',\n max_depth=None, max_features='auto', max_leaf_nodes=None,\n max_n_estimators=inf, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=1,\n n_estimators_per_chunk=1, n_jobs=None, oob_score=False,\n random_state=None, verbose=0, warm_start=True), as the constructor either does not set or modifies parameter n_estimators"
- ]
- },
- {
- "ename": "NameError",
- "evalue": "name 'final_est' is not defined",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'time'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'final_est, train_scores, test_scores = multiple_dask_fit(x, y, steps=steps)'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"With {len(final_est.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mplot_auc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_scores\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_scores\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mNameError\u001b[0m: name 'final_est' is not defined"
- ]
- }
- ],
- "source": [
- "steps = np.arange(1, MAX_ESTIMATORS, 6)\n",
- "\n",
- "%time final_est, train_scores, test_scores = multiple_dask_fit(x, y, steps=steps)\n",
- "print(f\"With {len(final_est.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
- "plot_auc(steps, train_scores, test_scores)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Forest of partial decision trees\n",
- "*range* estimators per subset with all features, 10 % per chunk, 1 pass through data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "steps = np.arange(1, MAX_ESTIMATORS, 6)\n",
- "\n",
- "%time final_est, train_scores, test_scores = multiple_dask_fit(x, y, steps=steps, max_features=x.shape[1])\n",
- "print(f\"With {len(final_est.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
- "plot_auc(steps, train_scores, test_scores)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.7"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/requirements.txt b/requirements.txt
index 0ce12d8..30de622 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,8 @@
-scikit-learn>=0.22
-pandas
+scikit-learn==1.2
numpy
-dask>=2
+dask==2022.12
dask-glm==0.2.0
-dask-ml>=1
-distributed>=2
+dask-ml==2022.5.27
+distributed==2022.12
bokeh
-pytest
-jupyter
-jupyterlab
-ipykernel
-matplotlib
fsspec
diff --git a/notes/EquivRows.ipynb b/scripts/EquivRows.ipynb
similarity index 70%
rename from notes/EquivRows.ipynb
rename to scripts/EquivRows.ipynb
index 6252378..4cbf6ba 100644
--- a/notes/EquivRows.ipynb
+++ b/scripts/EquivRows.ipynb
@@ -3,7 +3,11 @@
{
"cell_type": "code",
"execution_count": 1,
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [],
"source": [
"# Change dir to repo root if running from repo (rather than pip installed)\n",
@@ -17,46 +21,48 @@
},
{
"cell_type": "code",
- "execution_count": 5,
- "metadata": {},
+ "execution_count": 4,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [],
"source": [
- "import matplotlib.pyplot as plt\n",
"import numpy as np\n",
- "import math\n",
"\n",
- "from typing import Tuple\n",
"\n",
"from incremental_trees.trees import StreamingRFC\n",
"\n",
- "from sklearn.ensemble import RandomForestClassifier\n",
- "from sklearn.datasets import make_blobs\n",
- "from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import roc_auc_score\n",
- "from sklearn.ensemble.forest import RandomForestClassifier\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.datasets import load_breast_cancer\n",
"from sklearn.model_selection import train_test_split\n",
- "from sklearn.linear_model import LogisticRegression\n",
- "from sklearn.metrics.classification import classification_report\n",
- "from sklearn.base import clone"
+ "from sklearn.metrics import classification_report"
]
},
{
"cell_type": "code",
- "execution_count": 6,
- "metadata": {},
+ "execution_count": 5,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [],
"source": [
"x, y = load_breast_cancer(return_X_y=True)\n",
- "x_train, x_test, y_train, y_test = train_test_split(x, y,\n",
- " test_size=0.25,\n",
- " random_state=123)"
+ "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=123)"
]
},
{
"cell_type": "code",
- "execution_count": 24,
- "metadata": {},
+ "execution_count": 6,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [],
"source": [
"def fit_srfc(srfc, x, y,\n",
@@ -123,7 +129,11 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
"source": [
"# 10 full trees vs equivilents\n",
"RFC: 10 tress with 100%\n",
@@ -134,8 +144,12 @@
},
{
"cell_type": "code",
- "execution_count": 32,
- "metadata": {},
+ "execution_count": 7,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -143,15 +157,15 @@
"text": [
" precision recall f1-score support\n",
"\n",
- " 0 1.00 0.98 0.99 54\n",
- " 1 0.99 1.00 0.99 89\n",
+ " 0 0.96 0.98 0.97 54\n",
+ " 1 0.99 0.98 0.98 89\n",
"\n",
- " micro avg 0.99 0.99 0.99 143\n",
- " macro avg 0.99 0.99 0.99 143\n",
- "weighted avg 0.99 0.99 0.99 143\n",
+ " accuracy 0.98 143\n",
+ " macro avg 0.98 0.98 0.98 143\n",
+ "weighted avg 0.98 0.98 0.98 143\n",
"\n",
- "Train AUC: 0.9998583034196108\n",
- "Test AUC: 0.9886600083229296\n"
+ "Train AUC: 0.9999763839032684\n",
+ "Test AUC: 0.9888680815647108\n"
]
}
],
@@ -163,8 +177,12 @@
},
{
"cell_type": "code",
- "execution_count": 33,
- "metadata": {},
+ "execution_count": 8,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -172,15 +190,15 @@
"text": [
" precision recall f1-score support\n",
"\n",
- " 0 0.91 0.91 0.91 54\n",
- " 1 0.94 0.94 0.94 89\n",
+ " 0 0.98 0.94 0.96 54\n",
+ " 1 0.97 0.99 0.98 89\n",
"\n",
- " micro avg 0.93 0.93 0.93 143\n",
- " macro avg 0.93 0.93 0.93 143\n",
- "weighted avg 0.93 0.93 0.93 143\n",
+ " accuracy 0.97 143\n",
+ " macro avg 0.97 0.97 0.97 143\n",
+ "weighted avg 0.97 0.97 0.97 143\n",
"\n",
- "Train AUC: 0.9814495560173814\n",
- "Test AUC: 0.9889721181856014\n"
+ "Train AUC: 0.9830554505951257\n",
+ "Test AUC: 0.987411568872243\n"
]
}
],
@@ -196,7 +214,11 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
"source": [
"## vs SRFC: 100 x 1 x 0.1 vs 10\n",
"100 fits with 1 tree on 10% of data each"
@@ -204,8 +226,12 @@
},
{
"cell_type": "code",
- "execution_count": 34,
- "metadata": {},
+ "execution_count": 9,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -213,15 +239,15 @@
"text": [
" precision recall f1-score support\n",
"\n",
- " 0 0.98 0.93 0.95 54\n",
- " 1 0.96 0.99 0.97 89\n",
+ " 0 0.98 0.96 0.97 54\n",
+ " 1 0.98 0.99 0.98 89\n",
"\n",
- " micro avg 0.97 0.97 0.97 143\n",
- " macro avg 0.97 0.96 0.96 143\n",
- "weighted avg 0.97 0.97 0.96 143\n",
+ " accuracy 0.98 143\n",
+ " macro avg 0.98 0.98 0.98 143\n",
+ "weighted avg 0.98 0.98 0.98 143\n",
"\n",
- "Train AUC: 0.9757580767050822\n",
- "Test AUC: 0.9916770703287556\n"
+ "Train AUC: 0.975769884753448\n",
+ "Test AUC: 0.9938618393674573\n"
]
}
],
@@ -237,7 +263,11 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
"source": [
"## vs SRFC: 100 x 10 x 0.1 vs 10\n",
"100 fits with 1 tree on 10% of data each"
@@ -245,8 +275,12 @@
},
{
"cell_type": "code",
- "execution_count": 35,
- "metadata": {},
+ "execution_count": 10,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -254,15 +288,15 @@
"text": [
" precision recall f1-score support\n",
"\n",
- " 0 0.98 0.83 0.90 54\n",
- " 1 0.91 0.99 0.95 89\n",
+ " 0 1.00 0.93 0.96 54\n",
+ " 1 0.96 1.00 0.98 89\n",
"\n",
- " micro avg 0.93 0.93 0.93 143\n",
- " macro avg 0.94 0.91 0.92 143\n",
- "weighted avg 0.93 0.93 0.93 143\n",
+ " accuracy 0.97 143\n",
+ " macro avg 0.98 0.96 0.97 143\n",
+ "weighted avg 0.97 0.97 0.97 143\n",
"\n",
- "Train AUC: 0.967846684300019\n",
- "Test AUC: 0.9850187265917603\n"
+ "Train AUC: 0.990459096920461\n",
+ "Test AUC: 0.985538909696213\n"
]
}
],
@@ -278,7 +312,11 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
"source": [
"## vs SRFC: 100 x 1 x 0.1 vs 10 (all features per tree)\n",
"100 fits with 1 tree on 10% of data each"
@@ -286,8 +324,12 @@
},
{
"cell_type": "code",
- "execution_count": 49,
- "metadata": {},
+ "execution_count": 11,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -295,15 +337,15 @@
"text": [
" precision recall f1-score support\n",
"\n",
- " 0 0.98 0.93 0.95 54\n",
- " 1 0.96 0.99 0.97 89\n",
+ " 0 1.00 0.91 0.95 54\n",
+ " 1 0.95 1.00 0.97 89\n",
"\n",
- " micro avg 0.97 0.97 0.97 143\n",
- " macro avg 0.97 0.96 0.96 143\n",
+ " accuracy 0.97 143\n",
+ " macro avg 0.97 0.95 0.96 143\n",
"weighted avg 0.97 0.97 0.96 143\n",
"\n",
- "Train AUC: 0.9817801813716229\n",
- "Test AUC: 0.9847066167290888\n"
+ "Train AUC: 0.9882155677309654\n",
+ "Test AUC: 0.987411568872243\n"
]
}
],
@@ -320,7 +362,11 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
"source": [
"## vs SRFC: 33 x 3 x 0.1 vs 10 (sampled features per tree)\n",
"100 fits with 1 tree on 10% of data each"
@@ -328,8 +374,12 @@
},
{
"cell_type": "code",
- "execution_count": 48,
- "metadata": {},
+ "execution_count": 12,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -337,15 +387,15 @@
"text": [
" precision recall f1-score support\n",
"\n",
- " 0 0.98 0.98 0.98 54\n",
- " 1 0.99 0.99 0.99 89\n",
+ " 0 1.00 0.94 0.97 54\n",
+ " 1 0.97 1.00 0.98 89\n",
"\n",
- " micro avg 0.99 0.99 0.99 143\n",
- " macro avg 0.99 0.99 0.99 143\n",
- "weighted avg 0.99 0.99 0.99 143\n",
+ " accuracy 0.98 143\n",
+ " macro avg 0.98 0.97 0.98 143\n",
+ "weighted avg 0.98 0.98 0.98 143\n",
"\n",
- "Train AUC: 0.9844606083506519\n",
- "Test AUC: 0.9887640449438202\n"
+ "Train AUC: 0.9930096353674664\n",
+ "Test AUC: 0.9985434873075323\n"
]
}
],
@@ -362,7 +412,11 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [],
"source": []
}
@@ -388,4 +442,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/scripts/InconsistentClasses.ipynb b/scripts/InconsistentClasses.ipynb
new file mode 100644
index 0000000..cf3af75
--- /dev/null
+++ b/scripts/InconsistentClasses.ipynb
@@ -0,0 +1,141 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n",
+ "is_executing": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Change dir to repo root if running from repo (rather than pip installed)\n",
+ "# (Assuming running from [repo]/scripts/)\n",
+ "import os\n",
+ "os.chdir('../')\n",
+ "\n",
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n",
+ "is_executing": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n",
+ "is_executing": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "data= pd.DataFrame({'a': (1, 2, 3, 4, 5), \n",
+ " 'b': (1, 2, 3, 4, 5),\n",
+ " 'c': (1, 2, 3, 4, 5),\n",
+ " 'target': (1, 1, 2, 2, 3)})\n",
+ "\n",
+ "data = pd.concat((data, data), axis=0).sort_values('target').reset_index(drop=True)\n",
+ "\n",
+ "x = data[[c for c in data if c != 'target']]\n",
+ "y = data['target']\n",
+ "\n",
+ "data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n",
+ "is_executing": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from incremental_trees.trees import StreamingRFC\n",
+ "\n",
+ "srfc = StreamingRFC()\n",
+ "srfc.partial_fit(x[0:3], y[0:3], # No 3s\n",
+ " classes=y.unique())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n",
+ "is_executing": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "srfc.partial_fit(x, y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n",
+ "is_executing": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "srfc.predict(x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "outputs": [],
+ "source": [],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "name": "python3",
+ "language": "python",
+ "display_name": "Python 3 (ipykernel)"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
diff --git a/scripts/PerformanceComparisons.ipynb b/scripts/PerformanceComparisons.ipynb
new file mode 100644
index 0000000..f54130d
--- /dev/null
+++ b/scripts/PerformanceComparisons.ipynb
@@ -0,0 +1,985 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "# Performance comparison\n",
+ "\n",
+ "In memory, no dask."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Change dir to repo root if running from repo (rather than pip installed)\n",
+ "# (Assuming running from [repo]/scripts/)\n",
+ "import os\n",
+ "os.chdir('../')\n",
+ "\n",
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "\n",
+ "from typing import Tuple\n",
+ "\n",
+ "from incremental_trees.trees import StreamingRFC\n",
+ "\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.datasets import make_blobs\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import roc_auc_score"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Settings\n",
+ "MAX_ESTIMATORS = 120 # Lower to run faster"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "# Synthetic data\n",
+ "\n",
+ "20000 samples, 2 classes, 40 features."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "x, y = make_blobs(\n",
+ " n_samples=20000,\n",
+ " centers=2,\n",
+ " cluster_std=100,\n",
+ " n_features=40,\n",
+ " random_state=0\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Default params"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Standard random forest"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def score(mod,\n",
+ " train: Tuple[np.array, np.array],\n",
+ " test: Tuple[np.array, np.array],\n",
+ " pr=False) -> Tuple[float, float]:\n",
+ " \"\"\"\n",
+ " Return ROC auc on x_train and x_test (from caller) on mod. Print if requested.\n",
+ " \"\"\"\n",
+ " y_pred_train_proba = mod.predict_proba(train[0])[:, 1]\n",
+ " y_pred_test_proba = mod.predict_proba(test[0])[:, 1]\n",
+ "\n",
+ " roc_train = roc_auc_score(train[1], y_pred_train_proba)\n",
+ " roc_test = roc_auc_score(test[1], y_pred_test_proba)\n",
+ " if pr:\n",
+ " print(f\"n_ests: {len(rfc.estimators_)}\")\n",
+ " print(f'Train AUC: {roc_train}')\n",
+ " print(f'Test AUC: {roc_test}')\n",
+ "\n",
+ " return roc_train, roc_test\n",
+ "\n",
+ "\n",
+ "def inc_fit(x: np.array, y: np.array, rfc=None, steps=np.arange(1, 101, 2),\n",
+ " sample: int = 1):\n",
+ " \"\"\"\n",
+ " Fit a random forest model with an increasing number of estimators.\n",
+ " \n",
+ " Uses .fit with warm_start=True.\n",
+ " \n",
+ " :param rfc: RFC model to test. Default = None (use example with default RFC params).\n",
+ " If model is supplied, the .n_estimators param will be ignored and managed here.\n",
+ " :param steps: Range to iterate over. Sets total number of estimators that will be fit in model\n",
+ " after each iteration. Should be range with constant step size.\n",
+ " :param sample: Proportion of randomly sampled training data to use on each partial_fit call.\n",
+ " If sample = 1, all training data is used on each iteration,\n",
+ " so should behave as standard random forest. Default = 1 (100%).\n",
+ " \"\"\"\n",
+ "\n",
+ " x_train, x_test, y_train, y_test = train_test_split(\n",
+ " x,\n",
+ " y,\n",
+ " test_size=0.25,\n",
+ " random_state=1\n",
+ " )\n",
+ "\n",
+ " if rfc is None:\n",
+ " rfc = RandomForestClassifier(warm_start=True)\n",
+ "\n",
+ " train_scores = []\n",
+ " test_scores = []\n",
+ " for s in steps:\n",
+ " # Fit model with these n ests\n",
+ " rfc.set_params(n_estimators=s)\n",
+ " rfc.fit(x_train, y_train)\n",
+ "\n",
+ " tr_score, te_score = score(\n",
+ " rfc,\n",
+ " train=(x_train, y_train),\n",
+ " test=(x_test, y_test),\n",
+ " pr=False\n",
+ " )\n",
+ " train_scores.append(tr_score)\n",
+ " test_scores.append(te_score)\n",
+ "\n",
+ " return train_scores, test_scores\n",
+ "\n",
+ "\n",
+ "def plot_auc(steps, train_scores, test_scores):\n",
+ " \"\"\"\n",
+ " Plot the train and test auc scores vs total number of model estimators\n",
+ " \"\"\"\n",
+ "\n",
+ " plt.figure(figsize=(4, 4))\n",
+ " plt.plot(steps, train_scores)\n",
+ " plt.plot(steps, test_scores)\n",
+ " plt.xlabel('n_estimators')\n",
+ " plt.ylabel('auc')\n",
+ " plt.legend(['train', 'test'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: total: 23.8 s\n",
+ "Wall time: 24.1 s\n",
+ "With 119: 1.0 | 0.6391245961589661\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": "",
+ "image/png": "\n"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "steps = np.arange(1, MAX_ESTIMATORS, 2)\n",
+ "rfc = RandomForestClassifier(warm_start=True)\n",
+ "\n",
+ "%time train_scores, test_scores = inc_fit(x, y, rfc=rfc, steps=steps)\n",
+ "print(f\"With {len(rfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
+ "plot_auc(steps, train_scores, test_scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Streaming random forest"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def inc_partial_fit(x: np.ndarray, y:np.ndarray,\n",
+ " srfc=None,\n",
+ " steps=np.arange(1, 101, 2),\n",
+ " sample: int=0.1,\n",
+ " **kwargs) -> None:\n",
+ " \n",
+ " \"\"\"\n",
+ " Fit increasing number of estimators using .partial_fit on a subsample of the training data.\n",
+ " \n",
+ " StreamingRFC.n_estimators: Number of estimators that will be fit in each step. Set from first\n",
+ " difference in range (ie. range[1]-range[0])\n",
+ " StreamingRFC.max_n_estimators: Limit on number of estimators that will be fit in model. Should be >\n",
+ " range[-1].\n",
+ " \n",
+ " :param srfc: StreamingRFC model to test. Default = None (use example with default RFC params).\n",
+ " If model is supplied, the .n_estimators param should match the constant range\n",
+ " step size.\n",
+ " :param steps: Range to iterate over. Sets total number of estimators that will be fit in model\n",
+ " after each iteration. Should be range with constant step size.\n",
+ " :param sample: Proportion of randomly sampled training data to use on each partial_fit call.\n",
+ " If sample = 1, all training data is used on each iteration,\n",
+ " so should behave as standard random forest. Default = 0.1 (10%)\n",
+ " \"\"\"\n",
+ " \n",
+ " x_train, x_test, y_train, y_test = train_test_split(x, y, \n",
+ " test_size=0.25,\n",
+ " random_state=1)\n",
+ " n_train = x_train.shape[0]\n",
+ " \n",
+ " if srfc is None:\n",
+ " srfc = StreamingRFC(n_estimators_per_chunk=np.diff(steps)[0],\n",
+ " max_n_estimators=np.max(steps),\n",
+ " **kwargs)\n",
+ " \n",
+ " train_scores = []\n",
+ " test_scores = []\n",
+ " for s in steps:\n",
+ " \n",
+ " use_idx = np.arange(0, n_train)[np.random.randint(low=0, \n",
+ " high=n_train, \n",
+ " size=int(n_train * sample))]\n",
+ " \n",
+ " # Fit model with these n ests\n",
+ " srfc.partial_fit(x_train[use_idx, :], y_train[use_idx],\n",
+ " classes=np.unique(y))\n",
+ " \n",
+ " tr_score, te_score = score(srfc,\n",
+ " train=(x_train, y_train),\n",
+ " test=(x_test, y_test),\n",
+ " pr=False)\n",
+ " train_scores.append(tr_score)\n",
+ " test_scores.append(te_score)\n",
+ " \n",
+ " return train_scores, test_scores"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### As normal random forest\n",
+ "1 estimator per full subset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n",
+ "C:\\Users\\Gareth\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:425: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n",
+ " warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: total: 41.7 s\n",
+ "Wall time: 41.9 s\n",
+ "With 119: 1.0 | 0.6418562031518801\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": "",
+ "image/png": "\n"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "steps = np.arange(1, MAX_ESTIMATORS, 1)\n",
+ "srfc = StreamingRFC(n_estimators_per_chunk=1,\n",
+ " max_n_estimators=np.max(steps))\n",
+ "\n",
+ "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=1)\n",
+ "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
+ "plot_auc(steps, train_scores, test_scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### Partial random forest\n",
+ "1 estimator per 10 % subset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "steps = np.arange(1, MAX_ESTIMATORS, 1)\n",
+ "srfc = StreamingRFC(n_estimators_per_chunk=1,\n",
+ " max_n_estimators=np.max(steps))\n",
+ "\n",
+ "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=0.1)\n",
+ "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
+ "plot_auc(steps, train_scores, test_scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### Partial random forest\n",
+ "3 estimators per 10 % subset, but with a third as many steps"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "steps = np.arange(1, MAX_ESTIMATORS, 3)\n",
+ "srfc = StreamingRFC(n_estimators_per_chunk=3,\n",
+ " max_n_estimators=np.max(steps))\n",
+ "\n",
+ "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=0.1)\n",
+ "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
+ "plot_auc(steps, train_scores, test_scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### Forest of partial decision trees\n",
+ "1 estimator per 10 % subset with all features"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "steps = np.arange(1, MAX_ESTIMATORS, 1)\n",
+ "srfc = StreamingRFC(n_estimators_per_chunk=1,\n",
+ " max_n_estimators=np.max(steps),\n",
+ " max_features=x.shape[1])\n",
+ "\n",
+ "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=0.1)\n",
+ "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
+ "plot_auc(steps, train_scores, test_scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "# Optimised parameters\n",
+ "\n",
+ "Using a better set of parameters for this dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import RandomizedSearchCV as RCV\n",
+ "\n",
+ "grid = RCV(RandomForestClassifier(n_estimators=30, \n",
+ " n_jobs=-1),\n",
+ " param_distributions={'min_samples_leaf': [1, 2, 10, 30, 60, 120, 240, 480],\n",
+ " 'min_samples_split': [2, 10, 30, 60, 120, 240, 480],\n",
+ " 'min_impurity_decrease': [0, 0.05, 0.1, 0.2, 0.3]},\n",
+ " cv=3,\n",
+ " n_iter=100,\n",
+ " verbose=10,\n",
+ " n_jobs=-1)\n",
+ "\n",
+ "x_train, x_test, y_train, y_test = train_test_split(x, y, \n",
+ " test_size=0.25,\n",
+ " random_state=1)\n",
+ "\n",
+ "grid.fit(x_train, y_train)\n",
+ "print(grid.best_estimator_.get_params(deep=True))\n",
+ "\n",
+ "tr_score, te_score = score(grid,\n",
+ " train=(x_train, y_train),\n",
+ " test=(x_test, y_test),\n",
+ " pr=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "params = grid.best_estimator_.get_params()\n",
+ "params.pop('warm_start', None)\n",
+ "params.pop('n_jobs', None)\n",
+ "params.pop('n_estimators', None)\n",
+ "params"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Standard random forest"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "steps = np.arange(1, MAX_ESTIMATORS, 10)\n",
+ "\n",
+ "rfc = RandomForestClassifier(warm_start=True,\n",
+ " **params)\n",
+ "\n",
+ "%time train_scores, test_scores = inc_fit(x, y, rfc=rfc, steps=steps)\n",
+ "print(f\"With {len(rfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
+ "plot_auc(steps, train_scores, test_scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### As normal random forest\n",
+ "1 estimator per full subset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "steps = np.arange(1, MAX_ESTIMATORS, 1)\n",
+ "srfc = StreamingRFC(n_estimators_per_chunk=1,\n",
+ " max_n_estimators=np.max(steps),\n",
+ " **params)\n",
+ "\n",
+ "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=1)\n",
+ "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
+ "plot_auc(steps, train_scores, test_scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### Partial random forest\n",
+ "1 estimator per 10 % subset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "steps = np.arange(1, MAX_ESTIMATORS, 1)\n",
+ "srfc = StreamingRFC(n_estimators=1,\n",
+ " max_n_estimators=np.max(steps),\n",
+ " **params)\n",
+ "\n",
+ "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=0.1)\n",
+ "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
+ "plot_auc(steps, train_scores, test_scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### Partial random forest\n",
+ "3 estimators per 10 % subset, but with a third as many steps"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "steps = np.arange(1, MAX_ESTIMATORS, 3)\n",
+ "srfc = StreamingRFC(n_estimators=3,\n",
+ " max_n_estimators=np.max(steps),\n",
+ " **params)\n",
+ "\n",
+ "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=0.1)\n",
+ "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
+ "plot_auc(steps, train_scores, test_scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### Forest of partial decision trees\n",
+ "1 estimator per 10 % subset with all features"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "params.pop('max_features', None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "steps = np.arange(1, MAX_ESTIMATORS, 1)\n",
+ "srfc = StreamingRFC(n_estimators=1,\n",
+ " max_n_estimators=np.max(steps),\n",
+ " max_features=x.shape[1],\n",
+ " **params)\n",
+ "\n",
+ "%time train_scores, test_scores = inc_partial_fit(x, y, srfc=srfc, steps=steps, sample=0.1)\n",
+ "print(f\"With {len(srfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
+ "plot_auc(steps, train_scores, test_scores)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
diff --git a/scripts/PerformanceComparisonsDask.ipynb b/scripts/PerformanceComparisonsDask.ipynb
new file mode 100644
index 0000000..e130615
--- /dev/null
+++ b/scripts/PerformanceComparisonsDask.ipynb
@@ -0,0 +1,649 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "# Performance comparisons\n",
+ "\n",
+ "In memory and out of memory, using dask."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The autoreload extension is already loaded. To reload it, use:\n",
+ " %reload_ext autoreload\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Change dir to repo root if running from repo (rather than pip installed)\n",
+ "# (Assuming running from [repo]/scripts/)\n",
+ "import os\n",
+ "os.chdir('../')\n",
+ "\n",
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "\n",
+ "from typing import Tuple\n",
+ "\n",
+ "from incremental_trees.trees import StreamingRFC\n",
+ "\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import roc_auc_score\n",
+ "\n",
+ "import dask_ml.datasets\n",
+ "from dask_ml.wrappers import Incremental\n",
+ "from dask.distributed import Client, LocalCluster\n",
+ "from dask_ml.model_selection import train_test_split as dask_tts\n",
+ "\n",
+ "import dask as dd\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Settings\n",
+ "MAX_ESTIMATORS = 60 # Lower to run faster"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2023-01-01 21:04:41,365 - distributed.deploy.spec - WARNING - Cluster closed without starting up\n"
+ ]
+ },
+ {
+ "ename": "RuntimeError",
+ "evalue": "Cluster failed to start: Scheduler failed to start.",
+ "output_type": "error",
+ "traceback": [
+ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
+ "\u001B[1;31mOSError\u001B[0m Traceback (most recent call last)",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\distributed\\core.py:524\u001B[0m, in \u001B[0;36mServer.start\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 523\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 524\u001B[0m \u001B[38;5;28;01mawait\u001B[39;00m asyncio\u001B[38;5;241m.\u001B[39mwait_for(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstart_unsafe(), timeout\u001B[38;5;241m=\u001B[39mtimeout)\n\u001B[0;32m 525\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m asyncio\u001B[38;5;241m.\u001B[39mTimeoutError \u001B[38;5;28;01mas\u001B[39;00m exc:\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\asyncio\\tasks.py:442\u001B[0m, in \u001B[0;36mwait_for\u001B[1;34m(fut, timeout, loop)\u001B[0m\n\u001B[0;32m 441\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m timeout \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m--> 442\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;01mawait\u001B[39;00m fut\n\u001B[0;32m 444\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m timeout \u001B[38;5;241m<\u001B[39m\u001B[38;5;241m=\u001B[39m \u001B[38;5;241m0\u001B[39m:\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\distributed\\scheduler.py:3880\u001B[0m, in \u001B[0;36mScheduler.start_unsafe\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 3879\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m addr \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_start_address:\n\u001B[1;32m-> 3880\u001B[0m \u001B[38;5;28;01mawait\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlisten(\n\u001B[0;32m 3881\u001B[0m addr,\n\u001B[0;32m 3882\u001B[0m allow_offload\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m,\n\u001B[0;32m 3883\u001B[0m handshake_overrides\u001B[38;5;241m=\u001B[39m{\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mpickle-protocol\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;241m4\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcompression\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;28;01mNone\u001B[39;00m},\n\u001B[0;32m 3884\u001B[0m \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39msecurity\u001B[38;5;241m.\u001B[39mget_listen_args(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mscheduler\u001B[39m\u001B[38;5;124m\"\u001B[39m),\n\u001B[0;32m 3885\u001B[0m )\n\u001B[0;32m 3886\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mip \u001B[38;5;241m=\u001B[39m get_address_host(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlisten_address)\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\distributed\\core.py:707\u001B[0m, in \u001B[0;36mServer.listen\u001B[1;34m(self, port_or_addr, allow_offload, **kwargs)\u001B[0m\n\u001B[0;32m 706\u001B[0m \u001B[38;5;28;01massert\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(addr, \u001B[38;5;28mstr\u001B[39m)\n\u001B[1;32m--> 707\u001B[0m listener \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mawait\u001B[39;00m listen(\n\u001B[0;32m 708\u001B[0m addr,\n\u001B[0;32m 709\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mhandle_comm,\n\u001B[0;32m 710\u001B[0m deserialize\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mdeserialize,\n\u001B[0;32m 711\u001B[0m allow_offload\u001B[38;5;241m=\u001B[39mallow_offload,\n\u001B[0;32m 712\u001B[0m \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs,\n\u001B[0;32m 713\u001B[0m )\n\u001B[0;32m 714\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlisteners\u001B[38;5;241m.\u001B[39mappend(listener)\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\distributed\\comm\\core.py:212\u001B[0m, in \u001B[0;36mListener.__await__.._\u001B[1;34m()\u001B[0m\n\u001B[0;32m 211\u001B[0m \u001B[38;5;28;01masync\u001B[39;00m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m_\u001B[39m():\n\u001B[1;32m--> 212\u001B[0m \u001B[38;5;28;01mawait\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstart()\n\u001B[0;32m 213\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\distributed\\comm\\tcp.py:580\u001B[0m, in \u001B[0;36mBaseTCPListener.start\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 575\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m 576\u001B[0m \u001B[38;5;66;03m# When shuffling data between workers, there can\u001B[39;00m\n\u001B[0;32m 577\u001B[0m \u001B[38;5;66;03m# really be O(cluster size) connection requests\u001B[39;00m\n\u001B[0;32m 578\u001B[0m \u001B[38;5;66;03m# on a single worker socket, make sure the backlog\u001B[39;00m\n\u001B[0;32m 579\u001B[0m \u001B[38;5;66;03m# is large enough not to lose any.\u001B[39;00m\n\u001B[1;32m--> 580\u001B[0m sockets \u001B[38;5;241m=\u001B[39m \u001B[43mnetutil\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbind_sockets\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 581\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mport\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43maddress\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mip\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mbacklog\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mbacklog\u001B[49m\n\u001B[0;32m 582\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 583\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mOSError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[0;32m 584\u001B[0m \u001B[38;5;66;03m# EADDRINUSE can happen sporadically when trying to bind\u001B[39;00m\n\u001B[0;32m 585\u001B[0m \u001B[38;5;66;03m# to an ephemeral port\u001B[39;00m\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\tornado\\netutil.py:162\u001B[0m, in \u001B[0;36mbind_sockets\u001B[1;34m(port, address, family, backlog, flags, reuse_port)\u001B[0m\n\u001B[0;32m 161\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 162\u001B[0m \u001B[43msock\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbind\u001B[49m\u001B[43m(\u001B[49m\u001B[43msockaddr\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 163\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mOSError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m e:\n",
+ "\u001B[1;31mOSError\u001B[0m: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted",
+ "\nThe above exception was the direct cause of the following exception:\n",
+ "\u001B[1;31mRuntimeError\u001B[0m Traceback (most recent call last)",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\distributed\\deploy\\spec.py:309\u001B[0m, in \u001B[0;36mSpecCluster._start\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 308\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mscheduler \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mcls\u001B[39m(\u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mscheduler_spec\u001B[38;5;241m.\u001B[39mget(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124moptions\u001B[39m\u001B[38;5;124m\"\u001B[39m, {}))\n\u001B[1;32m--> 309\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mscheduler \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mawait\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mscheduler\n\u001B[0;32m 310\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mscheduler_comm \u001B[38;5;241m=\u001B[39m rpc(\n\u001B[0;32m 311\u001B[0m \u001B[38;5;28mgetattr\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mscheduler, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mexternal_address\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;28;01mNone\u001B[39;00m)\n\u001B[0;32m 312\u001B[0m \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mscheduler\u001B[38;5;241m.\u001B[39maddress,\n\u001B[0;32m 313\u001B[0m connection_args\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39msecurity\u001B[38;5;241m.\u001B[39mget_connection_args(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mclient\u001B[39m\u001B[38;5;124m\"\u001B[39m),\n\u001B[0;32m 314\u001B[0m )\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\distributed\\core.py:532\u001B[0m, in \u001B[0;36mServer.start\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 531\u001B[0m \u001B[38;5;28;01mawait\u001B[39;00m _close_on_failure(exc)\n\u001B[1;32m--> 532\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mRuntimeError\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mtype\u001B[39m(\u001B[38;5;28mself\u001B[39m)\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m failed to start.\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mexc\u001B[39;00m\n\u001B[0;32m 533\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstatus \u001B[38;5;241m=\u001B[39m Status\u001B[38;5;241m.\u001B[39mrunning\n",
+ "\u001B[1;31mRuntimeError\u001B[0m: Scheduler failed to start.",
+ "\nThe above exception was the direct cause of the following exception:\n",
+ "\u001B[1;31mRuntimeError\u001B[0m Traceback (most recent call last)",
+ "Cell \u001B[1;32mIn[26], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;66;03m# Prepare dask cluster\u001B[39;00m\n\u001B[1;32m----> 2\u001B[0m cluster \u001B[38;5;241m=\u001B[39m \u001B[43mLocalCluster\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 3\u001B[0m \u001B[43m \u001B[49m\u001B[43mprocesses\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mFalse\u001B[39;49;00m\u001B[43m,\u001B[49m\n\u001B[0;32m 4\u001B[0m \u001B[43m \u001B[49m\u001B[43mn_workers\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m2\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[0;32m 5\u001B[0m \u001B[43m \u001B[49m\u001B[43mthreads_per_worker\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m2\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[0;32m 6\u001B[0m \u001B[43m \u001B[49m\u001B[43mscheduler_port\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m8383\u001B[39;49m\n\u001B[0;32m 7\u001B[0m \u001B[43m)\u001B[49m\n\u001B[0;32m 8\u001B[0m client \u001B[38;5;241m=\u001B[39m Client(cluster)\n\u001B[0;32m 9\u001B[0m client\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\distributed\\deploy\\local.py:253\u001B[0m, in \u001B[0;36mLocalCluster.__init__\u001B[1;34m(self, name, n_workers, threads_per_worker, processes, loop, start, host, ip, scheduler_port, silence_logs, dashboard_address, worker_dashboard_address, diagnostics_port, services, worker_services, service_kwargs, asynchronous, security, protocol, blocked_handlers, interface, worker_class, scheduler_kwargs, scheduler_sync_interval, **worker_kwargs)\u001B[0m\n\u001B[0;32m 250\u001B[0m worker \u001B[38;5;241m=\u001B[39m {\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcls\u001B[39m\u001B[38;5;124m\"\u001B[39m: worker_class, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124moptions\u001B[39m\u001B[38;5;124m\"\u001B[39m: worker_kwargs}\n\u001B[0;32m 251\u001B[0m workers \u001B[38;5;241m=\u001B[39m {i: worker \u001B[38;5;28;01mfor\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(n_workers)}\n\u001B[1;32m--> 253\u001B[0m \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[38;5;21;43m__init__\u001B[39;49m\u001B[43m(\u001B[49m\n\u001B[0;32m 254\u001B[0m \u001B[43m \u001B[49m\u001B[43mname\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mname\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 255\u001B[0m \u001B[43m \u001B[49m\u001B[43mscheduler\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mscheduler\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 256\u001B[0m \u001B[43m \u001B[49m\u001B[43mworkers\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mworkers\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 257\u001B[0m \u001B[43m \u001B[49m\u001B[43mworker\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mworker\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 258\u001B[0m \u001B[43m \u001B[49m\u001B[43mloop\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mloop\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 259\u001B[0m 
\u001B[43m \u001B[49m\u001B[43masynchronous\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43masynchronous\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 260\u001B[0m \u001B[43m \u001B[49m\u001B[43msilence_logs\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43msilence_logs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 261\u001B[0m \u001B[43m \u001B[49m\u001B[43msecurity\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43msecurity\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 262\u001B[0m \u001B[43m \u001B[49m\u001B[43mscheduler_sync_interval\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mscheduler_sync_interval\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 263\u001B[0m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\distributed\\deploy\\spec.py:275\u001B[0m, in \u001B[0;36mSpecCluster.__init__\u001B[1;34m(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name, shutdown_on_close, scheduler_sync_interval)\u001B[0m\n\u001B[0;32m 273\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m called_from_running_loop:\n\u001B[0;32m 274\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_loop_runner\u001B[38;5;241m.\u001B[39mstart()\n\u001B[1;32m--> 275\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msync\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_start\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 276\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m 277\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39msync(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_correct_state)\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\distributed\\utils.py:339\u001B[0m, in \u001B[0;36mSyncMethodMixin.sync\u001B[1;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001B[0m\n\u001B[0;32m 337\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m future\n\u001B[0;32m 338\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m--> 339\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m sync(\n\u001B[0;32m 340\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mloop, func, \u001B[38;5;241m*\u001B[39margs, callback_timeout\u001B[38;5;241m=\u001B[39mcallback_timeout, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs\n\u001B[0;32m 341\u001B[0m )\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\distributed\\utils.py:406\u001B[0m, in \u001B[0;36msync\u001B[1;34m(loop, func, callback_timeout, *args, **kwargs)\u001B[0m\n\u001B[0;32m 404\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m error:\n\u001B[0;32m 405\u001B[0m typ, exc, tb \u001B[38;5;241m=\u001B[39m error\n\u001B[1;32m--> 406\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m exc\u001B[38;5;241m.\u001B[39mwith_traceback(tb)\n\u001B[0;32m 407\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 408\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m result\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\distributed\\utils.py:379\u001B[0m, in \u001B[0;36msync..f\u001B[1;34m()\u001B[0m\n\u001B[0;32m 377\u001B[0m future \u001B[38;5;241m=\u001B[39m asyncio\u001B[38;5;241m.\u001B[39mwait_for(future, callback_timeout)\n\u001B[0;32m 378\u001B[0m future \u001B[38;5;241m=\u001B[39m asyncio\u001B[38;5;241m.\u001B[39mensure_future(future)\n\u001B[1;32m--> 379\u001B[0m result \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01myield\u001B[39;00m future\n\u001B[0;32m 380\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m:\n\u001B[0;32m 381\u001B[0m error \u001B[38;5;241m=\u001B[39m sys\u001B[38;5;241m.\u001B[39mexc_info()\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\tornado\\gen.py:769\u001B[0m, in \u001B[0;36mRunner.run\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 766\u001B[0m exc_info \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[0;32m 768\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 769\u001B[0m value \u001B[38;5;241m=\u001B[39m \u001B[43mfuture\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mresult\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 770\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m:\n\u001B[0;32m 771\u001B[0m exc_info \u001B[38;5;241m=\u001B[39m sys\u001B[38;5;241m.\u001B[39mexc_info()\n",
+ "File \u001B[1;32m~\\anaconda3\\envs\\IncrementalTrees\\lib\\site-packages\\distributed\\deploy\\spec.py:319\u001B[0m, in \u001B[0;36mSpecCluster._start\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 317\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstatus \u001B[38;5;241m=\u001B[39m Status\u001B[38;5;241m.\u001B[39mfailed\n\u001B[0;32m 318\u001B[0m \u001B[38;5;28;01mawait\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_close()\n\u001B[1;32m--> 319\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mRuntimeError\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCluster failed to start: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00me\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01me\u001B[39;00m\n",
+ "\u001B[1;31mRuntimeError\u001B[0m: Cluster failed to start: Scheduler failed to start."
+ ]
+ }
+ ],
+ "source": [
+ "# Prepare dask cluster\n",
+ "cluster = LocalCluster(\n",
+ " processes=False,\n",
+ " n_workers=2,\n",
+ " threads_per_worker=2,\n",
+ " scheduler_port=8383\n",
+ ")\n",
+ "client = Client(cluster)\n",
+ "client"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "# Synthetic data, in memory\n",
+ "\n",
+ "Compare increasing estimators with RandomForest (using warm_start) against Incremental StreamingRFC (dask handles .partial_fit)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "x, y = dask_ml.datasets.make_blobs(\n",
+ " n_samples=1e5,\n",
+ " chunks=1e4,\n",
+ " random_state=0,\n",
+ " n_features=40,\n",
+ " centers=2,\n",
+ " cluster_std=100\n",
+ ")\n",
+ "\n",
+ "x_dd = dd.dataframe.from_array(x, chunksize=1e4)\n",
+ "y_dd = dd.dataframe.from_array(y, chunksize=1e4)\n",
+ "\n",
+ "x_pd = pd.DataFrame(x.persist().compute())\n",
+ "y_pd = pd.DataFrame(y.persist().compute())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "x_pd.memory_usage(deep=True).sum() / 1024 / 1024"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Standard random forest"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def score(mod, train: Tuple[np.array, np.array], test: Tuple[np.array, np.array], pr=False) -> Tuple[float, float]:\n",
+ " \"\"\"Return ROC auc on x_train and x_test (from caller) on mod. Print if requested.\"\"\"\n",
+ "\n",
+ " y_pred_train_proba = mod.predict_proba(train[0])[:, 1]\n",
+ " y_pred_test_proba = mod.predict_proba(test[0])[:, 1]\n",
+ "\n",
+ " roc_train = roc_auc_score(train[1], y_pred_train_proba)\n",
+ " roc_test = roc_auc_score(test[1], y_pred_test_proba)\n",
+ " if pr:\n",
+ " print(f\"n_ests: {len(rfc.estimators_)}\")\n",
+ " print(f'Train AUC: {roc_train}')\n",
+ " print(f'Test AUC: {roc_test}')\n",
+ "\n",
+ " return roc_train, roc_test\n",
+ "\n",
+ "\n",
+ "def score_dask(mod, train: Tuple[np.array, np.array], test: Tuple[np.array, np.array], pr=False) -> Tuple[float, float]:\n",
+ " \"\"\"Score model using available dask metric (accuracy).\"\"\"\n",
+ "\n",
+ " roc_train = mod.score(train[0], train[1])\n",
+ " roc_test = mod.score(test[0], test[1])\n",
+ " if pr:\n",
+ " print(f\"n_ests: {len(rfc.estimators_)}\")\n",
+ " print(f'Train AUC: {roc_train}')\n",
+ " print(f'Test AUC: {roc_test}')\n",
+ "\n",
+ " return roc_train, roc_test\n",
+ "\n",
+ "\n",
+ "def multiple_fit(x: np.array, y: np.array, steps=np.arange(1, 101, 2), sample: int = 1):\n",
+ " \"\"\"\n",
+ " Fit a random forest model with an increasing number of estimators.\n",
+ " \n",
+ " This version doesn't use warm start and refits the model from scratch each iteration.\n",
+ " This is for the sake of comparing timings to dask function below.\n",
+ " \n",
+ " :param steps: Range to iterate over. Sets total number of estimators that will be fit in model\n",
+ " after each iteration. Should be range with constant step size.\n",
+ " :param sample: Proportion of randomly sampled training data to use on each partial_fit call.\n",
+ " If sample = 1, all training data is used on each iteration,\n",
+ " so should behave as standard random forest. Default = 1 (100%).\n",
+ " \"\"\"\n",
+ "\n",
+ " x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)\n",
+ "\n",
+ " train_scores = []\n",
+ " test_scores = []\n",
+ " for s in steps:\n",
+ " # Fit full model on each iteration\n",
+ " rfc = RandomForestClassifier(warm_start=False)\n",
+ "\n",
+ " # Fit model with these n ests\n",
+ " rfc.set_params(n_estimators=s)\n",
+ " rfc.fit(x_train, y_train)\n",
+ "\n",
+ " tr_score, te_score = score(rfc, train=(x_train, y_train), test=(x_test, y_test), pr=False)\n",
+ "\n",
+ " train_scores.append(tr_score)\n",
+ " test_scores.append(te_score)\n",
+ "\n",
+ " return rfc, train_scores, test_scores\n",
+ "\n",
+ "\n",
+ "def plot_auc(steps, train_scores, test_scores):\n",
+ " \"\"\"Plot the train and test auc scores vs total number of model estimators\"\"\"\n",
+ "\n",
+ " plt.figure(figsize=(4, 4))\n",
+ " plt.plot(steps, train_scores)\n",
+ " plt.plot(steps, test_scores)\n",
+ " plt.xlabel('n_estimators')\n",
+ " plt.ylabel('auc')\n",
+ " plt.legend(['train', 'test'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "steps = np.arange(1, MAX_ESTIMATORS, 4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "rfc = RandomForestClassifier(warm_start=True)\n",
+ "\n",
+ "%time rfc, train_scores, test_scores = multiple_fit(x_pd.values, y_pd.values.squeeze(), steps=steps)\n",
+ "\n",
+ "print(f\"With {len(rfc.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
+ "plot_auc(steps, train_scores, test_scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Single incremental forest specs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "x_train, x_test, y_train, y_test = dask_tts(x, y, test_size=0.25)\n",
+ "\n",
+ "x_train.chunks"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### Incremental forest\n",
+ "1 estimator per subset, 10 % per chunk, 1 pass through data.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "srfc = Incremental(StreamingRFC(n_estimators_per_chunk=1, max_n_estimators=np.inf))\n",
+ "\n",
+ "srfc.fit(x_train, y_train, classes=[0, 1])\n",
+ "\n",
+ "tr_score, te_score = score(srfc, train=(x_train, y_train), test=(x_test, y_test), pr=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### Incremental forest\n",
+ "20 estimators per subset (different features), 10 % per chunk, 1 pass through data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "srfc = Incremental(StreamingRFC(n_estimators_per_chunk=20, max_n_estimators=np.inf))\n",
+ "\n",
+ "srfc.fit(x_train, y_train, classes=[0, 1])\n",
+ "\n",
+ "tr_score, te_score = score(srfc, train=(x_train, y_train), test=(x_test, y_test), pr=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### Forest of partial decision trees\n",
+ "1 estimator per subset with all features, 10 % per chunk, 1 pass through data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "srfc = Incremental(StreamingRFC(\n",
+ " n_estimators_per_chunk=1,\n",
+ " max_n_estimators=np.max(steps),\n",
+ " max_features=x.shape[1])\n",
+ ")\n",
+ "\n",
+ "srfc.fit(x_train, y_train,\n",
+ " classes=[0, 1])\n",
+ "\n",
+ "tr_score, te_score = score(srfc, train=(x_train, y_train), test=(x_test, y_test), pr=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### Forest of partial decision trees\n",
+ "20 estimators per subset with all features, 10 % per chunk, 1 pass through data.\n",
+ "\n",
+ "Extra estimators shouldn't help here?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "srfc = Incremental(StreamingRFC(\n",
+ " n_estimators_per_chunk=20,\n",
+ " max_n_estimators=np.max(steps),\n",
+ " max_features=x.shape[1])\n",
+ ")\n",
+ "\n",
+ "srfc.fit(x_train, y_train, classes=[0, 1])\n",
+ "\n",
+ "tr_score, te_score = score(srfc, train=(x_train, y_train), test=(x_test, y_test), pr=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### n estimators per chunk vs performance\n",
+ "\n",
+ "Effect of increasing estimators per subset (with different set ups)\n",
+ "\n",
+ "The function here adds Incremental to the supplied model, and uses .fit to refit the full model in each iteration.\n",
+ "\n",
+ "The other functions (above and in PerformanceComparisons.ipynb) do incremental fits using warm start (either directly or via .partial_fit). \n",
+ "\n",
+ "This means the timing information cannot be directly compared!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def multiple_dask_fit(x: np.ndarray, y: np.ndarray, steps=np.arange(1, 101, 2),\n",
+ " **kwargs) -> None:\n",
+ " \"\"\"\n",
+ " Fit increasing number of estimators using .partial_fit on a subsample of the training data.\n",
+ " Uses Dask by adding Incremental to model and calling fit. This refits the whole model on each\n",
+ " iteration, so will be slower than the other test functions. Timing this function can only be compared\n",
+ " to other calls of this function.\n",
+ " \n",
+ " The data passed to the Random forest fit by partial_fit is handled by dask and is sequential batches\n",
+ " of data, rather than random samples (as used by inc_partial_fit in PerformanceComparisons.ipynb).\n",
+ " \n",
+ " StreamingRFC.n_estimators: Number of estimators that will be fit in each step. Set from first\n",
+ " difference in range (ie. range[1]-range[0])\n",
+ " StreamingRFC.max_n_estimators: Limit on number of estimators that will be fit in model. Should be >\n",
+ " range[-1].\n",
+ " \n",
+ " :param steps: Range to iterate over. Sets total number of estimators that will be fit in model\n",
+ " after each iteration. Should be range with constant step size.\n",
+ " \"\"\"\n",
+ "\n",
+ " x_train, x_test, y_train, y_test = dask_tts(x, y, test_size=0.25)\n",
+ "\n",
+ " train_scores = []\n",
+ " test_scores = []\n",
+ " for s in steps:\n",
+ " # Create fresh model each iteration\n",
+ " srfc_ = StreamingRFC(n_estimators_per_chunk=s, max_n_estimators=np.inf, **kwargs)\n",
+ "\n",
+ " # Add Incremental\n",
+ " srfc_ = Incremental(srfc_)\n",
+ "\n",
+ " # Fit model with these n ests\n",
+ " # From scratch each time\n",
+ " srfc_.fit(x_train, y_train,\n",
+ " classes=[0, 1])\n",
+ "\n",
+ " tr_score, te_score = score(\n",
+ " srfc_,\n",
+ " train=(x_train, y_train),\n",
+ " test=(x_test, y_test),\n",
+ " pr=False)\n",
+ "\n",
+ " train_scores.append(tr_score)\n",
+ " test_scores.append(te_score)\n",
+ "\n",
+ " return srfc_, train_scores, test_scores"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### Incremental forest\n",
+ "*range* estimators per subset (different features), 10 % per chunk, 1 pass through data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "steps = np.arange(1, MAX_ESTIMATORS, 6)\n",
+ "\n",
+ "%time final_est, train_scores, test_scores = multiple_dask_fit(x, y, steps=steps)\n",
+ "print(f\"With {len(final_est.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
+ "plot_auc(steps, train_scores, test_scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "#### Forest of partial decision trees\n",
+ "*range* estimators per subset with all features, 10 % per chunk, 1 pass through data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "steps = np.arange(1, MAX_ESTIMATORS, 6)\n",
+ "\n",
+ "%time final_est, train_scores, test_scores = multiple_dask_fit(x, y, steps=steps, max_features=x.shape[1])\n",
+ "print(f\"With {len(final_est.estimators_)}: {train_scores[-1]} | {test_scores[-1]}\")\n",
+ "plot_auc(steps, train_scores, test_scores)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/tests/integration/incremental_trees/__init__.py b/scripts/__init__.py
similarity index 100%
rename from tests/integration/incremental_trees/__init__.py
rename to scripts/__init__.py
diff --git a/scripts/example_dask.py b/scripts/example_dask.py
new file mode 100644
index 0000000..f443640
--- /dev/null
+++ b/scripts/example_dask.py
@@ -0,0 +1,40 @@
+import dask as dd
+import dask_ml.cluster
+import dask_ml.datasets
+import numpy as np
+from dask.distributed import Client, LocalCluster
+from dask_ml.wrappers import Incremental
+
+from scripts.trees import StreamingRFC
+
+
+def run_on_blobs():
+ x, y = dask_ml.datasets.make_blobs(n_samples=1e8, chunks=1e5, random_state=0, centers=3)
+
+ x = dd.dataframe.from_array(x)
+ y = dd.dataframe.from_array(y)
+
+ print(f"Rows: {x.shape[0].compute()}")
+
+ ests_per_chunk = 4
+ chunks = len(x.divisions)
+ print(f"n chunks: {chunks}")
+
+ srfc = Incremental(StreamingRFC(
+ n_estimators_per_chunk=ests_per_chunk,
+ max_n_estimators=np.inf,
+ verbose=1,
+ n_jobs=4)
+ )
+ srfc.fit(x, y, classes=y.unique().compute())
+
+
+if __name__ == "__main__":
+ # Create, connect, and run on local cluster.
+ with LocalCluster(processes=False,
+ n_workers=2,
+ threads_per_worker=2,
+ scheduler_port=8080,
+ diagnostics_port=8081) as cluster, Client(cluster) as client:
+ print(client)
+ run_on_blobs()
diff --git a/scripts/example_fit.py b/scripts/example_fit.py
new file mode 100644
index 0000000..b952ea0
--- /dev/null
+++ b/scripts/example_fit.py
@@ -0,0 +1,21 @@
+import numpy as np
+from sklearn.datasets import make_blobs
+
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC
+
+if __name__ == "__main__":
+ # Generate some data in memory
+ x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40, centers=2, cluster_std=100)
+
+ srfc = StreamingRFC(
+ n_estimators_per_chunk=3,
+ max_n_estimators=np.inf,
+ spf_n_fits=30, # Number of calls to .partial_fit()
+ spf_sample_prop=0.3 # Proportion of rows to sample on each call to .partial_fit()
+ )
+
+ srfc.fit(x, y, sample_weight=np.ones_like(y)) # Optional
+
+ # Should be n_estimators_per_chunk * spf_n_fits
+ print(len(srfc.estimators_))
+ print(srfc.score(x, y))
diff --git a/example_partial_fit.py b/scripts/example_partial_fit.py
similarity index 81%
rename from example_partial_fit.py
rename to scripts/example_partial_fit.py
index 5a21c7c..17e8b82 100644
--- a/example_partial_fit.py
+++ b/scripts/example_partial_fit.py
@@ -4,13 +4,10 @@
from incremental_trees.models.classification.streaming_rfc import StreamingRFC
if __name__ == "__main__":
- srfc = StreamingRFC(n_estimators_per_chunk=20,
- max_n_estimators=np.inf,
- n_jobs=8)
+ srfc = StreamingRFC(n_estimators_per_chunk=20, max_n_estimators=np.inf, n_jobs=8)
# Generate some data in memory
- x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40,
- centers=2, cluster_std=100)
+ x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40, centers=2, cluster_std=100)
# Feed .partial_fit() with random samples of the data
n_chunks = 30
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
new file mode 100644
index 0000000..fba4851
--- /dev/null
+++ b/scripts/requirements.txt
@@ -0,0 +1,6 @@
+ipykernel
+matplotlib
+jupyter
+jupyterlab
+pandas
+bokeh<3
diff --git a/setup.py b/setup.py
index 60af819..a896d1f 100644
--- a/setup.py
+++ b/setup.py
@@ -8,14 +8,19 @@
setuptools.setup(name='incremental_trees',
version=__version__,
author="Gareth Jones",
- author_email="author@example.com",
+ author_email="garethgithub@gmail.com",
description='Sklearn forests with partial fits',
long_description=long_description,
long_description_content_type="text/markdown",
packages=setuptools.find_packages(),
url="https://github.com/garethjns/IncrementalTrees",
- install_requires=["scikit-learn>=0.22", "pandas",
- "dask>=2",
- "dask-glm==0.2.0",
- "dask-ml>=1",
- "bokeh"])
+ python_requires='>=3.8',
+ install_requires=[
+ "scikit-learn==1.2",
+ "pandas",
+ "numpy",
+ "dask==2022.12",
+ "dask-glm==0.2.0",
+ "dask-ml==2022.5.27",
+ "fsspec"
+ ])
diff --git a/tests/integration/base.py b/tests/integration/base.py
deleted file mode 100644
index 80135b2..0000000
--- a/tests/integration/base.py
+++ /dev/null
@@ -1,240 +0,0 @@
-import numpy as np
-import sklearn
-from dask_ml.datasets import make_blobs, make_regression
-from distributed import LocalCluster, Client
-from sklearn import clone
-from sklearn.model_selection import RandomizedSearchCV
-
-
-class PredictTests:
- def test_predict(self):
- """
- Test prediction function runs are returns expected shape, even if all classes are not in prediction set.
- :return:
- """
-
- # Predict on all data
- preds = self.mod.predict(self.x)
- self.assertEqual(preds.shape, (self.x.shape[0],))
-
- # Predict on single row
- preds = self.mod.predict(self.x[0, :].reshape(1, -1))
- self.assertEqual(preds.shape, (1,))
-
- def test_predict_proba(self):
- """
- Test prediction function runs are returns expected shape, even if all classes are not in prediction set.
- :return:
- """
- if getattr(self.mod, 'predict_proba', False) is False:
- # No predict_proba for this model type
- pass
- else:
- # Predict on all data
- preds = self.mod.predict_proba(self.x)
- self.assertEqual(preds.shape, (self.x.shape[0], 2))
-
- # Predict on single row
- preds = self.mod.predict_proba(self.x[0, :].reshape(1, -1))
- self.assertEqual(preds.shape, (1, 2))
-
- def test_score(self):
- self.mod.score(self.x, self.y)
-
-
-class PartialFitTests(PredictTests):
- """
- Standard tests to run on supplied model and data.
-
- Inherit this into a class with model/data defined in setUpClass into self.mod, self.x, self.y. Then call the
- setupClass method here to set some helper values.
-
- These tests need to run in order, as self.mod used through tests. Maybe would be better to mock it each time,
- but lazy....
-
- These are run without using Dask, so the subset passing to partial_fit is handled manually.
- """
-
- @classmethod
- def setUpClass(cls) -> None:
- """
- Set helper values from specified model/data. Need to super this from child setUpClass.
- """
- cls.chunk_size = 10
- cls.n_chunks = int(cls.n_samples / cls.chunk_size)
- cls.samples_per_chunk = int(cls.n_samples / cls.n_chunks)
-
- # Cursor will be tracked through data between tests.
- cls.s_idx = 0
- cls.e_idx = cls.samples_per_chunk
-
- def test_first_partial_fit_call(self):
- """
- Call partial_fit for the first time on self.mod.
- :return:
- """
- # Call the first partial fit specifying classes
- self.mod.partial_fit(self.x[self.s_idx:self.e_idx, :],
- self.y[self.s_idx:self.e_idx],
- classes=np.unique(self.y))
-
- def test_next_partial_fit_calls(self):
- """
- Call partial fit on remaining chunks.
-
- Provide classes again on second iteration, otherwise don't.
-
- :return:
- """
- for i in range(1, self.n_chunks):
- self.mod.partial_fit(self.x[self.s_idx:self.e_idx, :],
- self.y[self.s_idx:self.e_idx],
- classes=np.unique(self.y) if i == 2 else None)
-
- self.s_idx = self.e_idx
- self.e_idx = self.s_idx + self.samples_per_chunk
-
- # Set expected number of esitmators in class set up
- # Check it matches with parameters
- expect_ = min((self.mod.n_estimators_per_chunk * self.n_chunks), self.mod.max_n_estimators)
- self.assertEqual(expect_, self.expected_n_estimators)
- # Then check the model matches the validated expectation
- self.assertEqual(len(self.mod.estimators_), self.expected_n_estimators)
-
- def test_result(self):
- """Test performance of model is approximately as expected."""
- pass
-
-
-class FitTests(PredictTests):
- """
- Test direct calls to.fit with dask off, which will use ._sampled_partial_fit() to feed partial_fit.
- """
-
- @classmethod
- def setUpClass(cls):
- """
- Set helper actual model from specified values. Need to super this from child setUpClass.
- :return:
- """
- cls.expected_n_estimators = cls.spf_n_fits * cls.n_estimators_per_sample
-
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=cls.n_samples,
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- cls.grid = RandomizedSearchCV(clone(cls.mod),
- scoring='roc_auc',
- cv=2,
- n_iter=3,
- verbose=10,
- param_distributions={'spf_sample_prop': [0.1, 0.2, 0.3],
- 'spf_n_fits': [10, 20, 30]},
- n_jobs=-1)
-
- def test_fit__sampled_partial_fit(self):
- """With dask off, call .fit directly."""
- self.mod.fit(self.x, self.y)
-
- def test_n_estimators(self):
- self.assertEqual(self.expected_n_estimators, len(self.mod.estimators_))
-
- def test_grid_search(self):
- """With dask off, try with sklearn GS."""
- self.grid.fit(self.x, self.y)
- self.grid.score(self.x, self.y)
-
-
-class DaskTests:
- """
- Standard tests to run on supplied model and data.
-
- Inherit this into a class with model/data defined in setUpClass into self.mod, self.x, self.y. Then call the
- setupClass method here to set some helper values.
-
- These tests need to run in order, as self.mod used through tests. Maybe would be better to mock it each time,
- but lazy....
-
- These are run without using Dask, so the subset passing to partial_fit is handled manually.
- """
-
- @classmethod
- def setUpClass(cls):
- """
- Set helper values from specified model/data. Need to super this from child setUpClass.
- :return:
- """
- """
- Prepare dask connection once.
- """
-
- try:
- cls.cluster = LocalCluster(processes=True,
- n_workers=4,
- threads_per_worker=2,
- scheduler_port=8586,
- diagnostics_port=8587)
- except (OSError, AttributeError):
- cls.cluster = 'localhost:8586'
-
- cls.client = Client(cls.cluster)
-
- # Set helper valuez
- cls.samples_per_chunk = int(cls.n_samples / cls.n_chunks)
-
- def _prep_data(self, reg=False):
- self.n_samples = int(1e5)
- self.chunk_size = int(1e4)
- self.n_chunks = np.ceil(self.n_samples / self.chunk_size).astype(int)
-
- if reg:
- self.x, self.y = make_regression(n_samples=self.n_samples,
- chunks=self.chunk_size,
- random_state=0,
- n_features=40)
- else:
- self.x, self.y = make_blobs(n_samples=self.n_samples,
- chunks=self.chunk_size,
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- return self
-
- @classmethod
- def tearDownClass(cls) -> None:
- cls.client.close()
- if type(cls.cluster) != str:
- cls.cluster.close()
-
- def test_fit(self):
- """Test the supplied model by wrapping with dask Incremental and calling .fit."""
- self.mod.fit(self.x, self.y,
- classes=np.unique(self.y).compute())
-
- # Set expected number of estimators in class set up
- # Check it matches with parameters
- expect_ = min((self.mod.estimator.n_estimators_per_chunk * self.n_chunks), self.mod.estimator.max_n_estimators)
- self.assertEqual(expect_, self.expected_n_estimators)
- # Then check the model matches the validated expectation
- self.assertEqual(len(self.mod.estimators_), self.expected_n_estimators)
-
- def test_predict(self):
- """
- Test prediction function runs are returns expected shape, even if all classes are not in prediction set.
- :return:
- """
-
- # Predict on all data
- self.mod.predict(self.x)
-
- # Predict on single row
- self.mod.predict(self.x[0, :].reshape(1, -1))
-
- def test_result(self):
- """Test performance of model is approximately as expected."""
- pass
diff --git a/tests/unit/incremental_trees/__init__.py b/tests/integration/base/__init__.py
similarity index 100%
rename from tests/unit/incremental_trees/__init__.py
rename to tests/integration/base/__init__.py
diff --git a/tests/integration/incremental_trees/test_trees_inconsistent_classes.py b/tests/integration/base/class_consistency_test_base.py
similarity index 65%
rename from tests/integration/incremental_trees/test_trees_inconsistent_classes.py
rename to tests/integration/base/class_consistency_test_base.py
index 86dfd35..d725d16 100644
--- a/tests/integration/incremental_trees/test_trees_inconsistent_classes.py
+++ b/tests/integration/base/class_consistency_test_base.py
@@ -1,13 +1,16 @@
import unittest
+from typing import Union
import numpy as np
import pandas as pd
from incremental_trees.models.classification.streaming_extc import StreamingEXTC
-from incremental_trees.trees import StreamingRFC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC
-class ClassConsistencyTests:
+class ClassConsistencyTestBase(unittest.TestCase):
+ mod: Union[StreamingEXTC, StreamingRFC]
+
@classmethod
def setUpClass(cls):
data = pd.DataFrame({'a': (1, 2, 3, 4, 5),
@@ -23,8 +26,7 @@ def setUpClass(cls):
def test_none_on_second_call(self):
# Fit with 2 classes
- self.mod.partial_fit(self.x[0:6], self.y[0:6],
- classes=np.array([1, 2, 3]))
+ self.mod.partial_fit(self.x[0:6], self.y[0:6], classes=np.array([1, 2, 3]))
self.mod.predict(self.x[0:6])
self.assertEqual(self.mod.n_classes_, 3)
@@ -39,16 +41,14 @@ def test_none_on_second_call(self):
def test_correct_on_second_call(self):
# Fit with 2 classes
- self.mod.partial_fit(self.x[0:6], self.y[0:6],
- classes=np.array([1, 2, 3]))
+ self.mod.partial_fit(self.x[0:6], self.y[0:6], classes=np.array([1, 2, 3]))
self.mod.predict(self.x[0:6])
self.assertEqual(self.mod.n_classes_, 3)
self.assertListEqual(list(self.mod.classes_), [1, 2, 3])
# Fit with 3 classes
- self.mod.partial_fit(self.x, self.y,
- classes=np.array([1, 2, 3]))
+ self.mod.partial_fit(self.x, self.y, classes=np.array([1, 2, 3]))
self.mod.predict(self.x)
self.assertEqual(self.mod.n_classes_, 3)
@@ -58,16 +58,14 @@ def test_incorrect_on_second_call(self):
"""Incorrect on second call - can happen when dask passes classes."""
# Fit with 3 classes
- self.mod.partial_fit(self.x, self.y,
- classes=np.array([1, 2, 3]))
+ self.mod.partial_fit(self.x, self.y, classes=np.array([1, 2, 3]))
self.mod.predict(self.x)
self.assertEqual(self.mod.n_classes_, 3)
self.assertListEqual(list(self.mod.classes_), [1, 2, 3])
# Fit with 2 classes
- self.mod.partial_fit(self.x[0:6], self.y[0:6],
- classes=np.array([1, 2]))
+ self.mod.partial_fit(self.x[0:6], self.y[0:6], classes=np.array([1, 2]))
self.mod.predict(self.x[0:6])
self.assertEqual(self.mod.n_classes_, 3)
@@ -81,15 +79,3 @@ def test_incorrect_on_second_call(self):
self.assertListEqual(list(self.mod.classes_), [1, 2, 3])
-class TestInconsistentClassesRFC(ClassConsistencyTests, unittest.TestCase):
- def setUp(self):
- self.mod = StreamingRFC(n_estimators_per_chunk=1,
- max_n_estimators=np.inf,
- verbose=2)
-
-
-class TestInconsistentClassesEXT(ClassConsistencyTests, unittest.TestCase):
- def setUp(self):
- self.mod = StreamingEXTC(n_estimators_per_chunk=1,
- max_n_estimators=np.inf,
- verbose=2)
diff --git a/tests/integration/base/dask_test_base.py b/tests/integration/base/dask_test_base.py
new file mode 100644
index 0000000..b30e4ed
--- /dev/null
+++ b/tests/integration/base/dask_test_base.py
@@ -0,0 +1,102 @@
+import unittest
+from typing import Union
+
+import numpy as np
+from dask_ml.datasets import make_blobs, make_regression
+from dask_ml.wrappers import Incremental
+from distributed import LocalCluster, Client
+
+
+class DaskTestBase(unittest.TestCase):
+ client: Client
+ cluster: Union[str, LocalCluster]
+ n_samples: int
+ n_chunks: int
+ n_samples: int
+ mod: Incremental
+ reg: bool
+ expected_n_estimators: int
+ x: np.ndarray
+ y: np.ndarray
+
+ """
+ Standard tests to run on supplied model and data.
+
+ Inherit this into a class with model/data defined in setUpClass into self.mod, self.x, self.y. Then call the
+ setUpClass method here to set some helper values.
+
+ These tests need to run in order, as self.mod used through tests. Maybe would be better to mock it each time,
+ but lazy....
+
+ These are run using Dask, so the Incremental wrapper handles passing each chunk to partial_fit.
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ """
+ Set helper values from specified model/data. Need to super this from child setUpClass.
+ :return:
+ """
+ """
+ Prepare dask connection once.
+ """
+
+ try:
+ cls.cluster = LocalCluster(
+ processes=True,
+ n_workers=4,
+ threads_per_worker=2,
+ scheduler_port=8586,
+ diagnostics_port=8587
+ )
+ except (OSError, AttributeError):
+ cls.cluster = 'localhost:8586'
+
+ cls.client = Client(cls.cluster)
+ cls.samples_per_chunk = int(cls.n_samples / cls.n_chunks)
+
+ def _prep_data(self, reg: bool):
+ self.n_samples = int(1e5)
+ self.chunk_size = int(1e4)
+ self.n_chunks = np.ceil(self.n_samples / self.chunk_size).astype(int)
+
+ if reg:
+ self.x, self.y = make_regression(
+ n_samples=self.n_samples,
+ chunks=self.chunk_size,
+ random_state=0,
+ n_features=40)
+ else:
+ self.x, self.y = make_blobs(
+ n_samples=self.n_samples,
+ chunks=self.chunk_size,
+ random_state=0,
+ n_features=40,
+ centers=2,
+ cluster_std=100)
+
+ return self
+
+ @classmethod
+ def tearDownClass(cls) -> None:
+ cls.client.close()
+ if type(cls.cluster) != str:
+ cls.cluster.close()
+
+ def test_fit_predict(self):
+ """Test the supplied model by wrapping with dask Incremental and calling .fit."""
+
+ # Act
+ self.mod.fit(self.x, self.y, classes=np.unique(self.y).compute())
+ preds = self.mod.predict(self.x)
+ single_pred = self.mod.predict(self.x[0, :].reshape(1, -1))
+
+ # Assert
+ # Set expected number of estimators in class set up
+ # Check it matches with parameters
+ expect_ = min((self.mod.estimator.n_estimators_per_chunk * self.n_chunks), self.mod.estimator.max_n_estimators)
+ self.assertEqual(expect_, self.expected_n_estimators)
+ # Then check the model matches the validated expectation
+ self.assertEqual(len(self.mod.estimators_), self.expected_n_estimators)
+ self.assertEqual(self.x.shape[0], preds.shape[0])
+ self.assertEqual(1, len(single_pred))
diff --git a/tests/integration/base/fit_test_base.py b/tests/integration/base/fit_test_base.py
new file mode 100644
index 0000000..8fe42f8
--- /dev/null
+++ b/tests/integration/base/fit_test_base.py
@@ -0,0 +1,60 @@
+from typing import Union
+
+import sklearn
+from sklearn import clone
+from sklearn.model_selection import RandomizedSearchCV
+
+from incremental_trees.models.classification.streaming_extc import StreamingEXTC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC
+from incremental_trees.models.regression.streaming_extr import StreamingEXTR
+from incremental_trees.models.regression.streaming_rfr import StreamingRFR
+from tests.integration.base.predict_test_base import PredictTestBase
+
+
+class FitTestBase(PredictTestBase):
+ """
+ Test direct calls to .fit with dask off, which will use ._sampled_partial_fit() to feed partial_fit.
+ """
+
+ spf_n_fits: int
+ n_samples: int
+ n_estimators_per_sample: int
+ mod: Union[StreamingEXTC, StreamingEXTR, StreamingRFC, StreamingRFR]
+ dask_feeding: bool = False
+ spf_sample_prop: float = 0.1
+
+ @classmethod
+ def setUpClass(cls):
+ """
+ Set helper actual model from specified values. Need to super this from child setUpClass.
+ :return:
+ """
+ cls.expected_n_estimators = cls.spf_n_fits * cls.n_estimators_per_sample
+
+ cls.n_samples = 1000
+ cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=cls.n_samples,
+ random_state=0,
+ n_features=40,
+ centers=2,
+ cluster_std=100)
+
+ cls.grid = RandomizedSearchCV(clone(cls.mod),
+ scoring='roc_auc',
+ cv=2,
+ n_iter=3,
+ verbose=10,
+ param_distributions={'spf_sample_prop': [0.1, 0.2, 0.3],
+ 'spf_n_fits': [10, 20, 30]},
+ n_jobs=-1)
+
+ def test_fit__sampled_partial_fit(self):
+ """With dask off, call .fit directly."""
+ self.mod.fit(self.x, self.y)
+
+ def test_n_estimators(self):
+ self.assertEqual(self.expected_n_estimators, len(self.mod.estimators_))
+
+ def test_grid_search(self):
+ """With dask off, try with sklearn GS."""
+ self.grid.fit(self.x, self.y)
+ self.grid.score(self.x, self.y)
diff --git a/tests/integration/base/partial_fit_test_base.py b/tests/integration/base/partial_fit_test_base.py
new file mode 100644
index 0000000..5267e7b
--- /dev/null
+++ b/tests/integration/base/partial_fit_test_base.py
@@ -0,0 +1,78 @@
+from typing import Union
+
+import numpy as np
+
+from incremental_trees.models.classification.streaming_extc import StreamingEXTC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC
+from incremental_trees.models.regression.streaming_extr import StreamingEXTR
+from incremental_trees.models.regression.streaming_rfr import StreamingRFR
+from tests.integration.base.predict_test_base import PredictTestBase
+
+
+class PartialFitTestBase(PredictTestBase):
+ mod: Union[StreamingEXTC, StreamingEXTR, StreamingRFC, StreamingRFR]
+ x: np.ndarray
+ y: np.ndarray
+ n_samples: int
+ chunk_size: int
+ n_chunks: int
+ samples_per_chunk: int
+ expected_n_estimators: int
+
+ """
+ Standard tests to run on supplied model and data.
+
+ Inherit this into a class with model/data defined in setUpClass into self.mod, self.x, self.y. Then call the
+ setUpClass method here to set some helper values.
+
+ These tests need to run in order, as self.mod used through tests. Maybe would be better to mock it each time,
+ but lazy....
+
+ These are run without using Dask, so the subset passing to partial_fit is handled manually.
+ """
+
+ @classmethod
+ def setUpClass(cls) -> None:
+ """
+ Set helper values from specified model/data. Need to super this from child setUpClass.
+ """
+ cls.chunk_size = 10
+ cls.n_chunks = int(cls.n_samples / cls.chunk_size)
+ cls.samples_per_chunk = int(cls.n_samples / cls.n_chunks)
+
+ # Cursor will be tracked through data between tests.
+ cls.s_idx = 0
+ cls.e_idx = cls.samples_per_chunk
+
+ def test_first_partial_fit_call(self):
+ """
+ Call partial_fit for the first time on self.mod.
+ :return:
+ """
+ # Call the first partial fit specifying classes
+ self.mod.partial_fit(self.x[self.s_idx:self.e_idx, :],
+ self.y[self.s_idx:self.e_idx],
+ classes=np.unique(self.y))
+
+ def test_next_partial_fit_calls(self):
+ """
+ Call partial fit on remaining chunks.
+
+ Provide classes again on second iteration, otherwise don't.
+
+ :return:
+ """
+ for i in range(1, self.n_chunks):
+ self.mod.partial_fit(self.x[self.s_idx:self.e_idx, :],
+ self.y[self.s_idx:self.e_idx],
+ classes=np.unique(self.y) if i == 2 else None)
+
+ self.s_idx = self.e_idx
+ self.e_idx = self.s_idx + self.samples_per_chunk
+
+ # Set expected number of estimators in class set up
+ # Check it matches with parameters
+ expect_ = min((self.mod.n_estimators_per_chunk * self.n_chunks), self.mod.max_n_estimators)
+ self.assertEqual(expect_, self.expected_n_estimators)
+ # Then check the model matches the validated expectation
+ self.assertEqual(len(self.mod.estimators_), self.expected_n_estimators)
diff --git a/tests/integration/base/predict_test_base.py b/tests/integration/base/predict_test_base.py
new file mode 100644
index 0000000..fba88f1
--- /dev/null
+++ b/tests/integration/base/predict_test_base.py
@@ -0,0 +1,50 @@
+import unittest
+from typing import Union
+
+import numpy as np
+
+from incremental_trees.models.classification.streaming_extc import StreamingEXTC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC
+from incremental_trees.models.regression.streaming_extr import StreamingEXTR
+from incremental_trees.models.regression.streaming_rfr import StreamingRFR
+
+
+class PredictTestBase(unittest.TestCase):
+ x: np.ndarray
+ y: np.ndarray
+ mod: Union[StreamingEXTC, StreamingEXTR, StreamingRFC, StreamingRFR]
+
+ def test_predict(self):
+ """
+ Test prediction function runs and returns expected shape, even if all classes are not in prediction set.
+ :return:
+ """
+
+ # Predict on all data
+ preds = self.mod.predict(self.x)
+ self.assertEqual(preds.shape, (self.x.shape[0],))
+
+ # Predict on single row
+ preds = self.mod.predict(self.x[0, :].reshape(1, -1))
+ self.assertEqual(preds.shape, (1,))
+
+ def test_predict_proba(self):
+ """
+ Test prediction function runs and returns expected shape, even if all classes are not in prediction set.
+ :return:
+ """
+ if getattr(self.mod, 'predict_proba', False) is False:
+ # No predict_proba for this model type
+ pass
+ else:
+ # Predict on all data
+ preds = self.mod.predict_proba(self.x)
+ self.assertEqual(preds.shape, (self.x.shape[0], 2))
+
+ # Predict on single row
+ preds = self.mod.predict_proba(self.x[0, :].reshape(1, -1))
+ self.assertEqual(preds.shape, (1, 2))
+
+ def test_score(self):
+ score = self.mod.score(self.x, self.y)
+ self.assertIsInstance(score, float)
diff --git a/tests/integration/classification/__init__.py b/tests/integration/classification/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/integration/classification/test_streaming_extc.py b/tests/integration/classification/test_streaming_extc.py
new file mode 100644
index 0000000..f86e516
--- /dev/null
+++ b/tests/integration/classification/test_streaming_extc.py
@@ -0,0 +1,211 @@
+import numpy as np
+from dask_ml.wrappers import Incremental
+from sklearn.datasets import make_blobs
+
+from incremental_trees.models.classification.streaming_extc import StreamingEXTC
+from tests.integration.base.class_consistency_test_base import ClassConsistencyTestBase
+from tests.integration.base.dask_test_base import DaskTestBase
+from tests.integration.base.fit_test_base import FitTestBase
+from tests.integration.base.partial_fit_test_base import PartialFitTestBase
+
+
+class TestStreamingEXTCWithPartialFitsUnlimitedEstimators(PartialFitTestBase):
+ """
+ Test SEXT with single estimator per chunk with "random forest style" max features. ie, subset.
+
+ No limit on the total number of trees.
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ """Set up model to test."""
+ cls.n_samples = 1000
+ cls.x, cls.y = make_blobs(n_samples=int(2e4), random_state=0, n_features=40, centers=2, cluster_std=100)
+ cls.mod = StreamingEXTC(n_estimators_per_chunk=1, max_n_estimators=np.inf)
+ cls.expected_n_estimators = 100
+
+ super().setUpClass()
+
+
+class TestStreamingEXTCWithPartialFitsLimitedEstimators(PartialFitTestBase):
+ """
+ Test SEXT with single estimator per chunk with "random forest style" max features. ie, subset.
+
+ Total models limited to 39.
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ """Set up model to test."""
+ cls.n_samples = 1000
+ cls.x, cls.y = make_blobs(n_samples=int(2e4), random_state=0, n_features=40, centers=2, cluster_std=100)
+ cls.mod = StreamingEXTC(n_estimators_per_chunk=1, max_n_estimators=39)
+ cls.expected_n_estimators = 39
+
+ super().setUpClass()
+
+
+class TestStreamingEXTCWithPartialFitsMultipleEstimatorsPerChunk(PartialFitTestBase):
+ """
+ Test SEXT with multiple estimators per chunk with "random forest style" max features. ie, subset.
+
+ No limit on total models, 3 estimators per row subset (each with different feature subset)
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ """Set up model to test."""
+ cls.n_samples = 1000
+ cls.x, cls.y = make_blobs(n_samples=int(2e4), random_state=0, n_features=40, centers=2, cluster_std=100)
+ cls.mod = StreamingEXTC(n_estimators_per_chunk=3, n_jobs=-1, max_n_estimators=np.inf)
+ cls.expected_n_estimators = 300
+
+ super().setUpClass()
+
+
+class TestStreamingEXTCWithPartialFitsAllFeatures(PartialFitTestBase):
+ """
+ Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
+
+ No limit on total models, 1 estimator per row subset.
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ """Set up model to test."""
+ cls.n_samples = 1000
+ cls.x, cls.y = make_blobs(n_samples=int(2e4), random_state=0, n_features=40, centers=2, cluster_std=100)
+ cls.mod = StreamingEXTC(n_estimators_per_chunk=1, max_features=cls.x.shape[1], max_n_estimators=np.inf)
+ cls.expected_n_estimators = 100
+
+ super().setUpClass()
+
+
+class TestStreamingEXTCWithPartialFitsMultipleEstimatorsPerChunkAllFeatures(PartialFitTestBase):
+ """
+ Test SEXT with multiple estimators per chunk with "decision tree style" max features. ie, all available to each tree.
+
+ No limit on total models, 3 estimators per row subset.
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ cls.n_samples = 1000
+ cls.x, cls.y = make_blobs(n_samples=int(2e4), random_state=0, n_features=40, centers=2, cluster_std=100)
+
+ cls.mod = StreamingEXTC(
+ n_estimators_per_chunk=3,
+ n_jobs=-1,
+ max_features=cls.x.shape[1],
+ max_n_estimators=np.inf
+ )
+ cls.expected_n_estimators = 300
+
+ super().setUpClass()
+
+
+class TestStreamingEXTCWithFitSingleEstimatorPerChunk(FitTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls.spf_n_fits = 10
+ cls.n_estimators_per_sample = 1
+ cls.mod = StreamingEXTC(
+ verbose=1,
+ n_estimators_per_chunk=cls.n_estimators_per_sample,
+ max_n_estimators=np.inf,
+ dask_feeding=cls.dask_feeding,
+ spf_sample_prop=cls.spf_sample_prop,
+ spf_n_fits=cls.spf_n_fits
+ )
+
+ super().setUpClass()
+
+
+class TestStreamingEXTCWithFitMultipleEstimatorsPerChunk(FitTestBase):
+ @classmethod
+ def setUpClass(cls):
+ """Set up model to test."""
+ cls.spf_n_fits = 10
+ cls.n_estimators_per_sample = 10
+ cls.mod = StreamingEXTC(
+ verbose=1,
+ n_estimators_per_chunk=cls.n_estimators_per_sample,
+ max_n_estimators=np.inf,
+ dask_feeding=cls.dask_feeding,
+ spf_sample_prop=cls.spf_sample_prop,
+ spf_n_fits=cls.spf_n_fits
+ )
+
+ super().setUpClass()
+
+
+class TestStreamingEXTCWithFitAdditionalSteps(FitTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls.spf_n_fits = 20
+ cls.n_estimators_per_sample = 6
+ cls.mod = StreamingEXTC(
+ verbose=1,
+ n_estimators_per_chunk=cls.n_estimators_per_sample,
+ max_n_estimators=np.inf,
+ dask_feeding=cls.dask_feeding,
+ spf_sample_prop=cls.spf_sample_prop,
+ spf_n_fits=cls.spf_n_fits
+ )
+
+ super().setUpClass()
+
+
+class TestDaskEXTCWithDask(DaskTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls._prep_data(cls, reg=False)
+ cls.mod = Incremental(StreamingEXTC(n_estimators_per_chunk=1, max_n_estimators=39, verbose=1))
+ cls.expected_n_estimators = 10
+
+ super().setUpClass()
+
+
+class TestDaskEXTCWithDaskMultipleEstimatorsPerChunk(DaskTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls._prep_data(cls, reg=False)
+ cls.mod = Incremental(StreamingEXTC(n_estimators_per_chunk=2, n_jobs=-1, max_n_estimators=np.inf, verbose=1))
+ cls.expected_n_estimators = 20
+
+ super().setUpClass()
+
+
+class TestDaskEXTCWithDaskManyEstimatorsPerChunk(DaskTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls._prep_data(cls, reg=False)
+ cls.mod = Incremental(StreamingEXTC(n_estimators_per_chunk=20, n_jobs=-1, max_n_estimators=np.inf, verbose=1))
+ cls.expected_n_estimators = 200
+
+ super().setUpClass()
+
+
+class TestDaskEXTCWithDaskAlLFeatures(DaskTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls._prep_data(cls, reg=False)
+ cls.mod = Incremental(
+ StreamingEXTC(
+ n_estimators_per_chunk=1, n_jobs=-1,
+ max_n_estimators=np.inf,
+ max_features=cls.x.shape[1],
+ verbose=1
+ )
+ )
+ cls.expected_n_estimators = 10
+
+ super().setUpClass()
+
+
+class TestInconsistentClassesEXTC(ClassConsistencyTestBase):
+ def setUp(self):
+ self.mod = StreamingEXTC(n_estimators_per_chunk=1, max_n_estimators=np.inf, verbose=2)
+
+
+del FitTestBase, PartialFitTestBase, DaskTestBase, ClassConsistencyTestBase
diff --git a/tests/integration/classification/test_streaming_rfc.py b/tests/integration/classification/test_streaming_rfc.py
new file mode 100644
index 0000000..0421de1
--- /dev/null
+++ b/tests/integration/classification/test_streaming_rfc.py
@@ -0,0 +1,210 @@
+import numpy as np
+from dask_ml.wrappers import Incremental
+from sklearn.datasets import make_blobs
+
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC
+from tests.integration.base.class_consistency_test_base import ClassConsistencyTestBase
+from tests.integration.base.dask_test_base import DaskTestBase
+from tests.integration.base.fit_test_base import FitTestBase
+from tests.integration.base.partial_fit_test_base import PartialFitTestBase
+
+
+class TestStreamingRFCWithPartialFitsUnlimitedEstimators(PartialFitTestBase):
+    """
+    StreamingRFC, one estimator per chunk, max_features left at the model default ("random forest style").
+
+    max_n_estimators is np.inf, so the total number of trees is not limited.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Create the blob data and the model, then hand over to the base class set-up."""
+        cls.n_samples = 1000
+        cls.expected_n_estimators = 100
+        blobs = make_blobs(n_samples=int(2e4), n_features=40, centers=2, cluster_std=100, random_state=0)
+        cls.x, cls.y = blobs
+        cls.mod = StreamingRFC(max_n_estimators=np.inf, n_estimators_per_chunk=1, verbose=1)
+        super().setUpClass()
+
+
+class TestStreamingRFCWithPartialFitsLimitedEstimators(PartialFitTestBase):
+    """
+    StreamingRFC, one estimator per chunk, max_features left at the model default ("random forest style").
+
+    max_n_estimators is 39, which limits the total number of models.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Create the blob data and the model, then hand over to the base class set-up."""
+        cls.n_samples = 1000
+        cls.expected_n_estimators = 39
+        blobs = make_blobs(n_samples=int(2e4), n_features=40, centers=2, cluster_std=100, random_state=0)
+        cls.x, cls.y = blobs
+        cls.mod = StreamingRFC(max_n_estimators=39, n_estimators_per_chunk=1)
+        super().setUpClass()
+
+
+class TestStreamingRFCWithPartialFitsMultipleEstimatorsPerChunk(PartialFitTestBase):
+    """
+    StreamingRFC, three estimators per chunk, max_features left at the model default ("random forest style").
+
+    max_n_estimators is np.inf, so the total number of models is not limited.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Create the blob data and the model, then hand over to the base class set-up."""
+        cls.n_samples = 1000
+        cls.expected_n_estimators = 300
+        blobs = make_blobs(n_samples=int(2e4), n_features=40, centers=2, cluster_std=100, random_state=0)
+        cls.x, cls.y = blobs
+        cls.mod = StreamingRFC(max_n_estimators=np.inf, n_jobs=-1, n_estimators_per_chunk=3)
+        super().setUpClass()
+
+
+class TestStreamingRFCWithPartialFitsAllFeatures(PartialFitTestBase):
+    """
+    StreamingRFC, one estimator per chunk, max_features equal to the column count of x ("decision tree style").
+
+    max_n_estimators is np.inf, so the total number of models is not limited.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Create the blob data and the model, then hand over to the base class set-up."""
+        cls.n_samples = 1000
+        cls.expected_n_estimators = 100
+        cls.x, cls.y = make_blobs(n_samples=int(2e4), n_features=40, centers=2, cluster_std=100, random_state=0)
+        n_features = cls.x.shape[1]
+        cls.mod = StreamingRFC(max_n_estimators=np.inf, max_features=n_features, n_estimators_per_chunk=1)
+        super().setUpClass()
+
+
+class TestStreamingRFCWithPartialFitsMultipleEstimatorsPerChunkAllFeatures(PartialFitTestBase):
+    """
+    StreamingRFC, three estimators per chunk, max_features equal to the column count of x ("decision tree style").
+
+    max_n_estimators is np.inf, so the total number of models is not limited.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Create the blob data and the model, then hand over to the base class set-up."""
+        cls.n_samples = 1000
+        cls.expected_n_estimators = 300
+        cls.x, cls.y = make_blobs(n_samples=int(2e4), n_features=40, centers=2, cluster_std=100, random_state=0)
+        n_features = cls.x.shape[1]
+        cls.mod = StreamingRFC(
+            max_n_estimators=np.inf,
+            max_features=n_features,
+            n_estimators_per_chunk=3,
+            n_jobs=-1,
+        )
+        super().setUpClass()
+
+
+class TestStreamingRFCWithFitSingleEstimatorPerChunk(FitTestBase):
+    @classmethod
+    def setUpClass(cls):
+        """Build a StreamingRFC: spf_n_fits=10, 1 estimator per chunk; dask_feeding/spf_sample_prop come from cls."""
+        cls.spf_n_fits = 10
+        cls.n_estimators_per_sample = 1
+        cls.mod = StreamingRFC(
+            verbose=1,
+            n_estimators_per_chunk=cls.n_estimators_per_sample,
+            max_n_estimators=np.inf,
+            dask_feeding=cls.dask_feeding,
+            spf_sample_prop=cls.spf_sample_prop,
+            spf_n_fits=cls.spf_n_fits
+        )
+        super().setUpClass()
+
+
+class TestStreamingRFCWithFitMultipleEstimatorsPerChunk(FitTestBase):
+    @classmethod
+    def setUpClass(cls):
+        """Build a StreamingRFC: spf_n_fits=10, 10 estimators per chunk; dask_feeding/spf_sample_prop come from cls."""
+        cls.spf_n_fits = 10
+        cls.n_estimators_per_sample = 10
+        cls.mod = StreamingRFC(
+            verbose=1,
+            n_estimators_per_chunk=cls.n_estimators_per_sample,
+            max_n_estimators=np.inf,
+            dask_feeding=cls.dask_feeding,
+            spf_sample_prop=cls.spf_sample_prop,
+            spf_n_fits=cls.spf_n_fits
+        )
+        super().setUpClass()
+
+
+class TestStreamingRFCWithFitAdditionalSteps(FitTestBase):
+    @classmethod
+    def setUpClass(cls):
+        """Build a StreamingRFC: spf_n_fits=20, 6 estimators per chunk; dask_feeding/spf_sample_prop come from cls."""
+        cls.spf_n_fits = 20
+        cls.n_estimators_per_sample = 6
+        cls.mod = StreamingRFC(
+            verbose=1,
+            n_estimators_per_chunk=cls.n_estimators_per_sample,
+            max_n_estimators=np.inf,
+            dask_feeding=cls.dask_feeding,
+            spf_sample_prop=cls.spf_sample_prop,
+            spf_n_fits=cls.spf_n_fits
+        )
+        super().setUpClass()
+
+
+class TestDaskRFCWithDask(DaskTestBase):
+    @classmethod
+    def setUpClass(cls):
+        """Wrap a StreamingRFC (one estimator per chunk, max_n_estimators=39) in dask-ml's Incremental."""
+        cls._prep_data(cls, reg=False)
+        cls.expected_n_estimators = 10
+        cls.mod = Incremental(StreamingRFC(verbose=1, max_n_estimators=39, n_estimators_per_chunk=1))
+        super().setUpClass()
+
+
+class TestDaskRFCWithDaskMultipleEstimatorsPerChunk(DaskTestBase):
+    @classmethod
+    def setUpClass(cls):
+        """Wrap a StreamingRFC (two estimators per chunk, max_n_estimators=np.inf) in dask-ml's Incremental."""
+        cls._prep_data(cls, reg=False)
+        cls.expected_n_estimators = 20
+        cls.mod = Incremental(StreamingRFC(verbose=1, max_n_estimators=np.inf, n_jobs=-1, n_estimators_per_chunk=2))
+        super().setUpClass()
+
+
+class TestDaskRFCWithDaskManyEstimatorsPerChunk(DaskTestBase):
+    @classmethod
+    def setUpClass(cls):
+        """Wrap a StreamingRFC (twenty estimators per chunk, max_n_estimators=np.inf) in dask-ml's Incremental."""
+        cls._prep_data(cls, reg=False)
+        cls.expected_n_estimators = 200
+        cls.mod = Incremental(StreamingRFC(verbose=1, max_n_estimators=np.inf, n_jobs=-1, n_estimators_per_chunk=20))
+        super().setUpClass()
+
+
+class TestDaskRFCWithDaskAlLFeatures(DaskTestBase):
+    @classmethod
+    def setUpClass(cls):
+        """Wrap a StreamingRFC whose max_features equals the column count of cls.x in dask-ml's Incremental."""
+        cls._prep_data(cls, reg=False)
+        wrapped = StreamingRFC(
+            verbose=1,
+            n_jobs=-1,
+            n_estimators_per_chunk=1,
+            max_n_estimators=np.inf,
+            max_features=cls.x.shape[1],
+        )
+        cls.mod = Incremental(wrapped)
+        cls.expected_n_estimators = 10
+        super().setUpClass()
+
+
+class TestInconsistentClassesRFC(ClassConsistencyTestBase):
+    def setUp(self):  # model under test: one estimator per chunk, max_n_estimators=np.inf, verbose=2
+        self.mod = StreamingRFC(verbose=2, max_n_estimators=np.inf, n_estimators_per_chunk=1)
+
+
+del FitTestBase, PartialFitTestBase, DaskTestBase, ClassConsistencyTestBase  # presumably avoids collecting the bases
diff --git a/tests/integration/incremental_trees/test_trees.py b/tests/integration/incremental_trees/test_trees.py
deleted file mode 100644
index ad7c2d9..0000000
--- a/tests/integration/incremental_trees/test_trees.py
+++ /dev/null
@@ -1,897 +0,0 @@
-import unittest
-
-import numpy as np
-import sklearn
-import sklearn.datasets
-
-from incremental_trees.models.classification.streaming_extc import StreamingEXTC
-from incremental_trees.models.regression.streaming_extr import StreamingEXTR
-from incremental_trees.models.regression.streaming_rfr import StreamingRFR
-from incremental_trees.trees import StreamingRFC
-from tests.integration.base import PartialFitTests, FitTests
-
-
-class TestStreamingRFC1(PartialFitTests, unittest.TestCase):
- """
- Test SRFC with single estimator per chunk with "random forest style" max features. ie, subset.
-
- No limit on the total number of trees.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=int(2e4),
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- cls.mod = StreamingRFC(verbose=1,
- n_estimators_per_chunk=1,
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 100
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingRFC2(PartialFitTests, unittest.TestCase):
- """
- Test SRFC with single estimator per chunk with "random forest style" max features. ie, subset.
-
- Total models limited to 39.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=int(2e4),
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- cls.mod = StreamingRFC(n_estimators_per_chunk=1,
- max_n_estimators=39)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 39
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingRFC3(PartialFitTests, unittest.TestCase):
- """
- Test SRFC with multiple estimators per chunk with "random forest style" max features. ie, subset.
-
- No limit on total models, 3 estimators per row subset (each with different feature subset)
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=int(2e4),
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- cls.mod = StreamingRFC(n_estimators_per_chunk=3,
- n_jobs=-1,
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 300
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingRFC4(PartialFitTests, unittest.TestCase):
- """
- Test SRFC with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
-
- No limit on total models, 1 estimators per row subset.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=int(2e4),
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- cls.mod = StreamingRFC(n_estimators_per_chunk=1,
- max_features=cls.x.shape[1],
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 100
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingRFC5(PartialFitTests, unittest.TestCase):
- """
- Test SRFC with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
-
- No limit on total models, 3 estimators per row subset.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=int(2e4),
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- cls.mod = StreamingRFC(n_estimators_per_chunk=3,
- n_jobs=-1,
- max_features=cls.x.shape[1],
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 300
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingRFC6(PartialFitTests, unittest.TestCase):
- """
- Test SRFC with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
-
- No limit on total models, 3 estimators per row subset.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=int(2e4),
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- cls.mod = StreamingRFC(n_estimators_per_chunk=3,
- n_jobs=-1,
- max_features=cls.x.shape[1],
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 300
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingRFC7(FitTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
-
- cls.spf_n_fits = 10
- cls.spf_sample_prop = 0.1
- cls.dask_feeding = False
- cls.n_estimators_per_sample = 1
-
- cls.mod = StreamingRFC(verbose=1,
- n_estimators_per_chunk=cls.n_estimators_per_sample,
- max_n_estimators=np.inf,
- dask_feeding=cls.dask_feeding,
- spf_sample_prop=cls.spf_sample_prop,
- spf_n_fits=cls.spf_n_fits)
-
- super().setUpClass()
-
-
-class TestStreamingRFC8(FitTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.spf_n_fits = 10
- cls.spf_sample_prop = 0.1
- cls.dask_feeding = False
- cls.n_estimators_per_sample = 10
-
- cls.mod = StreamingRFC(verbose=1,
- n_estimators_per_chunk=cls.n_estimators_per_sample,
- max_n_estimators=np.inf,
- dask_feeding=cls.dask_feeding,
- spf_sample_prop=cls.spf_sample_prop,
- spf_n_fits=cls.spf_n_fits)
-
- super().setUpClass()
-
-
-class TestStreamingRFC9(FitTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
-
- cls.spf_n_fits = 20
- cls.spf_sample_prop = 0.1
- cls.dask_feeding = False
- cls.n_estimators_per_sample = 6
-
- cls.mod = StreamingRFC(verbose=1,
- n_estimators_per_chunk=cls.n_estimators_per_sample,
- max_n_estimators=np.inf,
- dask_feeding=cls.dask_feeding,
- spf_sample_prop=cls.spf_sample_prop,
- spf_n_fits=cls.spf_n_fits)
-
- super().setUpClass()
-
-
-class TestStreamingRFR1(PartialFitTests, unittest.TestCase):
- """
- Test SRFC with single estimator per chunk with "random forest style" max features. ie, subset.
-
- No limit on the total number of trees.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_regression(n_samples=int(2e4),
- random_state=0,
- n_features=40)
-
- cls.mod = StreamingRFR(verbose=1,
- n_estimators_per_chunk=1,
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 100
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingRFR2(PartialFitTests, unittest.TestCase):
- """
- Test SRFC with single estimator per chunk with "random forest style" max features. ie, subset.
-
- Total models limited to 39.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_regression(n_samples=int(2e4),
- random_state=0,
- n_features=40)
-
- cls.mod = StreamingRFR(n_estimators_per_chunk=1,
- max_n_estimators=39)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 39
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingRFR3(PartialFitTests, unittest.TestCase):
- """
- Test SRFC with multiple estimators per chunk with "random forest style" max features. ie, subset.
-
- No limit on total models, 3 estimators per row subset (each with different feature subset)
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_regression(n_samples=int(2e4),
- random_state=0,
- n_features=40)
-
- cls.mod = StreamingRFR(n_estimators_per_chunk=3,
- n_jobs=-1,
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 300
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingRFR4(PartialFitTests, unittest.TestCase):
- """
- Test SRFC with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
-
- No limit on total models, 1 estimators per row subset.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_regression(n_samples=int(2e4),
- random_state=0,
- n_features=40)
-
- cls.mod = StreamingRFR(n_estimators_per_chunk=1,
- max_features=cls.x.shape[1],
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 100
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingRFR5(PartialFitTests, unittest.TestCase):
- """
- Test SRFC with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
-
- No limit on total models, 3 estimators per row subset.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_regression(n_samples=int(2e4),
- random_state=0,
- n_features=40)
-
- cls.mod = StreamingRFR(n_estimators_per_chunk=3,
- n_jobs=-1,
- max_features=cls.x.shape[1],
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 300
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingRFR6(PartialFitTests, unittest.TestCase):
- """
- Test SRFC with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
-
- No limit on total models, 3 estimators per row subset.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_regression(n_samples=int(2e4),
- random_state=0,
- n_features=40)
-
- cls.mod = StreamingRFR(n_estimators_per_chunk=3,
- n_jobs=-1,
- max_features=cls.x.shape[1],
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 300
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingRFR7(FitTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
-
- cls.spf_n_fits = 10
- cls.spf_sample_prop = 0.2
- cls.dask_feeding = False
- cls.n_estimators_per_sample = 1
-
- cls.mod = StreamingRFR(verbose=1,
- n_estimators_per_chunk=cls.n_estimators_per_sample,
- max_n_estimators=np.inf,
- dask_feeding=cls.dask_feeding,
- spf_sample_prop=cls.spf_sample_prop,
- spf_n_fits=cls.spf_n_fits)
-
- super().setUpClass()
-
-
-class TestStreamingRFR8(FitTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.spf_n_fits = 10
- cls.spf_sample_prop = 0.2
- cls.dask_feeding = False
- cls.n_estimators_per_sample = 10
-
- cls.mod = StreamingRFR(verbose=1,
- n_estimators_per_chunk=cls.n_estimators_per_sample,
- max_n_estimators=np.inf,
- dask_feeding=cls.dask_feeding,
- spf_sample_prop=cls.spf_sample_prop,
- spf_n_fits=cls.spf_n_fits)
-
- super().setUpClass()
-
-
-class TestStreamingRFR9(FitTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
-
- cls.spf_n_fits = 20
- cls.spf_sample_prop = 0.1
- cls.dask_feeding = False
- cls.n_estimators_per_sample = 6
-
- cls.mod = StreamingRFR(verbose=2,
- n_estimators_per_chunk=cls.n_estimators_per_sample,
- max_n_estimators=np.inf,
- dask_feeding=cls.dask_feeding,
- spf_sample_prop=cls.spf_sample_prop,
- spf_n_fits=cls.spf_n_fits)
-
- super().setUpClass()
-
-
-class TestStreamingEXTC1(PartialFitTests, unittest.TestCase):
- """
- Test SEXT with single estimator per chunk with "random forest style" max features. ie, subset.
-
- No limit on the total number of trees.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=int(2e4),
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- cls.mod = StreamingEXTC(n_estimators_per_chunk=1,
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 100
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingEXTC2(PartialFitTests, unittest.TestCase):
- """
- Test SEXT with single estimator per chunk with "random forest style" max features. ie, subset.
-
- Total models limited to 39.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=int(2e4),
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- cls.mod = StreamingEXTC(n_estimators_per_chunk=1,
- max_n_estimators=39)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 39
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingEXTC3(PartialFitTests, unittest.TestCase):
- """
- Test SEXT with multiple estimators per chunk with "random forest style" max features. ie, subset.
-
- No limit on total models, 3 estimators per row subset (each with different feature subset)
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=int(2e4),
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- cls.mod = StreamingEXTC(n_estimators_per_chunk=3,
- n_jobs=-1,
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 300
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingEXTC4(PartialFitTests, unittest.TestCase):
- """
- Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
-
- No limit on total models, 1 estimators per row subset.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=int(2e4),
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- cls.mod = StreamingEXTC(n_estimators_per_chunk=1,
- max_features=cls.x.shape[1],
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 100
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingEXTC5(PartialFitTests, unittest.TestCase):
- """
- Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
-
- No limit on total models, 3 estimators per row subset.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=int(2e4),
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- cls.mod = StreamingEXTC(n_estimators_per_chunk=3,
- n_jobs=-1,
- max_features=cls.x.shape[1],
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 300
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingEXTC6(PartialFitTests, unittest.TestCase):
- """
- Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
-
- No limit on total models, 3 estimators per row subset.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=int(2e4),
- random_state=0,
- n_features=40,
- centers=2,
- cluster_std=100)
-
- cls.mod = StreamingEXTC(n_estimators_per_chunk=3,
- n_jobs=-1,
- max_features=cls.x.shape[1],
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 300
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingEXTC7(FitTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
-
- cls.spf_n_fits = 10
- cls.spf_sample_prop = 0.1
- cls.dask_feeding = False
- cls.n_estimators_per_sample = 1
-
- cls.mod = StreamingEXTC(verbose=1,
- n_estimators_per_chunk=cls.n_estimators_per_sample,
- max_n_estimators=np.inf,
- dask_feeding=cls.dask_feeding,
- spf_sample_prop=cls.spf_sample_prop,
- spf_n_fits=cls.spf_n_fits)
-
- super().setUpClass()
-
-
-class TestStreamingEXTC8(FitTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.spf_n_fits = 10
- cls.spf_sample_prop = 0.1
- cls.dask_feeding = False
- cls.n_estimators_per_sample = 10
-
- cls.mod = StreamingEXTC(verbose=1,
- n_estimators_per_chunk=cls.n_estimators_per_sample,
- max_n_estimators=np.inf,
- dask_feeding=cls.dask_feeding,
- spf_sample_prop=cls.spf_sample_prop,
- spf_n_fits=cls.spf_n_fits)
-
- super().setUpClass()
-
-
-class TestStreamingEXTC9(FitTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
-
- cls.spf_n_fits = 20
- cls.spf_sample_prop = 0.1
- cls.dask_feeding = False
- cls.n_estimators_per_sample = 6
-
- cls.mod = StreamingEXTC(verbose=1,
- n_estimators_per_chunk=cls.n_estimators_per_sample,
- max_n_estimators=np.inf,
- dask_feeding=cls.dask_feeding,
- spf_sample_prop=cls.spf_sample_prop,
- spf_n_fits=cls.spf_n_fits)
-
- super().setUpClass()
-
-
-class TestStreamingEXTR1(PartialFitTests, unittest.TestCase):
- """
- Test SEXT with single estimator per chunk with "random forest style" max features. ie, subset.
-
- No limit on the total number of trees.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_regression(n_samples=int(2e4),
- random_state=0,
- n_features=40)
-
- cls.mod = StreamingEXTR(n_estimators_per_chunk=1,
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 100
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingEXTR2(PartialFitTests, unittest.TestCase):
- """
- Test SEXT with single estimator per chunk with "random forest style" max features. ie, subset.
-
- Total models limited to 39.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_regression(n_samples=int(2e4),
- random_state=0,
- n_features=400)
-
- cls.mod = StreamingEXTR(n_estimators_per_chunk=1,
- max_n_estimators=39)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 39
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingEXTR3(PartialFitTests, unittest.TestCase):
- """
- Test SEXT with multiple estimators per chunk with "random forest style" max features. ie, subset.
-
- No limit on total models, 3 estimators per row subset (each with different feature subset)
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_regression(n_samples=int(2e4),
- random_state=0,
- n_features=40)
-
- cls.mod = StreamingEXTR(n_estimators_per_chunk=3,
- n_jobs=-1,
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 300
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingEXTR4(PartialFitTests, unittest.TestCase):
- """
- Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
-
- No limit on total models, 1 estimators per row subset.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_regression(n_samples=int(2e4),
- random_state=0,
- n_features=4)
-
- cls.mod = StreamingEXTR(n_estimators_per_chunk=1,
- max_features=cls.x.shape[1],
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 100
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingEXTR5(PartialFitTests, unittest.TestCase):
- """
- Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
-
- No limit on total models, 3 estimators per row subset.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_regression(n_samples=int(2e4),
- random_state=0,
- n_features=40)
-
- cls.mod = StreamingEXTR(n_estimators_per_chunk=3,
- n_jobs=-1,
- max_features=cls.x.shape[1],
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 300
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingEXTR6(PartialFitTests, unittest.TestCase):
- """
- Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
-
- No limit on total models, 3 estimators per row subset.
- """
-
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.n_samples = 1000
- cls.x, cls.y = sklearn.datasets.make_regression(n_samples=int(2e4),
- random_state=0,
- n_features=40)
-
- cls.mod = StreamingEXTR(n_estimators_per_chunk=3,
- n_jobs=-1,
- max_features=cls.x.shape[1],
- max_n_estimators=np.inf)
-
- # Set expected number of estimators
- cls.expected_n_estimators = 300
-
- # Set helper values
- super().setUpClass()
-
-
-class TestStreamingEXTR7(FitTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
-
- cls.spf_n_fits = 10
- cls.spf_sample_prop = 0.1
- cls.dask_feeding = False
- cls.n_estimators_per_sample = 1
-
- cls.mod = StreamingEXTR(verbose=1,
- n_estimators_per_chunk=cls.n_estimators_per_sample,
- max_n_estimators=np.inf,
- dask_feeding=cls.dask_feeding,
- spf_sample_prop=cls.spf_sample_prop,
- spf_n_fits=cls.spf_n_fits)
-
- super().setUpClass()
-
-
-class TestStreamingEXTR8(FitTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls.spf_n_fits = 10
- cls.spf_sample_prop = 0.1
- cls.dask_feeding = False
- cls.n_estimators_per_sample = 10
-
- cls.mod = StreamingEXTR(verbose=1,
- n_estimators_per_chunk=cls.n_estimators_per_sample,
- max_n_estimators=np.inf,
- dask_feeding=cls.dask_feeding,
- spf_sample_prop=cls.spf_sample_prop,
- spf_n_fits=cls.spf_n_fits)
-
- super().setUpClass()
-
-
-class TestStreamingEXTR9(FitTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
-
- cls.spf_n_fits = 20
- cls.spf_sample_prop = 0.1
- cls.dask_feeding = False
- cls.n_estimators_per_sample = 6
-
- cls.mod = StreamingEXTR(verbose=1,
- n_estimators_per_chunk=cls.n_estimators_per_sample,
- max_n_estimators=np.inf,
- dask_feeding=cls.dask_feeding,
- spf_sample_prop=cls.spf_sample_prop,
- spf_n_fits=cls.spf_n_fits)
-
- super().setUpClass()
diff --git a/tests/integration/incremental_trees/test_trees_dask.py b/tests/integration/incremental_trees/test_trees_dask.py
deleted file mode 100644
index 123313e..0000000
--- a/tests/integration/incremental_trees/test_trees_dask.py
+++ /dev/null
@@ -1,294 +0,0 @@
-import unittest
-
-import numpy as np
-from dask_ml.wrappers import Incremental
-
-from incremental_trees.models.classification.streaming_extc import StreamingEXTC
-from incremental_trees.models.regression.streaming_extr import StreamingEXTR
-from incremental_trees.models.regression.streaming_rfr import StreamingRFR
-from incremental_trees.trees import StreamingRFC
-from tests.integration.base import DaskTests
-
-
-class TestDaskModel1(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls)
- cls.mod = Incremental(StreamingEXTC(n_estimators_per_chunk=1,
- max_n_estimators=39,
- verbose=1))
-
- # Set expected number of estimators
- # This should be set manually depending on data.
- cls.expected_n_estimators = 10
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskModel2(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls)
- cls.mod = Incremental(StreamingEXTC(n_estimators_per_chunk=2,
- n_jobs=-1,
- max_n_estimators=np.inf,
- verbose=1))
-
- # Set expected number of estimators
- cls.expected_n_estimators = 20
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskModel3(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls)
- cls.mod = Incremental(StreamingEXTC(n_estimators_per_chunk=20,
- n_jobs=-1,
- max_n_estimators=np.inf,
- verbose=1))
-
- # Set expected number of estimators
- cls.expected_n_estimators = 200
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskModel4(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls)
- cls.mod = Incremental(StreamingEXTC(n_estimators_per_chunk=1,
- n_jobs=-1,
- max_n_estimators=np.inf,
- max_features=cls.x.shape[1],
- verbose=1))
-
- # Set expected number of estimators
- cls.expected_n_estimators = 10
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskModel5(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls,
- reg=True)
- cls.mod = Incremental(StreamingEXTR(n_estimators_per_chunk=1,
- max_n_estimators=39,
- verbose=1))
-
- # Set expected number of estimators
- # This should be set manually depending on data.
- cls.expected_n_estimators = 10
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskModel6(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls,
- reg=True)
- cls.mod = Incremental(StreamingEXTR(n_estimators_per_chunk=2,
- n_jobs=-1,
- max_n_estimators=np.inf,
- verbose=1))
-
- # Set expected number of estimators
- cls.expected_n_estimators = 20
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskModel7(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls,
- reg=True)
- cls.mod = Incremental(StreamingEXTR(n_estimators_per_chunk=4,
- n_jobs=-1,
- max_n_estimators=np.inf,
- verbose=1))
-
- # Set expected number of estimators
- cls.expected_n_estimators = 40
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskModel8(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls,
- reg=True)
- cls.mod = Incremental(StreamingEXTR(n_estimators_per_chunk=1,
- n_jobs=-1,
- max_n_estimators=np.inf,
- max_features=cls.x.shape[1],
- verbose=1))
-
- # Set expected number of estimators
- cls.expected_n_estimators = 10
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskRFC1(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls)
- cls.mod = Incremental(StreamingRFC(n_estimators_per_chunk=1,
- max_n_estimators=39,
- verbose=1))
-
- # Set expected number of estimators
- # This should be set manually depending on data.
- cls.expected_n_estimators = 10
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskRFC2(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls)
- cls.mod = Incremental(StreamingRFC(n_estimators_per_chunk=2,
- n_jobs=-1,
- max_n_estimators=np.inf,
- verbose=1))
-
- # Set expected number of estimators
- cls.expected_n_estimators = 20
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskRFC3(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls)
- cls.mod = Incremental(StreamingRFC(n_estimators_per_chunk=20,
- n_jobs=-1,
- max_n_estimators=np.inf,
- verbose=1))
-
- # Set expected number of estimators
- cls.expected_n_estimators = 200
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskRFC4(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls)
- cls.mod = Incremental(StreamingRFC(n_estimators_per_chunk=1,
- n_jobs=-1,
- max_n_estimators=np.inf,
- max_features=cls.x.shape[1],
- verbose=1))
-
- # Set expected number of estimators
- cls.expected_n_estimators = 10
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskRFR1(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls,
- reg=True)
- cls.mod = Incremental(StreamingRFR(n_estimators_per_chunk=1,
- max_n_estimators=39,
- verbose=1))
-
- # Set expected number of estimators
- # This should be set manually depending on data.
- cls.expected_n_estimators = 10
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskRFR2(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls,
- reg=True)
- cls.mod = Incremental(StreamingRFR(n_estimators_per_chunk=2,
- n_jobs=-1,
- max_n_estimators=np.inf,
- verbose=1))
-
- # Set expected number of estimators
- cls.expected_n_estimators = 20
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskRFR3(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls,
- reg=True)
- cls.mod = Incremental(StreamingRFR(n_estimators_per_chunk=20,
- n_jobs=-1,
- max_n_estimators=np.inf,
- verbose=1))
-
- # Set expected number of estimators
- cls.expected_n_estimators = 200
-
- # Set helper values
- super().setUpClass()
-
-
-class TestDaskRFR4(DaskTests, unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- """Set up model to test."""
- cls = cls._prep_data(cls,
- reg=True)
- cls.mod = Incremental(StreamingRFR(n_estimators_per_chunk=1,
- n_jobs=-1,
- max_n_estimators=np.inf,
- max_features=cls.x.shape[1],
- verbose=1))
-
- # Set expected number of estimators
- cls.expected_n_estimators = 10
-
- # Set helper values
- super().setUpClass()
diff --git a/tests/integration/regression/__init__.py b/tests/integration/regression/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/integration/regression/test_streaming_extr.py b/tests/integration/regression/test_streaming_extr.py
new file mode 100644
index 0000000..31f07f2
--- /dev/null
+++ b/tests/integration/regression/test_streaming_extr.py
@@ -0,0 +1,199 @@
+import numpy as np
+from dask_ml.wrappers import Incremental
+from sklearn import datasets
+
+from incremental_trees.models.regression.streaming_extr import StreamingEXTR
+from tests.integration.base.dask_test_base import DaskTestBase
+from tests.integration.base.fit_test_base import FitTestBase
+from tests.integration.base.partial_fit_test_base import PartialFitTestBase
+
+
+class TestStreamingEXTRWithPartialFitsUnlimitedEstimators(PartialFitTestBase):
+ """
+ Test SEXT with single estimator per chunk with "random forest style" max features. ie, subset.
+
+ No limit on the total number of trees.
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ cls.n_samples = 1000
+ cls.x, cls.y = datasets.make_regression(n_samples=int(2e4), random_state=0, n_features=40)
+ cls.mod = StreamingEXTR(n_estimators_per_chunk=1, max_n_estimators=np.inf)
+ cls.expected_n_estimators = 100
+
+ super().setUpClass()
+
+
+class TestStreamingEXTRWithPartialFitsLimitedEstimators(PartialFitTestBase):
+ """
+ Test SEXT with single estimator per chunk with "random forest style" max features. ie, subset.
+
+ Total models limited to 39.
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ cls.n_samples = 1000
+ cls.x, cls.y = datasets.make_regression(n_samples=int(2e4), random_state=0, n_features=400)
+ cls.mod = StreamingEXTR(n_estimators_per_chunk=1, max_n_estimators=39)
+ cls.expected_n_estimators = 39
+
+ super().setUpClass()
+
+
+class TestStreamingEXTRWithPartialFitsMultipleEstimatorsPerChunk(PartialFitTestBase):
+ """
+ Test SEXT with multiple estimators per chunk with "random forest style" max features. ie, subset.
+
+ No limit on total models, 3 estimators per row subset (each with different feature subset)
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ cls.n_samples = 1000
+ cls.x, cls.y = datasets.make_regression(n_samples=int(2e4), random_state=0, n_features=40)
+ cls.mod = StreamingEXTR(n_estimators_per_chunk=3, n_jobs=-1, max_n_estimators=np.inf)
+ cls.expected_n_estimators = 300
+
+ super().setUpClass()
+
+
+class TestStreamingEXTRWithPartialFitsAllFeatures(PartialFitTestBase):
+ """
+ Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
+
+ No limit on total models, 1 estimator per row subset.
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ cls.n_samples = 1000
+ cls.x, cls.y = datasets.make_regression(n_samples=int(2e4), random_state=0, n_features=4)
+ cls.mod = StreamingEXTR(n_estimators_per_chunk=1, max_features=cls.x.shape[1], max_n_estimators=np.inf)
+ cls.expected_n_estimators = 100
+
+ super().setUpClass()
+
+
+class TestStreamingEXTRWithPartialFitsMultipleEstimatorsPerChunkAllFeatures(PartialFitTestBase):
+ """
+ Test SEXT with multiple estimators per chunk with "decision tree style" max features. ie, all available to each tree.
+
+ No limit on total models, 3 estimators per row subset.
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ cls.n_samples = 1000
+ cls.x, cls.y = datasets.make_regression(n_samples=int(2e4),
+ random_state=0,
+ n_features=40)
+ cls.mod = StreamingEXTR(n_estimators_per_chunk=3,
+ n_jobs=-1,
+ max_features=cls.x.shape[1],
+ max_n_estimators=np.inf)
+ cls.expected_n_estimators = 300
+
+ super().setUpClass()
+
+
+class TestStreamingEXTRWithFitSingleEstimatorPerChunk(FitTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls.spf_n_fits = 10
+ cls.n_estimators_per_sample = 1
+ cls.mod = StreamingEXTR(
+ verbose=1,
+ n_estimators_per_chunk=cls.n_estimators_per_sample,
+ max_n_estimators=np.inf,
+ dask_feeding=cls.dask_feeding,
+ spf_sample_prop=cls.spf_sample_prop,
+ spf_n_fits=cls.spf_n_fits
+ )
+
+ super().setUpClass()
+
+
+class TestStreamingEXTRWithFitMultipleEstimatorsPerChunk(FitTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls.spf_n_fits = 10
+ cls.n_estimators_per_sample = 10
+ cls.mod = StreamingEXTR(
+ verbose=1,
+ n_estimators_per_chunk=cls.n_estimators_per_sample,
+ max_n_estimators=np.inf,
+ dask_feeding=cls.dask_feeding,
+ spf_sample_prop=cls.spf_sample_prop,
+ spf_n_fits=cls.spf_n_fits
+ )
+
+ super().setUpClass()
+
+
+class TestStreamingEXTRWithFitAdditionalSteps(FitTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls.spf_n_fits = 20
+ cls.n_estimators_per_sample = 6
+ cls.mod = StreamingEXTR(
+ verbose=1,
+ n_estimators_per_chunk=cls.n_estimators_per_sample,
+ max_n_estimators=np.inf,
+ dask_feeding=cls.dask_feeding,
+ spf_sample_prop=cls.spf_sample_prop,
+ spf_n_fits=cls.spf_n_fits
+ )
+
+ super().setUpClass()
+
+
+class TestDaskEXTRWithDask(DaskTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls._prep_data(cls, reg=True)
+ cls.mod = Incremental(StreamingEXTR(n_estimators_per_chunk=1, max_n_estimators=39, verbose=1))
+ cls.expected_n_estimators = 10
+
+ super().setUpClass()
+
+
+class TestDaskEXTRWithDaskMultipleEstimatorsPerChunk(DaskTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls._prep_data(cls, reg=True)
+ cls.mod = Incremental(StreamingEXTR(n_estimators_per_chunk=2, n_jobs=-1, max_n_estimators=np.inf, verbose=1))
+ cls.expected_n_estimators = 20
+
+ super().setUpClass()
+
+
+class TestDaskEXTRWithDaskManyEstimatorsPerChunk(DaskTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls._prep_data(cls, reg=True)
+ cls.mod = Incremental(StreamingEXTR(n_estimators_per_chunk=20, n_jobs=-1, max_n_estimators=np.inf, verbose=1))
+ cls.expected_n_estimators = 200
+
+ super().setUpClass()
+
+
+class TestDaskEXTRWithDaskAlLFeatures(DaskTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls._prep_data(cls, reg=True)
+ cls.mod = Incremental(
+ StreamingEXTR(
+ n_estimators_per_chunk=1, n_jobs=-1,
+ max_n_estimators=np.inf,
+ max_features=cls.x.shape[1],
+ verbose=1
+ )
+ )
+ cls.expected_n_estimators = 10
+
+ super().setUpClass()
+
+
+del FitTestBase, PartialFitTestBase, DaskTestBase
diff --git a/tests/integration/regression/test_streaming_rfr.py b/tests/integration/regression/test_streaming_rfr.py
new file mode 100644
index 0000000..05d813e
--- /dev/null
+++ b/tests/integration/regression/test_streaming_rfr.py
@@ -0,0 +1,207 @@
+import numpy as np
+from dask_ml.wrappers import Incremental
+from sklearn import datasets
+
+from incremental_trees.models.regression.streaming_rfr import StreamingRFR
+from tests.integration.base.dask_test_base import DaskTestBase
+from tests.integration.base.fit_test_base import FitTestBase
+from tests.integration.base.partial_fit_test_base import PartialFitTestBase
+
+
+class TestStreamingRFRWithPartialFitsUnlimitedEstimators(PartialFitTestBase):
+ """
+ Test SRFR with single estimator per chunk with "random forest style" max features. ie, subset.
+
+ No limit on the total number of trees.
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ """Set up model to test."""
+ cls.n_samples = 1000
+ cls.x, cls.y = datasets.make_regression(n_samples=int(2e4), random_state=0, n_features=40)
+ cls.mod = StreamingRFR(verbose=1, n_estimators_per_chunk=1, max_n_estimators=np.inf)
+ cls.expected_n_estimators = 100
+
+ super().setUpClass()
+
+
+class TestStreamingRFRWithPartialFitsLimitedEstimators(PartialFitTestBase):
+ """
+ Test SRFR with single estimator per chunk with "random forest style" max features. ie, subset.
+
+ Total models limited to 39.
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ """Set up model to test."""
+ cls.n_samples = 1000
+ cls.x, cls.y = datasets.make_regression(n_samples=int(2e4), random_state=0, n_features=40)
+ cls.mod = StreamingRFR(n_estimators_per_chunk=1, max_n_estimators=39)
+ cls.expected_n_estimators = 39
+
+ super().setUpClass()
+
+
+class TestStreamingRFRWithPartialFitsMultipleEstimatorsPerChunk(PartialFitTestBase):
+ """
+ Test SRFR with multiple estimators per chunk with "random forest style" max features. ie, subset.
+
+ No limit on total models, 3 estimators per row subset (each with different feature subset)
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ """Set up model to test."""
+ cls.n_samples = 1000
+ cls.x, cls.y = datasets.make_regression(n_samples=int(2e4), random_state=0, n_features=40)
+ cls.mod = StreamingRFR(n_estimators_per_chunk=3, n_jobs=-1, max_n_estimators=np.inf)
+ cls.expected_n_estimators = 300
+
+ super().setUpClass()
+
+
+class TestStreamingRFRWithPartialFitsAllFeatures(PartialFitTestBase):
+ """
+ Test SRFR with single estimator per chunk with "decision tree style" max features. ie, all available to each tree.
+
+ No limit on total models, 1 estimator per row subset.
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ """Set up model to test."""
+ cls.n_samples = 1000
+ cls.x, cls.y = datasets.make_regression(n_samples=int(2e4), random_state=0, n_features=40)
+ cls.mod = StreamingRFR(n_estimators_per_chunk=1, max_features=cls.x.shape[1], max_n_estimators=np.inf)
+ cls.expected_n_estimators = 100
+
+ super().setUpClass()
+
+
+class TestStreamingRFRWithPartialFitsMultipleEstimatorsPerChunkAllFeatures(PartialFitTestBase):
+ """
+ Test SRFR with multiple estimators per chunk with "decision tree style" max features. ie, all available to each tree.
+
+ No limit on total models, 3 estimators per row subset.
+ """
+
+ @classmethod
+ def setUpClass(cls):
+ cls.n_samples = 1000
+ cls.x, cls.y = datasets.make_regression(n_samples=int(2e4), random_state=0, n_features=40)
+ cls.mod = StreamingRFR(
+ n_estimators_per_chunk=3,
+ n_jobs=-1,
+ max_features=cls.x.shape[1],
+ max_n_estimators=np.inf
+ )
+ cls.expected_n_estimators = 300
+
+ super().setUpClass()
+
+
+class TestStreamingEXTRWithFitSingleEstimatorPerChunk(FitTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls.spf_n_fits = 10
+ cls.n_estimators_per_sample = 1
+
+ cls.mod = StreamingRFR(
+ verbose=1,
+ n_estimators_per_chunk=cls.n_estimators_per_sample,
+ max_n_estimators=np.inf,
+ dask_feeding=cls.dask_feeding,
+ spf_sample_prop=cls.spf_sample_prop,
+ spf_n_fits=cls.spf_n_fits
+ )
+
+ super().setUpClass()
+
+
+class TestStreamingEXTRWithFitMultipleEstimatorsPerChunk(FitTestBase):
+ @classmethod
+ def setUpClass(cls):
+ """Set up model to test."""
+ cls.spf_n_fits = 10
+ cls.n_estimators_per_sample = 10
+ cls.mod = StreamingRFR(
+ verbose=1,
+ n_estimators_per_chunk=cls.n_estimators_per_sample,
+ max_n_estimators=np.inf,
+ dask_feeding=cls.dask_feeding,
+ spf_sample_prop=cls.spf_sample_prop,
+ spf_n_fits=cls.spf_n_fits
+ )
+
+ super().setUpClass()
+
+
+class TestStreamingEXTRWithFitAdditionalSteps(FitTestBase):
+ @classmethod
+ def setUpClass(cls):
+ """Set up model to test."""
+
+ cls.spf_n_fits = 20
+ cls.n_estimators_per_sample = 6
+ cls.mod = StreamingRFR(
+ verbose=2,
+ n_estimators_per_chunk=cls.n_estimators_per_sample,
+ max_n_estimators=np.inf,
+ dask_feeding=cls.dask_feeding,
+ spf_sample_prop=cls.spf_sample_prop,
+ spf_n_fits=cls.spf_n_fits
+ )
+
+ super().setUpClass()
+
+
+class TestDaskRFRWithDask(DaskTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls._prep_data(cls, reg=True)
+ cls.mod = Incremental(StreamingRFR(n_estimators_per_chunk=1, max_n_estimators=39, verbose=1))
+ cls.expected_n_estimators = 10
+
+ super().setUpClass()
+
+
+class TestDaskRFRWithDaskMultipleEstimatorsPerChunk(DaskTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls._prep_data(cls, reg=True)
+ cls.mod = Incremental(StreamingRFR(n_estimators_per_chunk=2, n_jobs=-1, max_n_estimators=np.inf, verbose=1))
+ cls.expected_n_estimators = 20
+
+ super().setUpClass()
+
+
+class TestDaskRFRWithDaskManyEstimatorsPerChunk(DaskTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls._prep_data(cls, reg=True)
+ cls.mod = Incremental(StreamingRFR(n_estimators_per_chunk=20, n_jobs=-1, max_n_estimators=np.inf, verbose=1))
+ cls.expected_n_estimators = 200
+
+ super().setUpClass()
+
+
+class TestDaskRFRWithDaskAlLFeatures(DaskTestBase):
+ @classmethod
+ def setUpClass(cls):
+ cls._prep_data(cls, reg=True)
+ cls.mod = Incremental(
+ StreamingRFR(
+ n_estimators_per_chunk=1, n_jobs=-1,
+ max_n_estimators=np.inf,
+ max_features=cls.x.shape[1],
+ verbose=1
+ )
+ )
+ cls.expected_n_estimators = 10
+
+ super().setUpClass()
+
+
+del FitTestBase, PartialFitTestBase, DaskTestBase
diff --git a/tests/performance/__init__.py b/tests/performance/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/integration/incremental_trees/test_trees_benchmarks.py b/tests/performance/test_benchmarks.py
similarity index 93%
rename from tests/integration/incremental_trees/test_trees_benchmarks.py
rename to tests/performance/test_benchmarks.py
index 823b559..989dc0f 100644
--- a/tests/integration/incremental_trees/test_trees_benchmarks.py
+++ b/tests/performance/test_benchmarks.py
@@ -2,11 +2,11 @@
import numpy as np
from distributed import LocalCluster, Client
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC
from sklearn import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
-from incremental_trees.trees import StreamingRFC
from tests.common.data_fixture import DataFixture
@@ -42,12 +42,12 @@ def _fit_benchmarks(self):
self.rfc.fit(self.x_train, self.y_train)
self.rfc_once.fit(self.x_train, self.y_train)
- self.log_reg_report, self.log_reg_train_auc, self.log_reg_test_auc = self._mod_report(self,
- mod=self.log_reg)
- self.rfc_report, self.rfc_train_auc, self.rfc_test_auc = self._mod_report(self,
- mod=self.rfc)
- self.rfc_once_report, self.rfc_once_train_auc, self.rfc_once_test_auc = self._mod_report(self,
- mod=self.rfc_once)
+ self.log_reg_report, self.log_reg_train_auc, self.log_reg_test_auc = self._mod_report(self, mod=self.log_reg)
+ self.rfc_report, self.rfc_train_auc, self.rfc_test_auc = self._mod_report(self, mod=self.rfc)
+ self.rfc_once_report, self.rfc_once_train_auc, self.rfc_once_test_auc = self._mod_report(
+ self,
+ mod=self.rfc_once
+ )
return self
@@ -60,14 +60,14 @@ def _assert_same_n_rows(self):
# Will be available in actual test.
n_rows = self.x_train.shape[0]
- self.assertEqual(self.rfc_n_estimators * n_rows,
- (self.srfc_n_estimators_per_chunk *
- self.srfc_n_partial_fit_calls *
- int(n_rows / self.srfc_n_partial_fit_calls)))
+ self.assertEqual(
+ self.rfc_n_estimators * n_rows,
+ (self.srfc_n_estimators_per_chunk *
+ self.srfc_n_partial_fit_calls *
+ int(n_rows / self.srfc_n_partial_fit_calls))
+ )
- def _fit_srfc(self,
- sequential: bool = True,
- n_prop: float = 0.1) -> StreamingRFC:
+ def _fit_srfc(self, sequential: bool = True, n_prop: float = 0.1) -> StreamingRFC:
"""
Fit the streaming RFC. Total number of rows used in training varies depending on sequential.
@@ -123,7 +123,7 @@ def _fit_with_dask(self):
n_workers=2,
threads_per_worker=2,
scheduler_port=8080,
- diagnostics_port=8081) as cluster, Client(cluster) as client:
+ diagnostics_port=8081) as cluster, Client(cluster):
self.srfc_dask.fit(self.x_train, self.y_train)
def _fit_with_spf(self):
diff --git a/tests/integration/incremental_trees/test_trees_grids.py b/tests/performance/test_grids.py
similarity index 93%
rename from tests/integration/incremental_trees/test_trees_grids.py
rename to tests/performance/test_grids.py
index 4886558..9ddb7fc 100644
--- a/tests/integration/incremental_trees/test_trees_grids.py
+++ b/tests/performance/test_grids.py
@@ -1,5 +1,3 @@
-# TODO: These tests aren't finished. Need to generalise, add EXTC, regressors, etc.
-
import unittest
import warnings
@@ -7,18 +5,14 @@
from sklearn.model_selection import RandomizedSearchCV
from incremental_trees.models.classification.streaming_extc import StreamingEXTC
-from incremental_trees.trees import StreamingRFC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC
from tests.common.data_fixture import DataFixture
from tests.common.param_fixtures import RFCGRID, SRFCGRID
class GridBenchmarks:
def test_fit_all(self):
- """
- Fit grids and compare.
-
- TODO: Generalise naming.
- """
+ """Fit grids and compare."""
with warnings.catch_warnings():
warnings.simplefilter('ignore', FutureWarning)
diff --git a/tests/requirements.txt b/tests/requirements.txt
new file mode 100644
index 0000000..e079f8a
--- /dev/null
+++ b/tests/requirements.txt
@@ -0,0 +1 @@
+pytest