Update reqs #13

Merged (7 commits) on Jan 1, 2023
.github/workflows/run_tests.yml (6 changes: 3 additions & 3 deletions)

@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: [3.8, 3.9]
 
     steps:
       - uses: actions/checkout@v2
@@ -23,7 +23,7 @@ jobs:
       - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-         pip install -r requirements.txt
+         pip install .
       - name: Lint with flake8
        run: |
          pip install flake8
@@ -33,5 +33,5 @@ jobs:
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
       - name: Test with pytest
        run: |
-         pip install pytest
+         pip install -r tests/requirements.txt
          pytest
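The install step now runs `pip install .` instead of `pip install -r requirements.txt`, so runtime dependencies presumably move into the package metadata. As a rough sketch of what that metadata might declare, assuming a setuptools layout and taking the pins from the v0.6.0 notes in the README diff below (the actual packaging file is not part of this diff):

````python
# Hypothetical setup.py sketch; this file is not shown in the PR diff.
# Pins are taken from the v0.6.0 release notes; exact specifiers are assumptions.
from setuptools import find_packages, setup

setup(
    name="incremental-trees",  # assumed distribution name
    version="0.6.0",
    packages=find_packages(),
    python_requires=">=3.8",
    install_requires=[
        "scikit-learn==1.2.*",
        "dask==2022.12.*",
        "dask-glm==0.2.0",
        "dask-ml==2022.5.27",
    ],
)
````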
.gitignore (1 change: 1 addition & 0 deletions)

@@ -5,6 +5,7 @@ laptop_env/
 worker*/
 *.dirlock
 *.lock
+notes/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
README.md (26 changes: 12 additions & 14 deletions)

@@ -39,16 +39,14 @@ from sklearn.datasets import make_blobs
 from incremental_trees.models.classification.streaming_rfc import StreamingRFC
 
 # Generate some data in memory
-x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40,
-                  centers=2, cluster_std=100)
+x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40, centers=2, cluster_std=100)
 
 srfc = StreamingRFC(n_estimators_per_chunk=3,
                     max_n_estimators=np.inf,
                     spf_n_fits=30,  # Number of calls to .partial_fit()
                     spf_sample_prop=0.3)  # Proportion of rows to sample on each .partial_fit()
 
-srfc.fit(x, y,
-         sample_weight=np.ones_like(y))  # Optional, gets sampled along with the data
+srfc.fit(x, y, sample_weight=np.ones_like(y))  # Optional, gets sampled along with the data
 
 # Should be n_estimators_per_chunk * spf_n_fits
 print(len(srfc.estimators_))
@@ -96,7 +94,7 @@ For example, this can be used to feed .partial_fit() sequentially (although below…
 ````python
 import numpy as np
 from sklearn.datasets import make_blobs
-from incremental_trees.trees import StreamingRFC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC
 
 srfc = StreamingRFC(n_estimators_per_chunk=20,
                     max_n_estimators=np.inf,
@@ -110,11 +108,11 @@ x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40,
 n_chunks = 30
 chunk_size = int(2e3)
 for i in range(n_chunks):
-   sample_idx = np.random.randint(0, x.shape[0], chunk_size)
-   # Call .partial_fit(), specifying expected classes; also supports other .fit args such as sample_weight
-   srfc.partial_fit(x[sample_idx, :], y[sample_idx],
-                    classes=np.unique(y))
+    sample_idx = np.random.randint(0, x.shape[0], chunk_size)
+    # Call .partial_fit(), specifying expected classes; also supports other .fit args such as sample_weight
+    srfc.partial_fit(x[sample_idx, :], y[sample_idx],
+                     classes=np.unique(y))
 
 # Should be n_chunks * n_estimators_per_chunk
 print(len(srfc.estimators_))
 print(srfc.score(x, y))
@@ -126,17 +124,17 @@ There are a couple of different model setups worth considering. No idea which wo…
 #### "Incremental forest"
 For the number of chunks/fits, sample rows from X, then fit a number of single trees (with different column subsets), e.g.
 ````python
-srfc = StreamingRFC(n_estimators_per_chunk=10,
-                    max_features='sqrt')
+srfc = StreamingRFC(n_estimators_per_chunk=10, max_features='sqrt')
 ````
 #### "Incremental decision trees"
 Single (or few) decision trees per data subset, with all features.
 ````python
-srfc = StreamingRFC(n_estimators_per_chunk=1,
-                    max_features=x.shape[1])
+srfc = StreamingRFC(n_estimators_per_chunk=1, max_features=x.shape[1])
 ````
 
 # Version history
+## v0.6.0
+- Update to work with scikit-learn==1.2, dask==2022.12, dask-glm==0.2.0, dask-ml==2022.5.27. Supports Python 3.8 and 3.9.
 ## v0.5.1
 - Add support for passing fit args/kwargs via `.fit` (specifically, `sample_weight`)
 ## v0.5.0
example_dask.py (40 changes: 0 additions & 40 deletions)

This file was deleted.
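Since the PR removes this example, here is a rough sketch of how dask-fed training could look instead. It assumes dask-ml's `Incremental` wrapper and synthetic data, and is not the deleted file's content:

````python
# Hypothetical sketch, not the contents of the deleted example_dask.py.
# Assumes dask-ml's Incremental wrapper, which calls .partial_fit() once per chunk.
import dask.array as da
from dask_ml.wrappers import Incremental
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

# Lazily-chunked toy data; each chunk feeds one .partial_fit() call.
x = da.random.random((int(2e5), 40), chunks=(int(2e4), 40))
y = da.random.randint(0, 2, size=(int(2e5),), chunks=(int(2e4),))

model = Incremental(StreamingRFC(n_estimators_per_chunk=10))
model.fit(x, y, classes=[0, 1])

print(len(model.estimator_.estimators_))  # Expect n_chunks * n_estimators_per_chunk
````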

example_fit.py (21 changes: 0 additions & 21 deletions)

This file was deleted.

incremental_trees/__init__.py (2 changes: 1 addition & 1 deletion)

@@ -1 +1 @@
-__version__ = '0.5.1'
+__version__ = '0.6.0'
incremental_trees/models/classification/streaming_extc.py (59 changes: 32 additions & 27 deletions)

@@ -1,3 +1,5 @@
+from typing import Optional, Dict, Union
+
 import numpy as np
 from sklearn.ensemble import ExtraTreesClassifier
 from sklearn.tree import ExtraTreeClassifier
@@ -10,52 +12,54 @@ class StreamingEXTC(ClassifierAdditions, ClassifierOverloads, ExtraTreesClassifier):
     """Overload sklearn.ensemble.ExtraTreesClassifier to add partial fit method and new params."""
 
     def __init__(self,
+                 criterion: str = "gini",
+                 max_depth: Optional[int] = None,
+                 min_samples_split: int = 2,
+                 min_samples_leaf: int = 1,
+                 min_weight_fraction_leaf: float = 0.0,
+                 max_features: float = 1.0,
+                 max_leaf_nodes: Optional[int] = None,
+                 min_impurity_decrease: float = 0.0,
+                 bootstrap: bool = False,
+                 oob_score: bool = False,
+                 n_jobs: Optional[int] = None,
+                 random_state: Optional[int] = None,
+                 verbose: int = 0,
+                 warm_start: bool = True,
+                 class_weight: Optional[Union[str, Dict]] = None,
+                 ccp_alpha: float = 0.0,
+                 max_samples: Optional[float] = None,
                  n_estimators_per_chunk: int = 1,
                  n_estimators: bool = None,
-                 max_n_estimators=np.inf,
-                 criterion="gini",
-                 max_depth=None,
-                 min_samples_split=2,
-                 min_samples_leaf=1,
-                 min_weight_fraction_leaf=0.,
-                 max_features="auto",
-                 max_leaf_nodes=None,
-                 min_impurity_decrease=0.,
-                 min_impurity_split=None,
-                 bootstrap=False,
-                 oob_score=False,
-                 n_jobs=None,
-                 random_state=None,
-                 verbose=0,
-                 warm_start=True,
-                 class_weight=None,
+                 max_n_estimators: float = np.inf,
                  dask_feeding: bool = True,
-                 spf_n_fits=100,
-                 spf_sample_prop: float = 0.1):
+                 spf_n_fits: int = 100,
+                 spf_sample_prop: float = 0.1
+                 ):
         super(ExtraTreesClassifier, self).__init__(
-            base_estimator=ExtraTreeClassifier(),
+            estimator=ExtraTreeClassifier(),
             n_estimators=n_estimators_per_chunk,
             estimator_params=("criterion", "max_depth", "min_samples_split",
                               "min_samples_leaf", "min_weight_fraction_leaf",
                               "max_features", "max_leaf_nodes",
-                              "min_impurity_decrease", "min_impurity_split",
-                              "random_state"),
+                              "min_impurity_decrease",
+                              "random_state", "ccp_alpha"),
             bootstrap=bootstrap,
             oob_score=oob_score,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose,
             warm_start=warm_start,
-            class_weight=class_weight)
+            class_weight=class_weight,
+            max_samples=max_samples
+        )
 
-        self._fit_estimators = 0
+        self.max_n_estimators: int = None
+        self._fit_estimators: int = 0
+        self.classes_: np.array = None  # NB: Needs to be array, not list.
+        self.n_classes_: int = None
+
         self.max_n_estimators = max_n_estimators
-        self.n_estimators_per_chunk = n_estimators
+        self.n_estimators_per_chunk = n_estimators_per_chunk
         self.criterion = criterion
         self.max_depth = max_depth
         self.min_samples_split = min_samples_split
@@ -64,7 +68,8 @@ def __init__(self,
         self.max_features = max_features
         self.max_leaf_nodes = max_leaf_nodes
         self.min_impurity_decrease = min_impurity_decrease
-        self.min_impurity_split = min_impurity_split
+        self.ccp_alpha = ccp_alpha
+        self.max_samples = max_samples
 
         # Set additional params.
         self.set_params(n_estimators_per_chunk=n_estimators_per_chunk,
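For completeness, a usage sketch of the updated extra-trees variant, mirroring the README's StreamingRFC example (data and parameter values here are illustrative, not taken from this PR):

````python
# Illustrative sketch only; parameter values are arbitrary.
import numpy as np
from sklearn.datasets import make_blobs
from incremental_trees.models.classification.streaming_extc import StreamingEXTC

x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40, centers=2, cluster_std=100)

sext = StreamingEXTC(n_estimators_per_chunk=3,
                     max_n_estimators=np.inf,
                     spf_n_fits=30,      # Number of calls to .partial_fit()
                     spf_sample_prop=0.3)  # Proportion of rows sampled per fit
sext.fit(x, y)

# Should be n_estimators_per_chunk * spf_n_fits
print(len(sext.estimators_))
````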
incremental_trees/models/classification/streaming_rfc.py (48 changes: 25 additions & 23 deletions)

@@ -1,4 +1,4 @@
-import warnings
+from typing import Optional, Union, Dict, List
 
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier
@@ -15,27 +15,28 @@ class StreamingRFC(ClassifierAdditions, ClassifierOverloads, RandomForestClassifier):
     """
 
     def __init__(self,
-                 bootstrap=True,
-                 class_weight=None,
-                 criterion='gini',
-                 max_depth=None,
-                 max_features='auto',
-                 max_leaf_nodes=None,
-                 min_impurity_decrease=0.0,
-                 min_impurity_split=None,
-                 min_samples_leaf=1,
-                 min_samples_split=2,
-                 min_weight_fraction_leaf=0.0,
-                 n_estimators_per_chunk: int = 1,
-                 n_jobs=None,
-                 oob_score=False,
-                 random_state=None,
-                 verbose=0,
+                 criterion: str = 'gini',
+                 max_depth: Optional[int] = None,
+                 min_samples_split: int = 2,
+                 min_samples_leaf: int = 1,
+                 min_weight_fraction_leaf: float = 0.0,
+                 max_features: Optional[str] = 'sqrt',
+                 max_leaf_nodes: Optional[int] = None,
+                 min_impurity_decrease: float = 0.0,
+                 bootstrap: bool = True,
+                 oob_score: bool = False,
+                 n_jobs: Optional[int] = None,
+                 random_state: Optional[int] = None,
+                 verbose: int = 0,
                  warm_start: bool = True,
+                 class_weight: Optional[Union[str, Dict, List[Dict]]] = None,
+                 ccp_alpha: float = 0.0,
+                 max_samples: Optional[int] = None,
                  dask_feeding: bool = True,
-                 max_n_estimators=10,
-                 spf_n_fits=100,
-                 spf_sample_prop=0.1) -> None:
+                 n_estimators_per_chunk: int = 1,
+                 max_n_estimators: int = 10,
+                 spf_n_fits: int = 100,
+                 spf_sample_prop: float = 0.1) -> None:
         """
         :param bootstrap:
         :param class_weight:
@@ -44,7 +45,6 @@ def __init__(self,
         :param max_features:
         :param max_leaf_nodes:
         :param min_impurity_decrease:
-        :param min_impurity_split:
         :param min_samples_leaf:
         :param min_samples_split:
         :param min_weight_fraction_leaf:
@@ -73,7 +73,6 @@ def __init__(self,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
-           min_impurity_split=min_impurity_split,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
@@ -89,4 +88,7 @@ def __init__(self,
            max_n_estimators=max_n_estimators,
            verb=0,
            spf_n_fits=spf_n_fits,
-           spf_sample_prop=spf_sample_prop)
+           spf_sample_prop=spf_sample_prop,
+           ccp_alpha=ccp_alpha,
+           max_samples=max_samples
+           )
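The two newly forwarded parameters can be exercised as below. Values are illustrative, and `ccp_alpha`/`max_samples` are assumed to behave as in scikit-learn's RandomForestClassifier, since the diff simply passes them through to the base estimator:

````python
# Illustrative sketch; parameter values are arbitrary, not from this PR.
import numpy as np
from sklearn.datasets import make_blobs
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

x, y = make_blobs(n_samples=int(1e4), random_state=0, n_features=40, centers=2, cluster_std=100)

# ccp_alpha prunes each tree after fitting; max_samples caps the bootstrap
# sample drawn per tree (bootstrap=True is the default for this class).
srfc = StreamingRFC(n_estimators_per_chunk=5,
                    max_n_estimators=np.inf,
                    spf_n_fits=10,
                    ccp_alpha=0.001,
                    max_samples=1000)
srfc.fit(x, y)

print(len(srfc.estimators_))  # Expect n_estimators_per_chunk * spf_n_fits
````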