πŸ’š:bug::snake: Fix bug for 0 passing predictions #341

Merged · 6 commits · Aug 24, 2023
16 changes: 8 additions & 8 deletions Makefile
Member

Code is fine. Going to need to search and replace mamba/conda in the Docs, though.
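
One hypothetical way to locate the remaining conda references to update — the `docs` path and `.rst` extension are assumptions for illustration, not part of this PR:

```python
from pathlib import Path

# Hypothetical helper: list documentation files that still mention conda,
# so they can be updated to mamba where appropriate.
for path in sorted(Path("docs").rglob("*.rst")):
    text = path.read_text(encoding="utf-8")
    if "conda" in text.lower():
        print(path)
```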

@@ -10,10 +10,10 @@ PYTHON_INTERPRETER = python3
 # This was retrieved from https://drive.google.com/file/d/1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk/view?usp=sharing
 TEST_DATA_FILEID = 1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk
 
-ifeq (,$(shell which conda))
-HAS_CONDA=False
+ifeq (,$(shell which mamba))
+HAS_MAMBA=False
 else
-HAS_CONDA=True
+HAS_MAMBA=True
 endif
 
 #################################################################################
@@ -35,14 +35,14 @@ black:
 
 ## Set up python interpreter environment
 create_environment: autometa-env.yml
-ifeq (True,$(HAS_CONDA))
-	@echo ">>> Detected conda, creating conda environment."
+ifeq (True,$(HAS_MAMBA))
+	@echo ">>> Detected mamba, creating mamba environment."
 ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
-	conda env create --file=autometa-env.yml
+	mamba env create --file=autometa-env.yml
 else
 	@echo "It looks like you are not using python 3. Autometa is only compatible with python 3. Please upgrade."
 endif
-	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
+	@echo ">>> New mamba env created. Activate with:\nsource activate $(PROJECT_NAME)"
 else
 	$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
 	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
@@ -61,7 +61,7 @@ install: setup.py
 
 ## Install dependencies for test environment
 test_environment: tests/environment.yml
-	conda env update -n $(PROJECT_NAME) --file=$<
+	mamba env update -n $(PROJECT_NAME) --file=$<
 
 ## Build docker image from Dockerfile (auto-taggged as jasonkwan/autometa:<current-branch>)
 image: Dockerfile
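
For readers less familiar with the Makefile conditional, `ifeq (,$(shell which mamba))` just checks whether a `mamba` executable is on `PATH`. A rough Python equivalent of that detection (illustrative only, not part of this PR):

```python
import shutil

# Equivalent of the Makefile's `ifeq (,$(shell which mamba))` check:
# shutil.which returns None when no `mamba` executable is found on PATH.
HAS_MAMBA = shutil.which("mamba") is not None
print(f">>> Detected mamba: {HAS_MAMBA}")
```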
4 changes: 1 addition & 3 deletions autometa-env.yml
@@ -10,9 +10,7 @@ dependencies:
   - bowtie2
   - diamond>=2.0
   - gdown
-  - hdbscan
   - hmmer
-  - joblib
   - numba>=0.47
   - numpy>=1.13
   - pandas>=1.1
@@ -25,7 +23,7 @@ dependencies:
   - samtools>=1.11
   - scikit-bio
   - scipy
-  - scikit-learn
+  - scikit-learn>=1.3
   - seqkit
   - tqdm
   - trimap
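
The dependency bump is what allows `hdbscan` and `joblib` to be dropped: scikit-learn ships its own HDBSCAN implementation from version 1.3 onward. A quick sanity check one could run in the updated environment (illustrative, not part of the PR):

```python
# Fails with an ImportError on scikit-learn < 1.3, where HDBSCAN was only
# available from the separate `hdbscan` package.
import sklearn
from sklearn.cluster import HDBSCAN

print(f"scikit-learn {sklearn.__version__} provides {HDBSCAN.__name__}")
```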
28 changes: 7 additions & 21 deletions autometa/binning/recursive_dbscan.py
@@ -16,8 +16,7 @@
 import pandas as pd
 import numpy as np
 
-from sklearn.cluster import DBSCAN
-from hdbscan import HDBSCAN
+from sklearn.cluster import DBSCAN, HDBSCAN
 from numba import config
 
 
@@ -235,8 +234,7 @@ def run_hdbscan(
     df: pd.DataFrame,
     min_cluster_size: int,
     min_samples: int,
-    cache_dir: str = None,
-    core_dist_n_jobs: int = -1,
+    n_jobs: int = -1,
 ) -> pd.DataFrame:
     """Run clustering on `df` at provided `min_cluster_size`.
 
@@ -261,14 +259,9 @@ def run_hdbscan(
         The number of samples in a neighborhood for a point to be
         considered a core point.
 
-    cache_dir : str, optional
-        Used to cache the output of the computation of the tree.
-        By default, no caching is done. If a string is given, it is the
-        path to the caching directory.
-
-    core_dist_n_jobs: int
+    n_jobs: int
         Number of parallel jobs to run in core distance computations.
-        For ``core_dist_n_jobs`` below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
+        For ``n_jobs`` below -1, (n_cpus + 1 + n_jobs) are used.
 
     Returns
     -------
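
As a concrete reading of that ``n_jobs`` convention (illustrative arithmetic, not from the diff):

```python
n_cpus = 8                             # example machine
n_jobs = -2                            # a value below -1
effective_jobs = n_cpus + 1 + n_jobs   # 8 + 1 - 2 = 7 parallel jobs
```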
@@ -304,8 +297,7 @@ def run_hdbscan(
         min_samples=min_samples,
         cluster_selection_method="leaf",
         allow_single_cluster=True,
-        memory=cache_dir,
-        core_dist_n_jobs=core_dist_n_jobs,
+        n_jobs=n_jobs,
     ).fit_predict(features_df.to_numpy())
     clusters = pd.Series(clusters, index=df.index, name="cluster")
     # NOTE: HDBSCAN labels outliers with -1
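
Taken together, the migration swaps the third-party `hdbscan.HDBSCAN` (which took `memory=` for on-disk caching and `core_dist_n_jobs=`) for `sklearn.cluster.HDBSCAN`, which exposes neither and uses the standard `n_jobs` parameter. A minimal standalone sketch of the new call with made-up data — not the Autometa function itself:

```python
import numpy as np
import pandas as pd
from sklearn.cluster import HDBSCAN  # requires scikit-learn >= 1.3

# Toy feature table standing in for Autometa's embedded k-mer features.
rng = np.random.default_rng(42)
features_df = pd.DataFrame(rng.normal(size=(100, 2)), columns=["x_1", "x_2"])

clusters = HDBSCAN(
    min_cluster_size=2,
    min_samples=1,
    cluster_selection_method="leaf",
    allow_single_cluster=True,
    n_jobs=-1,  # replaces core_dist_n_jobs; there is no memory/cache_dir equivalent
).fit_predict(features_df.to_numpy())

# Outliers are labeled -1, mirroring the NOTE in the hunk above.
print(pd.Series(clusters).value_counts().head())
```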
@@ -325,7 +317,7 @@ def recursive_hdbscan(
     verbose: bool = False,
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """Recursively run HDBSCAN starting with defaults and iterating the min_samples
-and min_cluster_size until only 1 cluster is recovered.
+    and min_cluster_size until only 1 cluster is recovered.
 
     Parameters
     ----------
@@ -372,14 +364,12 @@
     n_clusters = float("inf")
     best_median = float("-inf")
     best_df = pd.DataFrame()
-    cache_dir = tempfile.mkdtemp()
     while n_clusters > 1:
         binned_df = run_hdbscan(
             table,
             min_cluster_size=min_cluster_size,
             min_samples=min_samples,
-            cache_dir=cache_dir,
-            core_dist_n_jobs=n_jobs,
+            n_jobs=n_jobs,
         )
         df, metrics_df = add_metrics(df=binned_df, markers_df=markers_df)
         filtered_df = apply_binning_metrics_filter(
@@ -403,8 +393,6 @@
         )
 
         if min_cluster_size >= max_min_cluster_size:
-            shutil.rmtree(cache_dir)
-            cache_dir = tempfile.mkdtemp()
             min_samples += 1
             min_cluster_size = 2
         else:
@@ -416,8 +404,6 @@
         if min_samples >= max_min_samples:
             max_min_cluster_size *= 2
 
-    # clean up cache now that we are out of while loop
-    shutil.rmtree(cache_dir)
     # Check our df is not empty from while loop
     if best_df.empty:
         if verbose:
10 changes: 7 additions & 3 deletions autometa/binning/unclustered_recruitment.py
@@ -407,9 +407,13 @@ def get_confidence_filtered_predictions(
     # Filter predictions by confidence threshold
     confidence_threshold = num_classifications * confidence
     df = df[df.max(axis="columns") >= confidence_threshold]
-    filtered_predictions = df.idxmax(axis="columns")
-    filtered_predictions.name = "cluster"
-    return filtered_predictions.to_frame()
+    if df.empty:
+        filtered_predictions = pd.DataFrame(
+            [], columns=["contig", "cluster"]
+        ).set_index("contig")
+    else:
+        filtered_predictions = df.idxmax(axis="columns").to_frame(name="cluster")
+    return filtered_predictions
 
 
 def filter_contaminating_predictions(
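
This hunk is the fix named in the PR title: when no prediction clears the confidence threshold, `df` is empty, and the new branch returns an explicitly shaped empty frame (a `contig` index and a `cluster` column) instead of relying on `idxmax` over zero rows. A minimal standalone sketch of the guarded behavior, with hypothetical data and a simplified stand-in function rather than the Autometa code itself:

```python
import pandas as pd

def confidence_filtered(df: pd.DataFrame) -> pd.DataFrame:
    """Simplified stand-in for the guarded return shown in the diff above."""
    if df.empty:
        # 0 passing predictions: return an empty frame with the expected shape.
        return pd.DataFrame([], columns=["contig", "cluster"]).set_index("contig")
    return df.idxmax(axis="columns").to_frame(name="cluster")

# Per-cluster probabilities indexed by contig (made-up example data).
probs = pd.DataFrame(
    {"bin_1": [0.9, 0.2], "bin_2": [0.1, 0.8]},
    index=pd.Index(["contig_1", "contig_2"], name="contig"),
)
print(confidence_filtered(probs))            # each contig's best cluster
print(confidence_filtered(probs.iloc[0:0]))  # empty input -> empty, well-formed frame
```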
6 changes: 2 additions & 4 deletions tests/environment.yml
@@ -11,9 +11,7 @@ dependencies:
   - bowtie2
   - diamond>=2.0
   - gdown
-  - hdbscan
   - hmmer
-  - joblib==1.1.0 # See https://stackoverflow.com/a/73830525/12671809
   - numba>=0.47
   - numpy>=1.13
   - pandas>=1.1
@@ -30,8 +28,8 @@ dependencies:
   - rsync
   - samtools>=1.11
   - scikit-bio
-  - scipy==1.8.1 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285
-  - scikit-learn==0.24 # prevent error from joblib in multiprocessing distance calculations
+  - scipy
+  - scikit-learn>=1.3
   - sphinx
   - sphinx_rtd_theme
   - tqdm
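
The test environment gets the same scikit-learn bump and also drops the old `scipy==1.8.1` and `scikit-learn==0.24` pins, whose workaround comments no longer apply. A quick illustrative check that the unpinned stack imports together in a rebuilt test environment (not part of the PR):

```python
# Assumes an environment built from the updated tests/environment.yml;
# skbio is the import name for scikit-bio.
import scipy
import sklearn
import skbio

print("scipy", scipy.__version__)
print("scikit-learn", sklearn.__version__)
print("scikit-bio", skbio.__version__)
```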