πŸ’š:bug::snake: Fix bug for 0 passing predictions #341

Merged · 6 commits · Aug 24, 2023
16 changes: 8 additions & 8 deletions Makefile
Member

Code is fine. Going to need to search and replace mamba/conda in the Docs, though.
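
One hypothetical way to locate the remaining conda references to update — the `docs` path and `.rst` extension are assumptions for illustration, not part of this PR:

```python
from pathlib import Path

# Hypothetical helper: list documentation files that still mention conda,
# so they can be updated to mamba where appropriate.
for path in sorted(Path("docs").rglob("*.rst")):
    text = path.read_text(encoding="utf-8")
    if "conda" in text.lower():
        print(path)
```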

@@ -10,10 +10,10 @@ PYTHON_INTERPRETER = python3
 # This was retrieved from https://drive.google.com/file/d/1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk/view?usp=sharing
 TEST_DATA_FILEID = 1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk
 
-ifeq (,$(shell which conda))
-HAS_CONDA=False
+ifeq (,$(shell which mamba))
+HAS_MAMBA=False
 else
-HAS_CONDA=True
+HAS_MAMBA=True
 endif
 
 #################################################################################
@@ -35,14 +35,14 @@ black:
 
 ## Set up python interpreter environment
 create_environment: autometa-env.yml
-ifeq (True,$(HAS_CONDA))
-	@echo ">>> Detected conda, creating conda environment."
+ifeq (True,$(HAS_MAMBA))
+	@echo ">>> Detected mamba, creating mamba environment."
 ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
-	conda env create --file=autometa-env.yml
+	mamba env create --file=autometa-env.yml
 else
 	@echo "It looks like you are not using python 3. Autometa is only compatible with python 3. Please upgrade."
 endif
-	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
+	@echo ">>> New mamba env created. Activate with:\nsource activate $(PROJECT_NAME)"
 else
 	$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
 	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
@@ -61,7 +61,7 @@ install: setup.py
 
 ## Install dependencies for test environment
 test_environment: tests/environment.yml
-	conda env update -n $(PROJECT_NAME) --file=$<
+	mamba env update -n $(PROJECT_NAME) --file=$<
 
 ## Build docker image from Dockerfile (auto-taggged as jasonkwan/autometa:<current-branch>)
 image: Dockerfile
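
For readers less familiar with the Makefile conditional, `ifeq (,$(shell which mamba))` just checks whether a `mamba` executable is on `PATH`. A rough Python equivalent of that detection (illustrative only, not part of this PR):

```python
import shutil

# Equivalent of the Makefile's `ifeq (,$(shell which mamba))` check:
# shutil.which returns None when no `mamba` executable is found on PATH.
HAS_MAMBA = shutil.which("mamba") is not None
print(f">>> Detected mamba: {HAS_MAMBA}")
```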
4 changes: 1 addition & 3 deletions autometa-env.yml
@@ -10,9 +10,7 @@ dependencies:
   - bowtie2
   - diamond>=2.0
   - gdown
-  - hdbscan
   - hmmer
-  - joblib
   - numba>=0.47
   - numpy>=1.13
   - pandas>=1.1
@@ -25,7 +23,7 @@ dependencies:
   - samtools>=1.11
   - scikit-bio
   - scipy
-  - scikit-learn
+  - scikit-learn>=1.3
   - seqkit
   - tqdm
   - trimap
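
The dependency bump is what allows `hdbscan` and `joblib` to be dropped: scikit-learn ships its own HDBSCAN implementation from version 1.3 onward. A quick sanity check one could run in the updated environment (illustrative, not part of the PR):

```python
# Fails with an ImportError on scikit-learn < 1.3, where HDBSCAN was only
# available from the separate `hdbscan` package.
import sklearn
from sklearn.cluster import HDBSCAN

print(f"scikit-learn {sklearn.__version__} provides {HDBSCAN.__name__}")
```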
28 changes: 7 additions & 21 deletions autometa/binning/recursive_dbscan.py
@@ -16,8 +16,7 @@
 import pandas as pd
 import numpy as np
 
-from sklearn.cluster import DBSCAN
-from hdbscan import HDBSCAN
+from sklearn.cluster import DBSCAN, HDBSCAN
 from numba import config
 
 
@@ -235,8 +234,7 @@ def run_hdbscan(
     df: pd.DataFrame,
     min_cluster_size: int,
     min_samples: int,
-    cache_dir: str = None,
-    core_dist_n_jobs: int = -1,
+    n_jobs: int = -1,
 ) -> pd.DataFrame:
     """Run clustering on `df` at provided `min_cluster_size`.
 
@@ -261,14 +259,9 @@ def run_hdbscan(
         The number of samples in a neighborhood for a point to be
         considered a core point.
 
-    cache_dir : str, optional
-        Used to cache the output of the computation of the tree.
-        By default, no caching is done. If a string is given, it is the
-        path to the caching directory.
-
-    core_dist_n_jobs: int
+    n_jobs: int
         Number of parallel jobs to run in core distance computations.
-        For ``core_dist_n_jobs`` below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
+        For ``n_jobs`` below -1, (n_cpus + 1 + n_jobs) are used.
 
     Returns
     -------
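
As a concrete reading of that ``n_jobs`` convention (illustrative arithmetic, not from the diff):

```python
n_cpus = 8                             # example machine
n_jobs = -2                            # a value below -1
effective_jobs = n_cpus + 1 + n_jobs   # 8 + 1 - 2 = 7 parallel jobs
```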
@@ -304,8 +297,7 @@ def run_hdbscan(
         min_samples=min_samples,
         cluster_selection_method="leaf",
         allow_single_cluster=True,
-        memory=cache_dir,
-        core_dist_n_jobs=core_dist_n_jobs,
+        n_jobs=n_jobs,
     ).fit_predict(features_df.to_numpy())
     clusters = pd.Series(clusters, index=df.index, name="cluster")
     # NOTE: HDBSCAN labels outliers with -1
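
Taken together, the migration swaps the third-party `hdbscan.HDBSCAN` (which took `memory=` for on-disk caching and `core_dist_n_jobs=`) for `sklearn.cluster.HDBSCAN`, which exposes neither and uses the standard `n_jobs` parameter. A minimal standalone sketch of the new call with made-up data — not the Autometa function itself:

```python
import numpy as np
import pandas as pd
from sklearn.cluster import HDBSCAN  # requires scikit-learn >= 1.3

# Toy feature table standing in for Autometa's embedded k-mer features.
rng = np.random.default_rng(42)
features_df = pd.DataFrame(rng.normal(size=(100, 2)), columns=["x_1", "x_2"])

clusters = HDBSCAN(
    min_cluster_size=2,
    min_samples=1,
    cluster_selection_method="leaf",
    allow_single_cluster=True,
    n_jobs=-1,  # replaces core_dist_n_jobs; there is no memory/cache_dir equivalent
).fit_predict(features_df.to_numpy())

# Outliers are labeled -1, mirroring the NOTE in the hunk above.
print(pd.Series(clusters).value_counts().head())
```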
@@ -325,7 +317,7 @@ def recursive_hdbscan(
     verbose: bool = False,
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """Recursively run HDBSCAN starting with defaults and iterating the min_samples
-and min_cluster_size until only 1 cluster is recovered.
+    and min_cluster_size until only 1 cluster is recovered.
 
     Parameters
     ----------
@@ -372,14 +364,12 @@
     n_clusters = float("inf")
     best_median = float("-inf")
     best_df = pd.DataFrame()
-    cache_dir = tempfile.mkdtemp()
     while n_clusters > 1:
         binned_df = run_hdbscan(
             table,
             min_cluster_size=min_cluster_size,
             min_samples=min_samples,
-            cache_dir=cache_dir,
-            core_dist_n_jobs=n_jobs,
+            n_jobs=n_jobs,
         )
         df, metrics_df = add_metrics(df=binned_df, markers_df=markers_df)
         filtered_df = apply_binning_metrics_filter(
@@ -403,8 +393,6 @@
         )
 
         if min_cluster_size >= max_min_cluster_size:
-            shutil.rmtree(cache_dir)
-            cache_dir = tempfile.mkdtemp()
             min_samples += 1
             min_cluster_size = 2
         else:
@@ -416,8 +404,6 @@
         if min_samples >= max_min_samples:
             max_min_cluster_size *= 2
 
-    # clean up cache now that we are out of while loop
-    shutil.rmtree(cache_dir)
     # Check our df is not empty from while loop
     if best_df.empty:
         if verbose:
10 changes: 7 additions & 3 deletions autometa/binning/unclustered_recruitment.py
@@ -407,9 +407,13 @@ def get_confidence_filtered_predictions(
     # Filter predictions by confidence threshold
     confidence_threshold = num_classifications * confidence
     df = df[df.max(axis="columns") >= confidence_threshold]
-    filtered_predictions = df.idxmax(axis="columns")
-    filtered_predictions.name = "cluster"
-    return filtered_predictions.to_frame()
+    if df.empty:
+        filtered_predictions = pd.DataFrame(
+            [], columns=["contig", "cluster"]
+        ).set_index("contig")
+    else:
+        filtered_predictions = df.idxmax(axis="columns").to_frame(name="cluster")
+    return filtered_predictions
 
 
 def filter_contaminating_predictions(
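
This hunk is the fix named in the PR title: when no prediction clears the confidence threshold, `df` is empty, and the new branch returns an explicitly shaped empty frame (a `contig` index and a `cluster` column) instead of relying on `idxmax` over zero rows. A minimal standalone sketch of the guarded behavior, with hypothetical data and a simplified stand-in function rather than the Autometa code itself:

```python
import pandas as pd

def confidence_filtered(df: pd.DataFrame) -> pd.DataFrame:
    """Simplified stand-in for the guarded return shown in the diff above."""
    if df.empty:
        # 0 passing predictions: return an empty frame with the expected shape.
        return pd.DataFrame([], columns=["contig", "cluster"]).set_index("contig")
    return df.idxmax(axis="columns").to_frame(name="cluster")

# Per-cluster probabilities indexed by contig (made-up example data).
probs = pd.DataFrame(
    {"bin_1": [0.9, 0.2], "bin_2": [0.1, 0.8]},
    index=pd.Index(["contig_1", "contig_2"], name="contig"),
)
print(confidence_filtered(probs))            # each contig's best cluster
print(confidence_filtered(probs.iloc[0:0]))  # empty input -> empty, well-formed frame
```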
6 changes: 2 additions & 4 deletions tests/environment.yml
@@ -11,9 +11,7 @@ dependencies:
   - bowtie2
   - diamond>=2.0
   - gdown
-  - hdbscan
   - hmmer
-  - joblib==1.1.0 # See https://stackoverflow.com/a/73830525/12671809
   - numba>=0.47
   - numpy>=1.13
   - pandas>=1.1
@@ -30,8 +28,8 @@ dependencies:
   - rsync
   - samtools>=1.11
   - scikit-bio
-  - scipy==1.8.1 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285
-  - scikit-learn==0.24 # prevent error from joblib in multiprocessing distance calculations
+  - scipy
+  - scikit-learn>=1.3
   - sphinx
   - sphinx_rtd_theme
   - tqdm
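
The test environment gets the same scikit-learn bump and also drops the old `scipy==1.8.1` and `scikit-learn==0.24` pins, whose workaround comments no longer apply. A quick illustrative check that the unpinned stack imports together in a rebuilt test environment (not part of the PR):

```python
# Assumes an environment built from the updated tests/environment.yml;
# skbio is the import name for scikit-bio.
import scipy
import sklearn
import skbio

print("scipy", scipy.__version__)
print("scikit-learn", sklearn.__version__)
print("scikit-bio", skbio.__version__)
```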