diff --git a/Makefile b/Makefile index c082a18d2..71b0112ba 100644 --- a/Makefile +++ b/Makefile @@ -10,10 +10,10 @@ PYTHON_INTERPRETER = python3 # This was retrieved from https://drive.google.com/file/d/1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk/view?usp=sharing TEST_DATA_FILEID = 1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk -ifeq (,$(shell which conda)) -HAS_CONDA=False +ifeq (,$(shell which mamba)) +HAS_MAMBA=False else -HAS_CONDA=True +HAS_MAMBA=True endif ################################################################################# @@ -35,20 +35,18 @@ black: ## Set up python interpreter environment create_environment: autometa-env.yml -ifeq (True,$(HAS_CONDA)) - @echo ">>> Detected conda, creating conda environment." +ifeq (True,$(HAS_MAMBA)) + @echo ">>> Detected mamba, creating mamba environment." ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER))) - conda env create --file=autometa-env.yml + mamba env create --file=autometa-env.yml else @echo "It looks like you are not using python 3. Autometa is only compatible with python 3. Please upgrade." endif - @echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)" + @echo ">>> New mamba env created. Activate with:\nsource activate $(PROJECT_NAME)" else - $(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper - @echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\ - export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n" - @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)" - @echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)" + @echo "Mamba not detected. Please install before proceeding..." + @echo "Mamba docs: https://mamba.readthedocs.io/en/latest/" + exit endif ################################################################################# @@ -61,7 +59,7 @@ install: setup.py ## Install dependencies for test environment test_environment: tests/environment.yml - conda env update -n $(PROJECT_NAME) --file=$< + mamba env update -n $(PROJECT_NAME) --file=$< ## Build docker image from Dockerfile (auto-taggged as jasonkwan/autometa:) image: Dockerfile diff --git a/autometa-env.yml b/autometa-env.yml index 1c93fd218..72b1b4629 100644 --- a/autometa-env.yml +++ b/autometa-env.yml @@ -10,9 +10,7 @@ dependencies: - bowtie2 - diamond>=2.0 - gdown - - hdbscan - hmmer - - joblib - numba>=0.47 - numpy>=1.13 - pandas>=1.1 @@ -25,7 +23,7 @@ dependencies: - samtools>=1.11 - scikit-bio - scipy - - scikit-learn + - scikit-learn>=1.3 - seqkit - tqdm - trimap diff --git a/autometa/binning/recursive_dbscan.py b/autometa/binning/recursive_dbscan.py index 713e08673..35efa2b4f 100644 --- a/autometa/binning/recursive_dbscan.py +++ b/autometa/binning/recursive_dbscan.py @@ -16,8 +16,7 @@ import pandas as pd import numpy as np -from sklearn.cluster import DBSCAN -from hdbscan import HDBSCAN +from sklearn.cluster import DBSCAN, HDBSCAN from numba import config @@ -235,8 +234,7 @@ def run_hdbscan( df: pd.DataFrame, min_cluster_size: int, min_samples: int, - cache_dir: str = None, - core_dist_n_jobs: int = -1, + n_jobs: int = -1, ) -> pd.DataFrame: """Run clustering on `df` at provided `min_cluster_size`. @@ -261,14 +259,9 @@ def run_hdbscan( The number of samples in a neighborhood for a point to be considered a core point. - cache_dir : str, optional - Used to cache the output of the computation of the tree. 
- By default, no caching is done. If a string is given, it is the - path to the caching directory. - - core_dist_n_jobs: int + n_jobs: int Number of parallel jobs to run in core distance computations. - For ``core_dist_n_jobs`` below -1, (n_cpus + 1 + core_dist_n_jobs) are used. + For ``n_jobs`` below -1, (n_cpus + 1 + n_jobs) are used. Returns ------- @@ -304,8 +297,7 @@ def run_hdbscan( min_samples=min_samples, cluster_selection_method="leaf", allow_single_cluster=True, - memory=cache_dir, - core_dist_n_jobs=core_dist_n_jobs, + n_jobs=n_jobs, ).fit_predict(features_df.to_numpy()) clusters = pd.Series(clusters, index=df.index, name="cluster") # NOTE: HDBSCAN labels outliers with -1 @@ -325,7 +317,7 @@ def recursive_hdbscan( verbose: bool = False, ) -> Tuple[pd.DataFrame, pd.DataFrame]: """Recursively run HDBSCAN starting with defaults and iterating the min_samples - and min_cluster_size until only 1 cluster is recovered. + and min_cluster_size until only 1 cluster is recovered. Parameters ---------- @@ -372,14 +364,12 @@ def recursive_hdbscan( n_clusters = float("inf") best_median = float("-inf") best_df = pd.DataFrame() - cache_dir = tempfile.mkdtemp() while n_clusters > 1: binned_df = run_hdbscan( table, min_cluster_size=min_cluster_size, min_samples=min_samples, - cache_dir=cache_dir, - core_dist_n_jobs=n_jobs, + n_jobs=n_jobs, ) df, metrics_df = add_metrics(df=binned_df, markers_df=markers_df) filtered_df = apply_binning_metrics_filter( @@ -403,8 +393,6 @@ def recursive_hdbscan( ) if min_cluster_size >= max_min_cluster_size: - shutil.rmtree(cache_dir) - cache_dir = tempfile.mkdtemp() min_samples += 1 min_cluster_size = 2 else: @@ -416,8 +404,6 @@ def recursive_hdbscan( if min_samples >= max_min_samples: max_min_cluster_size *= 2 - # clean up cache now that we are out of while loop - shutil.rmtree(cache_dir) # Check our df is not empty from while loop if best_df.empty: if verbose: diff --git a/autometa/binning/unclustered_recruitment.py b/autometa/binning/unclustered_recruitment.py index 7b025a980..fa8bf9284 100644 --- a/autometa/binning/unclustered_recruitment.py +++ b/autometa/binning/unclustered_recruitment.py @@ -407,9 +407,13 @@ def get_confidence_filtered_predictions( # Filter predictions by confidence threshold confidence_threshold = num_classifications * confidence df = df[df.max(axis="columns") >= confidence_threshold] - filtered_predictions = df.idxmax(axis="columns") - filtered_predictions.name = "cluster" - return filtered_predictions.to_frame() + if df.empty: + filtered_predictions = pd.DataFrame( + [], columns=["contig", "cluster"] + ).set_index("contig") + else: + filtered_predictions = df.idxmax(axis="columns").to_frame(name="cluster") + return filtered_predictions def filter_contaminating_predictions( diff --git a/docs/source/bash-workflow.rst b/docs/source/bash-workflow.rst index 80b8a7d1d..a17a6b4d6 100644 --- a/docs/source/bash-workflow.rst +++ b/docs/source/bash-workflow.rst @@ -14,17 +14,16 @@ Getting Started Compute Environment Setup ************************* -If you have not previously installed/used Conda, you can get it using the -Miniconda installer appropriate to your system, here: ``_ +If you have not previously installed/used mamba_, you can get it from Mambaforge_. -You may either create a new Conda environment named "autometa"... +You may either create a new mamba environment named "autometa"... .. 
code-block:: bash - conda create -n autometa -c bioconda autometa - # Then, once Conda has finished creating the environment + mamba create -n autometa -c conda-forge -c bioconda autometa + # Then, once mamba has finished creating the environment # you may activate it: - conda activate autometa + mamba activate autometa \.\.\. or install Autometa into any of your existing environments. @@ -32,13 +31,13 @@ This installs Autometa in your current active environment: .. code-block:: bash - conda install -c bioconda autometa + mamba install -c conda-forge -c bioconda autometa The next command installs Autometa in the provided environment: .. code-block:: bash - conda install -n -c bioconda autometa + mamba install -n -c conda-forge -c bioconda autometa Download Workflow Template ************************** @@ -128,7 +127,7 @@ Alignments Preparation .. note:: The following example requires ``bwa``, ``kart`` and ``samtools`` - ``conda install -c bioconda bwa kart samtools`` + ``mamba install -c bioconda bwa kart samtools`` .. code-block:: bash @@ -158,7 +157,7 @@ ORFs **** .. note:: - The following example requires ``prodigal``. e.g. ``conda install -c bioconda prodigal`` + The following example requires ``prodigal``. e.g. ``mamba install -c bioconda prodigal`` .. code-block:: bash @@ -175,7 +174,7 @@ Diamond blastp Preparation ************************** .. note:: - The following example requires ``diamond``. e.g. ``conda install -c bioconda diamond`` + The following example requires ``diamond``. e.g. ``mamba install -c bioconda diamond`` .. code-block:: bash @@ -267,7 +266,7 @@ For example, with slurm: .. caution:: - Make sure your conda autometa environment is activated or the autometa entrypoints will not be available. + Make sure your mamba autometa environment is activated or the autometa entrypoints will not be available. Additional parameters ##################### @@ -323,3 +322,5 @@ See :ref:`advanced-usage-binning` section for details .. _Trimmomatic: http://www.usadellab.org/cms/?page=trimmomatic .. _FastQC: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ .. _metaQuast: http://quast.sourceforge.net/metaquast +.. _Mambaforge: https://github.com/conda-forge/miniforge#mambaforge +.. _mamba: https://mamba.readthedocs.io/en/latest/ diff --git a/docs/source/benchmarking.rst b/docs/source/benchmarking.rst index f58eab3e6..6c7aca511 100644 --- a/docs/source/benchmarking.rst +++ b/docs/source/benchmarking.rst @@ -7,11 +7,11 @@ Benchmarking .. note:: - The most recent Autometa benchmarking results covering multiple modules and input parameters are hosted on our - `KwanLab/metaBenchmarks `_ Github repository and provide a range of + The most recent Autometa benchmarking results covering multiple modules and input parameters are hosted on our + `KwanLab/metaBenchmarks `_ Github repository and provide a range of analyses covering multiple stages and parameter sets. These benchmarks are available with their own respective - modules so that the community may easily assess how Autometa's novel (``taxon-profiling``, ``clustering``, - ``binning``, ``refinement``) algorithms perform compared to current state-of-the-art methods. Tools were selected for + modules so that the community may easily assess how Autometa's novel (``taxon-profiling``, ``clustering``, + ``binning``, ``refinement``) algorithms perform compared to current state-of-the-art methods. Tools were selected for benchmarking based on their relevance to environmental, single-assembly, reference-free binning pipelines. 
Benchmarking with the ``autometa-benchmark`` module @@ -51,7 +51,7 @@ Example benchmarking with simulated communities # Set community size (see above for selection/download of other community types) community_size=78Mbp - + # Inputs ## NOTE: predictions and reference were downloaded using autometa-download-dataset predictions="$HOME/Autometa/autometa/datasets/simulated/${community_size}/taxonomy.tsv.gz" # required columns -> contig, taxid @@ -73,7 +73,7 @@ Example benchmarking with simulated communities --output-classification-reports $reports .. note:: - Using ``--benchmark=classification`` requires the path to a directory containing files (nodes.dmp, names.dmp, merged.dmp) + Using ``--benchmark=classification`` requires the path to a directory containing files (nodes.dmp, names.dmp, merged.dmp) from NCBI's taxdump tarball. This should be supplied using the ``--ncbi`` parameter. Clustering @@ -95,7 +95,7 @@ Example benchmarking with simulated communities # Outputs output_wide="${community_size}.clustering_benchmarks.wide.tsv.gz" output_long="${community_size}.clustering_benchmarks.long.tsv.gz" - + autometa-benchmark \ --benchmark clustering \ --predictions $predictions \ @@ -114,16 +114,16 @@ Example benchmarking with simulated communities # Set community size (see above for selection/download of other community types) community_size=78Mbp - + # Inputs ## NOTE: predictions and reference were downloaded using autometa-download-dataset predictions="$HOME/Autometa/autometa/datasets/simulated/${community_size}/binning.tsv.gz" # required columns -> contig, cluster reference="$HOME/Autometa/autometa/datasets/simulated/${community_size}/reference_assignments.tsv.gz" - + # Outputs output_wide="${community_size}.binning_benchmarks.wide.tsv.gz" output_long="${community_size}.binning_benchmarks.long.tsv.gz" - + autometa-benchmark \ --benchmark binning-classification \ --predictions $predictions \ @@ -172,7 +172,7 @@ Autometa is packaged with a built-in module that allows any user to download any To use retrieve these datasets one simply needs to run the ``autometa-download-dataset`` command. For example, to download the reference assignments for a simulated community as well as the most recent Autometa -binning and taxon-profiling predictions for this community, provide the following parameters: +binning and taxon-profiling predictions for this community, provide the following parameters: .. code:: bash @@ -195,15 +195,15 @@ Using ``gdrive`` You can download the individual assemblies of different datasests with the help of ``gdown`` using command line (This is what ``autometa-download-dataset`` is using behind the scenes). If you have installed ``autometa`` using -``conda`` then ``gdown`` should already be installed. If not, you can install it using -``conda install -c conda-forge gdown`` or ``pip install gdown``. +``mamba`` then ``gdown`` should already be installed. If not, you can install it using +``mamba install -c conda-forge gdown`` or ``pip install gdown``. Example for the 78Mbp simulated community """"""""""""""""""""""""""""""""""""""""" 1. Navigate to the 78Mbp community dataset using the `link `_ mentioned above. -2. Get the file ID by navigating to any of the files and right clicking, then selecting the ``get link`` option. - This will have a ``copy link`` button that you should use. The link for the metagenome assembly +2. Get the file ID by navigating to any of the files and right clicking, then selecting the ``get link`` option. 
+ This will have a ``copy link`` button that you should use. The link for the metagenome assembly (ie. ``metagenome.fna.gz``) should look like this : ``https://drive.google.com/file/d/15CB8rmQaHTGy7gWtZedfBJkrwr51bb2y/view?usp=sharing`` 3. The file ID is within the ``/`` forward slashes between ``file/d/`` and ``/``, e.g: @@ -313,4 +313,4 @@ e.g. ``-l 1250`` would translate to 1250Mbp as the sum of total lengths for all # -s : the standard deviation of DNA/RNA fragment size for paired-end simulations. # -l : the length of reads to be simulated $ coverage = ((250 * reads) / (length * 1000000)) - $ art_illumina -p -ss HS25 -l 125 -f $coverage -o simulated_reads -m 275 -s 90 -i asm_path \ No newline at end of file + $ art_illumina -p -ss HS25 -l 125 -f $coverage -o simulated_reads -m 275 -s 90 -i asm_path diff --git a/docs/source/how-to-contribute.rst b/docs/source/how-to-contribute.rst index a210ccc42..9c5ddcc7a 100644 --- a/docs/source/how-to-contribute.rst +++ b/docs/source/how-to-contribute.rst @@ -16,10 +16,10 @@ Autometa builds documentation using `readthedocs `__. .. code-block:: bash - # Activate your autometa conda environment - conda activate autometa + # Activate your autometa mamba environment + mamba activate autometa # Install dependencies - conda install -n autometa -c conda-forge \ + mamba install -n autometa -c conda-forge \ sphinx sphinx_rtd_theme # List all make options make @@ -38,8 +38,8 @@ You will have to install certain dependencies as well as test data to be able to .. code-block:: bash - # Activate your autometa conda environment - conda activate autometa + # Activate your autometa mamba environment + mamba activate autometa # List all make options make # Install dependencies for test environment diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 498b7087b..c32e3e3fa 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -4,8 +4,8 @@ Installation ============ -Currently Autometa package installation is supported by conda_ and docker_. -For installation using conda, we suggest downloading miniconda_. +Currently Autometa package installation is supported by mamba_, and docker_. +For installation using mamba, download mamba from Mambaforge_. .. attention:: @@ -14,23 +14,74 @@ For installation using conda, we suggest downloading miniconda_. Direct installation (Quickest) ============================== -#. Install miniconda_ +#. Install mamba_ + + .. code-block:: bash + + wget "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" + bash Mambaforge-$(uname)-$(uname -m).sh + + Follow the installation prompts and when you get to this: + + .. code-block:: bash + + Do you wish the installer to initialize Mambaforge + by running conda init? [yes|no] + [no] >>> yes + + This will require restarting the terminal, or resetting + the terminal with the source command + + .. code-block:: bash + + # To resolve the comment: + # ==> For changes to take effect, close and re-open your current shell. <== + # type: + source ~/.bashrc + + .. note:: + + If you already have conda installed, you can install mamba as a drop-in replacement. + + .. code-block:: bash + + conda -n base -c conda-forge mamba -y + + #. Create a new environment with ``autometa`` installed: .. code-block:: bash - conda create -c bioconda -n autometa autometa + mamba create -c conda-forge -c bioconda -n autometa autometa + + .. 
note:: + + You may add the ``bioconda`` and ``conda-forge`` channels to your mamba + config to simplify the command. + + .. code-block:: bash + + mamba config --append channels bioconda + mamba config --append channels conda-forge + + Now mamba will search the ``bioconda`` and ``conda-forge`` + channels alongside the defaults channel. + + .. code-block:: bash + + mamba create -n autometa autometa + #. Activate ``autometa`` environment: .. code-block:: - conda activate autometa + mamba activate autometa Install from source (using make) ================================ -Download and install miniconda_. Now run the following commands: +Download and install mamba_. Now run the following commands: .. code-block:: bash @@ -43,11 +94,11 @@ Download and install miniconda_. Now run the following commands: # Navigate into the cloned repository cd Autometa - # create autometa conda environment + # create autometa mamba environment make create_environment - # activate autometa conda environment - conda activate autometa + # activate autometa mamba environment + mamba activate autometa # install autometa source code in autometa environment make install @@ -59,7 +110,7 @@ Download and install miniconda_. Now run the following commands: Install from source (full commands) =================================== -Download and install miniconda_. Now run the following commands: +Download and install mamba_. Now run the following commands: .. code-block:: bash @@ -73,10 +124,10 @@ Download and install miniconda_. Now run the following commands: cd Autometa # Construct the autometa environment from autometa-env.yml - conda env create --file=autometa-env.yml + mamba env create --file=autometa-env.yml # Activate environment - conda activate autometa + mamba activate autometa # Install the autometa code base from source python -m pip install . --ignore-installed --no-deps -vv @@ -115,8 +166,8 @@ To run the tests, however, you'll first need to install the following packages a .. code-block:: bash - # Activate your autometa conda environment - conda activate autometa + # Activate your autometa mamba environment + mamba activate autometa # List all make options make @@ -141,12 +192,12 @@ You can now run different unit tests using the following commands: make unit_test_wip .. note:: + As a shortcut you can also create the test environment and run **all** the unit tests using ``make unit_test`` command. For more information about the above commands see the :ref:`Contributing Guidelines` page. Additional unit tests are provided in the test directory. These are designed to aid in future development of autometa. -.. _conda: https://docs.conda.io/en/latest/ -.. _miniconda: https://docs.conda.io/en/latest/miniconda.html +.. _mamba: https://mamba.readthedocs.io/en/latest/index.html +.. _Mambaforge: https://github.com/conda-forge/miniforge#mambaforge .. _Docker: https://www.docker.com/ -.. _anaconda: https://www.anaconda.com/ diff --git a/docs/source/nextflow-workflow.rst b/docs/source/nextflow-workflow.rst index ba097318e..ad5790e2f 100644 --- a/docs/source/nextflow-workflow.rst +++ b/docs/source/nextflow-workflow.rst @@ -16,12 +16,12 @@ System Requirements Currently the nextflow pipeline requires Docker 🐳 so it must be installed on your system. If you don't have Docker installed you can install it from `docs.docker.com/get-docker `_. We plan on removing this dependency in future versions, so that other dependency managers -(e.g. Conda, Singularity, etc) can be used. +(e.g. Conda, Mamba, Singularity, etc) can be used. 
Nextflow runs on any Posix compatible system. Detailed system requirements can be found in the `nextflow documentation `_ -Nextflow (required) and nf-core tools (optional but highly recommended) installation will be discussed in :ref:`install-nextflow-nfcore-with-conda`. +Nextflow (required) and nf-core tools (optional but highly recommended) installation will be discussed in :ref:`install-nextflow-nfcore-with-mamba`. Data Preparation ################ @@ -138,7 +138,7 @@ Example ``sample_sheet.csv`` Quick Start ########### -The following is a condensed summary of steps required to get Autometa installed, configured and running. +The following is a condensed summary of steps required to get Autometa installed, configured and running. There are links throughout to the appropriate documentation sections that can provide more detail if required. Installation @@ -146,14 +146,14 @@ Installation For full installation instructions, please see the :ref:`installation-page` section -If you would like to install Autometa via conda (I'd recommend it, its almost foolproof!), -you'll need to first install Miniconda on your system. You can do this in a few easy steps: +If you would like to install Autometa via mamba (I'd recommend it, its almost foolproof!), +you'll need to first download the Mambaforge_ installer on your system. You can do this in a few easy steps: -1. Type in the following and then hit enter. This will download the Miniconda installer to your home directory. +1. Type in the following and then hit enter. This will download the Mambaforge installer to your home directory. .. code-block:: bash - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O $HOME/Miniconda3-latest-Linux-x86_64.sh + wget "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" -O "$HOME/Mambaforge-$(uname)-$(uname -m).sh" .. note:: @@ -163,44 +163,44 @@ you'll need to first install Miniconda on your system. You can do this in a few .. code-block:: bash - bash $HOME/Miniconda3-latest-Linux-x86_64.sh + bash $HOME/Mambaforge-$(uname)-$(uname -m).sh + # On my machine this was /home/sam/Mambaforge-latest-Linux-x86_64.sh 3. Follow all of the prompts. Keep pressing enter until it asks you to accept. Then type yes and enter. Say yes to everything. -.. note:: +.. note:: - If for whatever reason, you accidentally said no to the initialization, do not fear. + If for whatever reason, you accidentally said no to the initialization, do not fear. We can fix this by running the initialization with the following command: .. code-block:: bash - cd $HOME/miniconda3/bin/ - ./conda init - + $HOME/mambaforge/bin/mamba init + -4. Finally, for the changes to take effect, you'll need to run the following line of code which effectively acts as a "refresh" +1. Finally, for the changes to take effect, you'll need to run the following line of code which effectively acts as a "refresh" .. code-block:: bash - - source ~/.bashrc -Now that you have conda up and running, its time to install the Autometa conda environment. Run the following code: + source $HOME/.bashrc + +Now that you have mamba up and running, its time to install the Autometa mamba environment. Run the following code: .. code-block:: bash - conda env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml - + mamba env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml + .. attention:: - You will only need to run the installation (code above) once. 
The installation does NOT need to be performed every time you wish to use Autometa. - Once installation is complete, the conda environment (which holds all the tools that Autometa needs) will live on your server/computer + You will only need to run the installation (code above) once. The installation does NOT need to be performed every time you wish to use Autometa. + Once installation is complete, the mamba environment (which holds all the tools that Autometa needs) will live on your server/computer much like any other program you install. -Anytime you would like to run Autometa, you'll need to activate the conda environment. To activate the environment you'll need to run the following command: +Anytime you would like to run Autometa, you'll need to activate the mamba environment. To activate the environment you'll need to run the following command: .. code-block:: bash - conda activate autometa-nf + mamba activate autometa-nf Configuring a scheduler *********************** @@ -239,13 +239,13 @@ Then copy the following code block into that new file ("agrp" is the slurm parti } } -Keep this file somewhere central to you. For the sake of this example I will be keeping it in a folder called "Useful scripts" in my home directory +Keep this file somewhere central to you. For the sake of this example I will be keeping it in a folder called "Useful scripts" in my home directory because that is a central point for me where I know I can easily find the file and it won't be moved e.g. :code:`/home/sam/Useful_scripts/slurm_nextflow.config` -Save your new file with Ctrl+O and then exit nano with Ctrl+O. +Save your new file with Ctrl+O and then exit nano with Ctrl+O. -Installation and set up is now complete. 🎉 🥳 +Installation and set up is now complete. 🎉 🥳 Running Autometa **************** @@ -253,19 +253,19 @@ Running Autometa For a comprehensive list of features and options and how to use them please see :ref:`Running the pipeline` Autometa can bin one or several metagenomic datasets in one run. Regardless of the number of metagenomes you -want to process, you will need to provide a sample sheet which specifies the name of your sample, the full path to +want to process, you will need to provide a sample sheet which specifies the name of your sample, the full path to where that data is found and how to retrieve the sample's contig coverage information. -If the metagenome was assembled via SPAdes, Autometa can extract coverage and contig length information from the sequence headers. +If the metagenome was assembled via SPAdes, Autometa can extract coverage and contig length information from the sequence headers. -If you used a different assembler you will need to provide either raw reads or a table containing contig/scaffold coverage information. -Full details for data preparation may be found under :ref:`sample-sheet-preparation` +If you used a different assembler you will need to provide either raw reads or a table containing contig/scaffold coverage information. +Full details for data preparation may be found under :ref:`sample-sheet-preparation` -First ensure that your Autometa conda environment is activated. You can activate your environment by running: +First ensure that your Autometa mamba environment is activated. You can activate your environment by running: .. code-block:: bash - - conda activate autometa-nf + + mamba activate autometa-nf Run the following code to launch Autometa: @@ -275,7 +275,7 @@ Run the following code to launch Autometa: .. 
note:: - You may want to note where you have saved your input sample sheet prior to running the launch command. + You may want to note where you have saved your input sample sheet prior to running the launch command. It is much easier (and less error prone) to copy/paste the sample sheet file path when specifying the input (We'll get to this later in :ref:`quickstart-menu-4`). You will now use the arrow keys to move up and down between your options and hit your "Enter" or "Return" key to make your choice. @@ -296,8 +296,8 @@ You will now use the arrow keys to move up and down between your options and hit Choose a version ---------------- -The double, right-handed arrows should already indicate the latest release of Autometa (in our case ``2.0.0``). -The latest version of the tool will always be at the top of the list with older versions descending below. +The double, right-handed arrows should already indicate the latest release of Autometa (in our case ``2.0.0``). +The latest version of the tool will always be at the top of the list with older versions descending below. To select the latest version, ensure that the double, right-handed arrows are next to ``2.0.0``, then hit "Enter". .. image:: ../img/Menu1.png @@ -311,7 +311,7 @@ Pick the ``Command line`` option. .. note:: - Unless you've done some fancy server networking (i.e. tunneling and port-forwarding), + Unless you've done some fancy server networking (i.e. tunneling and port-forwarding), or are using Autometa locally, ``Command line`` is your *only* option. .. image:: ../img/Menu2.png @@ -321,7 +321,7 @@ Pick the ``Command line`` option. General nextflow parameters --------------------------- -If you are using a scheduler (Slurm in this example), ``-profile`` is the only option you'll need to change. +If you are using a scheduler (Slurm in this example), ``-profile`` is the only option you'll need to change. If you are not using a scheduler, you may skip this step. .. image:: ../img/Menu3.png @@ -331,12 +331,12 @@ If you are not using a scheduler, you may skip this step. Input and Output ---------------- -Now we need to give Autometa the full paths to our input sample sheet, output results folder -and output logs folder (aka where trace files are stored). +Now we need to give Autometa the full paths to our input sample sheet, output results folder +and output logs folder (aka where trace files are stored). .. note:: - A new folder, named by its respective sample value, will be created within the output results folder for + A new folder, named by its respective sample value, will be created within the output results folder for each metagenome listed in the sample sheet. .. image:: ../img/Menu4.png @@ -346,14 +346,14 @@ and output logs folder (aka where trace files are stored). Binning parameters ------------------ -If you're not sure what you're doing I would recommend only changing ``length_cutoff``. -The default cutoff is 3000bp, which means that any contigs/scaffolds smaller than 3000bp will not be considered for binning. +If you're not sure what you're doing I would recommend only changing ``length_cutoff``. +The default cutoff is 3000bp, which means that any contigs/scaffolds smaller than 3000bp will not be considered for binning. .. note:: - This cutoff will depend on how good your assembly is: e.g. if your N50 is 1200bp, I would choose a cutoff of 1000. + This cutoff will depend on how good your assembly is: e.g. if your N50 is 1200bp, I would choose a cutoff of 1000. 
If your N50 is more along the lines of 5000, I would leave the cutoff at the default 3000. I would strongly recommend - against choosing a number below 900 here. In the example below, I have chosen a cutoff of 1000bp as my assembly was + against choosing a number below 900 here. In the example below, I have chosen a cutoff of 1000bp as my assembly was not particularly great (the N50 is 1100bp). .. image:: ../img/Menu5.png @@ -363,17 +363,17 @@ The default cutoff is 3000bp, which means that any contigs/scaffolds smaller tha Additional Autometa options --------------------------- -Here you have a choice to make: +Here you have a choice to make: -* By enabling taxonomy aware mode, Autometa will attempt to use taxonomic data to make your bins more accurate. +* By enabling taxonomy aware mode, Autometa will attempt to use taxonomic data to make your bins more accurate. -However, this is a more computationally expensive step and will make the process take longer. +However, this is a more computationally expensive step and will make the process take longer. * By leaving this option as the default ``False`` option, Autometa will bin according to coverage and kmer patterns. Despite your choice, you will need to provide a path to the necessary databases using the ``single_db_dir`` option. -In the example below, I have enabled the taxonomy aware mode and provided the path to where the databases are stored -(in my case this is :code:`/home/sam/Databases`). +In the example below, I have enabled the taxonomy aware mode and provided the path to where the databases are stored +(in my case this is :code:`/home/sam/Databases`). For additional details on required databases, see the :ref:`Databases` section. @@ -384,13 +384,13 @@ For additional details on required databases, see the :ref:`Databases` section. Computational parameters ------------------------ -This will depend on the computational resources you have available. You could start with the default values and see -how the binning goes. If you have particularly complex datasets you may want to bump this up a bit. For your -average metagenome, you won't need more than 150Gb of memory. I've opted to use 75 Gb as a -starting point for a few biocrust (somewhat diverse) metagenomes. +This will depend on the computational resources you have available. You could start with the default values and see +how the binning goes. If you have particularly complex datasets you may want to bump this up a bit. For your +average metagenome, you won't need more than 150Gb of memory. I've opted to use 75 Gb as a +starting point for a few biocrust (somewhat diverse) metagenomes. .. note:: - + These options correspond to the resources provided to *each* process of Autometa, *not* the entire workflow itself. Also, for TB worth of assembled data you may want to try the :ref:`autometa-bash-workflow` using the @@ -409,7 +409,7 @@ to prevent immediately performing the nextflow run command. .. image:: ../img/launch_choice.png -If you recall, we created a file called :code:`slurm_nextflow.config` that contains the information Autometa will need to communicate with the Slurm scheduler. +If you recall, we created a file called :code:`slurm_nextflow.config` that contains the information Autometa will need to communicate with the Slurm scheduler. We need to include that file using the :code:`-c` flag (or configuration flag). Therefore to launch the Autometa workflow, run the following command: .. 
note:: @@ -433,41 +433,40 @@ Basic While the Autometa Nextflow pipeline can be run using Nextflow directly, we designed it using nf-core standards and templating to provide an easier user experience through use of the nf-core "tools" python library. The directions below demonstrate using a minimal -Conda environment to install Nextflow and nf-core tools and then running the Autometa pipeline. +mamba environment to install Nextflow and nf-core tools and then running the Autometa pipeline. -.. _install-nextflow-nfcore-with-conda: +.. _install-nextflow-nfcore-with-mamba: -Installing Nextflow and nf-core tools with Conda +Installing Nextflow and nf-core tools with mamba ************************************************ -If you have not previously installed/used Conda, you can get it using the -Miniconda installer appropriate to your system, here: ``_ +If you have not previously installed/used mamba_, you can get it from Mambaforge_. -After installing conda, running the following command will create a minimal -Conda environment named "autometa-nf", and install Nextflow and nf-core tools. +After installing mamba, running the following command will create a minimal +mamba environment named "autometa-nf", and install Nextflow and nf-core tools. .. code-block:: bash - conda env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml + mamba env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml If you receive the message... .. code-block:: bash - CondaValueError: prefix already exists: + CondaValueError: prefix already exists: /home/user/mambaforge/envs/autometa-nf ...it means you have already created the environment. If you want to overwrite/update the environment then add the :code:`--force` flag to the end of the command. .. code-block:: bash - conda env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml --force + mamba env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml --force -Once Conda has finished creating the environment be sure to activate it: +Once mamba has finished creating the environment be sure to activate it: .. code-block:: bash - conda activate autometa-nf + mamba activate autometa-nf Using nf-core @@ -484,7 +483,7 @@ start the pipeline launch process. nf-core launch KwanLab/Autometa .. caution:: - + nf-core will give a list of revisions to use following the above command. Any of the version 1.* revisions are NOT supported. @@ -543,7 +542,7 @@ The other parameter is a nextflow argument, specified with :code:`-profile`. Thi are able to successfully configure these profiles, please get in touch or submit a pull request and we will add these configurations to the repository. - - :code:`conda`: Enables running all processes using `conda `_ + - :code:`mamba`: Enables running all processes using `mamba `_ - :code:`singularity`: Enables running all processes using `singularity `_ - :code:`podman`: Enables running all processes using `podman `_ - :code:`shifter`: Enables running all processes using `shifter `_ @@ -581,7 +580,7 @@ using the :code:`nextflow run ...` command by prepending the parameter name with You can run the ``KwanLab/Autometa`` project without using nf-core if you already have a correctly formatted parameters file. (like the one generated from ``nf-core launch ...``, i.e. ``nf-params.json``) - + .. 
code-block:: bash nextflow run KwanLab/Autometa -params-file nf-params.json -profile slurm -resume @@ -795,7 +794,7 @@ Visualizing the Workflow ------------------------ You can visualize the entire workflow ie. create the directed acyclic graph (DAG) of processes from the written DOT file. First install -`Graphviz `_ (``conda install -c anaconda graphviz``) then do ``dot -Tpng < pipeline_info/autometa-dot > autometa-dag.png`` to get the +`Graphviz `_ (``mamba install -c anaconda graphviz``) then do ``dot -Tpng < pipeline_info/autometa-dot > autometa-dag.png`` to get the in the ``png`` format. Configuring your process executor @@ -868,3 +867,5 @@ To use this tagged version (or any other Autometa image tag) add the argument `` .. _Trimmomatic: http://www.usadellab.org/cms/?page=trimmomatic .. _FastQC: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ .. _metaQuast: http://quast.sourceforge.net/metaquast +.. _Mambaforge: https://github.com/conda-forge/miniforge#mambaforge +.. _mamba: https://mamba.readthedocs.io/en/latest/ diff --git a/docs/source/step-by-step-tutorial.rst b/docs/source/step-by-step-tutorial.rst index 28728a970..1a2dc76e9 100644 --- a/docs/source/step-by-step-tutorial.rst +++ b/docs/source/step-by-step-tutorial.rst @@ -7,7 +7,7 @@ Here is the step by step tutorial of the entire pipeline. This is helpful in case you have your own files or just want to run a specific step. Before running anything make sure you have activated the conda environment using -``conda activate autometa``. +``mamba activate autometa``. See the :ref:`Autometa Package Installation` page for details on setting up your conda environment. diff --git a/tests/environment.yml b/tests/environment.yml index f140f6d70..13c29b39a 100644 --- a/tests/environment.yml +++ b/tests/environment.yml @@ -11,9 +11,7 @@ dependencies: - bowtie2 - diamond>=2.0 - gdown - - hdbscan - hmmer - - joblib==1.1.0 # See https://stackoverflow.com/a/73830525/12671809 - numba>=0.47 - numpy>=1.13 - pandas>=1.1 @@ -30,8 +28,8 @@ dependencies: - rsync - samtools>=1.11 - scikit-bio - - scipy==1.8.1 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285 - - scikit-learn==0.24 # prevent error from joblib in multiprocessing distance calculations + - scipy + - scikit-learn>=1.3 - sphinx - sphinx_rtd_theme - tqdm
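
The central code change above swaps the standalone ``hdbscan`` package for ``sklearn.cluster.HDBSCAN``, which ships with scikit-learn >= 1.3 — hence the ``scikit-learn>=1.3`` pin and the dropped ``hdbscan``/``joblib`` dependencies in both environment files. Below is a minimal sketch of the new call pattern adopted by ``run_hdbscan``; the feature table, column names, and contig index are illustrative only, not taken from the diff.

.. code-block:: python

    import pandas as pd
    from sklearn.cluster import HDBSCAN  # requires scikit-learn >= 1.3

    # Illustrative 2-D feature table; in the pipeline this would be the
    # embedded k-mer/coverage feature table indexed by contig.
    features_df = pd.DataFrame(
        {
            "x_1": [0.10, 0.12, 0.11, 5.00, 5.10, 5.05],
            "x_2": [1.00, 1.10, 0.90, 9.00, 9.20, 9.10],
        },
        index=[f"contig_{i}" for i in range(6)],
    )

    clusterer = HDBSCAN(
        min_cluster_size=2,               # smallest grouping reported as a cluster
        min_samples=1,                    # neighborhood size for core points
        cluster_selection_method="leaf",
        allow_single_cluster=True,
        n_jobs=-1,                        # replaces core_dist_n_jobs; no cache_dir/memory argument
    )
    labels = clusterer.fit_predict(features_df.to_numpy())
    clusters = pd.Series(labels, index=features_df.index, name="cluster")
    # As with the old package, outliers are labeled -1.
    print(clusters)

Per the updated docstring, ``n_jobs=-1`` uses all CPUs, and values below -1 map to ``n_cpus + 1 + n_jobs`` (e.g. ``n_jobs=-2`` leaves one CPU free).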