diff --git a/config/containers.config b/config/containers.config index aafd69c..9483bef 100644 --- a/config/containers.config +++ b/config/containers.config @@ -18,4 +18,6 @@ params{ // seurat-conversion module seurat_conversion_container = 'public.ecr.aws/openscpca/seurat-conversion:v0.2.0' + // cell-type-consensus module + consensus_cell_type_container = 'public.ecr.aws/openscpca/cell-type-consensus:latest' } diff --git a/config/module_params.config b/config/module_params.config new file mode 100644 index 0000000..729943c --- /dev/null +++ b/config/module_params.config @@ -0,0 +1,14 @@ +// Module specific parameters are defined here +params{ + + // merge sce parameters + merge_reuse = false + merge_max_libraries = 75 // maximum number of libraries to merge (current number is a guess, based on 59 working, but 104 not) + merge_hvg = 2000 // number of HVGs to select + + + // cell type consensus + cell_type_panglao_ref_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv' + cell_type_consensus_ref_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv' + +} diff --git a/main.nf b/main.nf index ca58951..a408c0d 100644 --- a/main.nf +++ b/main.nf @@ -6,6 +6,7 @@ include { simulate_sce } from './modules/simulate-sce' include { merge_sce } from './modules/merge-sce' include { detect_doublets } from './modules/doublet-detection' include { seurat_conversion } from './modules/seurat-conversion' +include { cell_type_consensus } from './modules/cell-type-consensus' // **** Parameter checks **** param_error = false @@ -57,4 +58,7 @@ workflow { // Run the seurat conversion workflow seurat_conversion(sample_ch) + + // Run the consensus cell type workflow + cell_type_consensus(sample_ch) } diff --git a/modules/cell-type-consensus/README.md b/modules/cell-type-consensus/README.md new file mode 100644 index 0000000..5ba49a3 --- /dev/null +++ b/modules/cell-type-consensus/README.md @@ -0,0 +1,13 @@ +This module assigns a consensus cell type based on cell types assigned by `SingleR` and `CellAssign`. + +Scripts are derived from the the `cell-type-consensus` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository. + +Links to specific original scripts used in this module: + +- `save-coldata.R`: +- `combine-celltype-tables.R`: + +This module also uses the following reference files found in the `OpenScPCA-analysis` repository: + +- `panglao-cell-type-ontologies.tsv`: +- `consensus-cell-type-reference.tsv`: diff --git a/modules/cell-type-consensus/main.nf b/modules/cell-type-consensus/main.nf new file mode 100644 index 0000000..3b89875 --- /dev/null +++ b/modules/cell-type-consensus/main.nf @@ -0,0 +1,101 @@ +#!/usr/bin/env nextflow + +// Workflow to assign consensus cell type labels + +process save_celltypes { + container params.consensus_cell_type_container + tag "${sample_id}" + input: + tuple val(sample_id), + val(project_id), + path(library_files) + output: + tuple val(sample_id), + val(project_id), + path(output_files) + script: + output_files = library_files + .collect{ + it.name.replaceAll(/(?i).rds$/, "_original-cell-types.tsv") + } + """ + for file in ${library_files}; do + save-coldata.R \ + --input_sce_file \$file \ + --output_file \$(basename \${file%.rds}_original-cell-types.tsv) + done + """ + + stub: + output_files = library_files + .collect{ + it.name.replaceAll(/(?i).rds$/, "_original-cell-types.tsv") + } + """ + for file in ${library_files}; do + touch \$(basename \${file%.rds}_original-cell-types.tsv) + done + """ +} + +process assign_consensus { + container params.consensus_cell_type_container + tag "${project_id}" + label 'mem_8' + publishDir "${params.results_bucket}/${params.release_prefix}/cell-type-consensus", mode: 'copy' + input: + tuple val(project_id), + path(cell_type_files) + path panglao_ref + path consensus_ref + output: + tuple val(project_id), + path(consensus_output_file) + script: + input_files = cell_type_files.join(',') + consensus_output_file = "${project_id}_consensus-cell-types.tsv.gz" + """ + combine-celltype-tables.R \ + --input_tsv_files ${input_files} \ + --panglao_ref_file ${panglao_ref} \ + --consensus_ref_file ${consensus_ref} \ + --output_file ${consensus_output_file} + """ + + stub: + input_files = cell_type_files.join(',') + consensus_output_file = "${project_id}_consensus-cell-types.tsv.gz" + """ + touch ${consensus_output_file} + """ +} + + + +workflow cell_type_consensus { + take: + sample_ch // [sample_id, project_id, sample_path] + main: + // create [sample_id, project_id, [list of processed files]] + libraries_ch = sample_ch + .map{sample_id, project_id, sample_path -> + def library_files = Utils.getLibraryFiles(sample_path, format: "sce", process_level: "processed") + return [sample_id, project_id, library_files] + } + + // save cell type information for each library + save_celltypes(libraries_ch) + + cell_type_files_ch = save_celltypes.out + .groupTuple(by: 1) // group by project id + .map{sample_ids, project_id, celltype_files -> tuple( + project_id, + celltype_files.flatten() // get rid of nested tuple that occurs when more than one library maps to a sample + )} + + // assign consensus cell types by project + assign_consensus(cell_type_files_ch, file(params.cell_type_panglao_ref_file), file(params.cell_type_consensus_ref_file)) + + emit: + assign_consensus.out // [project_id, consensus_output_file] +} diff --git a/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R b/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R new file mode 100755 index 0000000..dad6e86 --- /dev/null +++ b/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R @@ -0,0 +1,116 @@ +#!/usr/bin/env Rscript + +# This script is used to combine all TSV files containing cell types into a single TSV file +# The output TSV file will include the following added columns: +# panglao_ontology: CL term assigned to panglao term +# panglao_annotation: human readable value associated with the CL term for panglao term +# blueprint_annotation_cl: human readable value associated with the CL term for singler_celltype_ontology +# consensus_annotation: human readable name associated with the consensus label +# consensus_ontology: CL ontology term for the consensus cell type + +library(optparse) + +option_list <- list( + make_option( + opt_str = c("--input_tsv_files"), + type = "character", + help = "Comma separated list of input file paths corresponding to the TSV files with cell type annotations. + All TSV files in this list will be combined into a single file." + ), + make_option( + opt_str = c("--panglao_ref_file"), + type = "character", + help = "Path to file with panglao assignments and associated cell ontology ids" + ), + make_option( + opt_str = c("--consensus_ref_file"), + type = "character", + help = "Path to file containing the reference for assigning consensus cell type labels" + ), + make_option( + opt_str = c("--output_file"), + type = "character", + help = "Path to file where combined TSV file will be saved. + File name must end in either `.tsv` or `.tsv.gz` to save a compressed TSV file" + ) +) + +# Parse options +opt <- parse_args(OptionParser(option_list = option_list)) + +# Prep ref files --------------------------------------------------------------- + +# make sure reference files exist +stopifnot( + "List of input files containing cell type assignments is missing" = !is.null(opt$input_tsv_files), + "panglao reference file does not exist" = file.exists(opt$panglao_ref_file), + "cell type consensus reference file does not exist" = file.exists(opt$consensus_ref_file), + "output file must end in `.tsv` or `.tsv.gz`" = stringr::str_detect(opt$output_file, ".tsv|.tsv.gz") +) + +# list of paths to tsv files +input_sce_files <- unlist(stringr::str_split(opt$input_tsv_files, ",")) +# check if any are empty, if so remove them +missing_files <- file.size(input_sce_files) == 0 +all_files <- input_sce_files[!missing_files] + +# read in ref files +# change names for panglao ref to match what's in the consensus file +panglao_ref_df <- readr::read_tsv(opt$panglao_ref_file) |> + dplyr::rename( + panglao_ontology = ontology_id, + panglao_annotation = human_readable_value, + original_panglao_name = panglao_cell_type + ) + +consensus_ref_df <- readr::read_tsv(opt$consensus_ref_file) |> + # select columns to use for joining and consensus assigmments + dplyr::select( + panglao_ontology, + original_panglao_name, + blueprint_ontology, + consensus_annotation, + consensus_ontology + ) + +# grab singler ref from celldex +blueprint_ref <- celldex::BlueprintEncodeData() + +# grab obo file, we need this to map the ontologies from blueprint +cl_ont <- ontologyIndex::get_ontology("http://purl.obolibrary.org/obo/cl/releases/2024-09-26/cl-basic.obo") + + +# get ontologies and human readable name into data frame for blueprint +# in scpca-nf we don't include the cl name so this lets us add it in +blueprint_df <- data.frame( + blueprint_ontology = blueprint_ref$label.ont, + blueprint_annotation_cl = cl_ont$name[blueprint_ref$label.ont] +) |> + unique() |> + tidyr::drop_na() + +# Create combined TSV ---------------------------------------------------------- + +# read in TSV files and combine into a single df +all_cells_df <- all_files |> + purrr::map(readr::read_tsv) |> + dplyr::bind_rows() |> + # add columns for panglao ontology and consensus + # first add panglao ontology + dplyr::left_join(panglao_ref_df, by = c("cellassign_celltype_annotation" = "original_panglao_name")) |> + # now add in all the blueprint columns + dplyr::left_join(blueprint_df, by = c("singler_celltype_ontology" = "blueprint_ontology")) |> + # then add consensus labels + dplyr::left_join(consensus_ref_df, + by = c( + "singler_celltype_ontology" = "blueprint_ontology", + "cellassign_celltype_annotation" = "original_panglao_name", + "panglao_ontology" + ) + ) |> + # use unknown for NA annotation but keep ontology ID as NA + # if the sample type is cell line, keep as NA + dplyr::mutate(consensus_annotation = dplyr::if_else(is.na(consensus_annotation) & (sample_type != "cell line"), "Unknown", consensus_annotation)) + +# export file +readr::write_tsv(all_cells_df, opt$output_file) diff --git a/modules/cell-type-consensus/resources/usr/bin/save-coldata.R b/modules/cell-type-consensus/resources/usr/bin/save-coldata.R new file mode 100755 index 0000000..79f6db6 --- /dev/null +++ b/modules/cell-type-consensus/resources/usr/bin/save-coldata.R @@ -0,0 +1,97 @@ +#!/usr/bin/env Rscript + +# This script is used to grab the cell type annotations from the +# colData from a SCE object and save them to a TSV file + +library(optparse) + +option_list <- list( + make_option( + opt_str = c("--input_sce_file"), + type = "character", + help = "Path to RDS file containing a processed SingleCellExperiment object from scpca-nf" + ), + make_option( + opt_str = c("--output_file"), + type = "character", + help = "Path to file where colData will be saved, must end in `.tsv`" + ) +) + +# Parse options +opt <- parse_args(OptionParser(option_list = option_list)) + +# Set up ----------------------------------------------------------------------- + +# make sure input files exist +stopifnot( + "sce file does not exist" = file.exists(opt$input_sce_file) +) + +# load SCE +suppressPackageStartupMessages({ + library(SingleCellExperiment) +}) + +# Extract colData -------------------------------------------------------------- + +# read in sce +sce <- readr::read_rds(opt$input_sce_file) + +# extract ids +library_id <- metadata(sce)$library_id +# account for multiplexed libraries that have multiple samples +# for now just combine sample ids into a single string and don't worry about demultiplexing +sample_id <- metadata(sce)$sample_id |> + paste0(collapse = ";") +project_id <- metadata(sce)$project_id + +# check if cell line since cell lines don't have any cell type assignments +# account for having more than one sample and a list of sample types +# all sample types should be the same theoretically +sample_type <- unique(metadata(sce)$sample_type) +is_cell_line <- sample_type == "cell line" + +# grab coldata +coldata_df <- colData(sce) |> + as.data.frame() |> + # add unique sample/library information + dplyr::mutate( + project_id = project_id, + sample_id = sample_id, + library_id = library_id, + # add in sample type to make sure we don't assign consensus cell types to cell lines + sample_type = sample_type + ) + +# only select sample info and cell type info, we don't need the rest of the coldata +# if sample is cell line, fill in celltype columns with NA +if (is_cell_line) { + celltype_df <- coldata_df |> + dplyr::select( + project_id, + sample_id, + library_id, + barcodes, + sample_type + ) |> + dplyr::mutate( + singler_celltype_ontology = NA, + singler_celltype_annotation = NA, + cellassign_celltype_annotation = NA + ) +} else { + # otherwise select the cell type columns + celltype_df <- coldata_df |> + dplyr::select( + project_id, + sample_id, + library_id, + barcodes, + sample_type, + contains("celltype") # get both singler and cellassign with ontology + ) +} + +# save tsv +readr::write_tsv(celltype_df, opt$output_file) diff --git a/modules/merge-sce/main.nf b/modules/merge-sce/main.nf index 968e8cf..0195f50 100644 --- a/modules/merge-sce/main.nf +++ b/modules/merge-sce/main.nf @@ -3,11 +3,6 @@ // Workflow to merge SCE objects into a single object. // This workflow does NOT perform integration, i.e. batch correction. -// module parameters -params.reuse_merge = false -params.max_merge_libraries = 75 // maximum number of libraries to merge (current number is a guess, based on 59 working, but 104 not) -params.num_hvg = 2000 // number of HVGs to select - // merge workflow variables def module_name = "merge-sce" def publish_merge_base = "${params.results_bucket}/${params.release_prefix}/${module_name}" @@ -34,7 +29,7 @@ process merge_group { --input_library_ids '${input_library_ids}' \ --input_sce_files '${input_sces}' \ --output_sce_file '${merged_sce_file}' \ - --n_hvg ${params.num_hvg} \ + --n_hvg ${params.merge_hvg} \ --threads ${task.cpus} """ @@ -138,7 +133,7 @@ workflow merge_sce { } .branch{ // check the number of libraries - mergeable: it[1].size() < params.max_merge_libraries + mergeable: it[1].size() < params.merge_max_libraries oversized: true } @@ -154,7 +149,7 @@ workflow merge_sce { libraries_branch = libraries_ch.mergeable .branch{ - has_merge: params.reuse_merge && file("${publish_merge_base}/${it[0]}/${it[0]}_merged.rds").exists() + has_merge: params.merge_reuse && file("${publish_merge_base}/${it[0]}/${it[0]}_merged.rds").exists() make_merge: true } diff --git a/nextflow.config b/nextflow.config index 0aacaec..27caa44 100644 --- a/nextflow.config +++ b/nextflow.config @@ -27,6 +27,9 @@ includeConfig 'config/process_base.config' // Load container definitions includeConfig 'config/containers.config' +// include module specific parameters +includeConfig 'config/module_params.config' + profiles { standard { process {