AlexsLemonade · allyhawkins · Jan 14, 2025 · Jan 14, 2025 · Jan 14, 2025 · Jan 14, 2025
diff --git a/config/containers.config b/config/containers.config
@@ -18,4 +18,6 @@ params{
   // seurat-conversion module
   seurat_conversion_container = 'public.ecr.aws/openscpca/seurat-conversion:v0.2.0'
 
+  // cell-type-consensus module
+  consensus_cell_type_container = 'public.ecr.aws/openscpca/cell-type-consensus:latest'
 }
diff --git a/main.nf b/main.nf
@@ -6,6 +6,7 @@ include { simulate_sce } from './modules/simulate-sce'
 include { merge_sce } from './modules/merge-sce'
 include { detect_doublets } from './modules/doublet-detection'
 include { seurat_conversion } from './modules/seurat-conversion'
+include { cell_type_consensus } from './modules/cell-type-consensus'
 
 // **** Parameter checks ****
 param_error = false
@@ -50,11 +51,14 @@ workflow {
     .filter{ run_all || it[1] in project_ids }
 
   // Run the merge workflow
-  merge_sce(sample_ch)
+  //merge_sce(sample_ch)
 
   // Run the doublet detection workflow
-  detect_doublets(sample_ch)
+  //detect_doublets(sample_ch)
 
   // Run the seurat conversion workflow
-  seurat_conversion(sample_ch)
+  //seurat_conversion(sample_ch)
+
+  // Run the consensus cell type workflow
+  cell_type_consensus(sample_ch)
 }
diff --git a/modules/cell-type-consensus/README.md b/modules/cell-type-consensus/README.md
@@ -0,0 +1,13 @@
+This module assigns a consensus cell type based on cell types assigned by `SingleR` and `CellAssign`.
+
+Scripts are derived from the the `cell-type-consensus` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository.
+
+Links to specific original scripts used in this module:
+
+- `save-coldata.R`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/scripts/03-save-coldata.R>
+- `combine-celltype-tables.R`:<https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/main/analyses/cell-type-consensus/scripts/04-combine-celltype-tables.R>
+
+This module also uses the following reference files found in the `OpenScPCA-analysis` repository:
+
+- `panglao-cell-type-ontologies.tsv`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv>
+- `consensus-cell-type-reference.tsv`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv>
diff --git a/modules/cell-type-consensus/main.nf b/modules/cell-type-consensus/main.nf
@@ -0,0 +1,103 @@
+#!/usr/bin/env nextflow
+
+// Workflow to assign consensus cell type labels
+
+// module parameters
+params.panglao_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv')
+params.consensus_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv')
-params.panglao_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv')
-params.consensus_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv')
+params.panglao_ref_file = file('https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv')
+params.consensus_ref_file = file('https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv')
-params.panglao_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv')
-params.consensus_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv')
+params.panglao_ref_file = file('https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv')
+params.consensus_ref_file = file('https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv')
+
+process save_celltypes {
+  container params.consensus_cell_type_container
+  tag "${sample_id}"
+  errorStrategy 'terminate'
+  input:
+    tuple val(sample_id),
+          val(project_id),
+          path(library_files)
+  output:
+    tuple val(project_id),
+          path(output_files)
+  script:
+    output_files = library_files
+      .collect{
+        it.name.replaceAll(/(?i).rds$/, "_original-cell-types.tsv")
+      }
+    """
+    for file in ${library_files}; do
+      save-coldata.R \
+        --input_sce_file \$file \
+        --output_file \$(basename \${file%.rds}_original-cell-types.tsv)
+    done
+    """
+
+  stub:
+    output_files = library_files
+      .collect{
+        it.name.replaceAll(/(?i).rds$/, "_original-cell-types.tsv")
+      }
+    """
+    for file in ${library_files}; do
+      touch \$(basename \${file%.rds}_original-cell-types.tsv)
+    done
+    """
+}
+
+process assign_consensus {
+  container params.consensus_cell_type_container
+  errorStrategy 'terminate'
+  tag "${project_id}"
+  label 'mem_8'
+  publishDir "${params.results_bucket}/${params.release_prefix}/cell-type-consensus", mode: 'copy'
+  input:
+    tuple val(project_id),
+          path(cell_type_files)
+  output:
+    path consensus_output_file
-  output:
-    path consensus_output_file
+  output:
+    tuple val(project_id),
+          path(consensus_output_file)
-  output:
-    path consensus_output_file
+  output:
+    tuple val(project_id),
+          path(consensus_output_file)
+  script:
+    input_files = cell_type_files.join(',')
+    consensus_output_file = "${project_id}_consensus-cell-types.tsv.gz"
+    """
+    combine-celltype-tables.R \
+      --input_tsv_files ${input_files} \
+      --panglao_ref_file ${params.panglao_ref_file} \
+      --consensus_ref_file ${params.consensus_ref_file} \
+      --output_file ${consensus_output_file}
+    """
+
+  stub:
+    input_files = cell_type_files.join(',')
+    consensus_output_file = "${project_id}_consensus-cell-types.tsv.gz"
+    """
+    touch ${consensus_output_file}
+    """
+}
+
+
+
+workflow cell_type_consensus {
+  take:
+    sample_ch  // [sample_id, project_id, sample_path]
+  main:
+    // create [sample_id, project_id, [list of processed files]]
+    libraries_ch = sample_ch
+      .map{sample_id, project_id, sample_path ->
+        def library_files = Utils.getLibraryFiles(sample_path, format: "sce", process_level: "processed")
+        return [sample_id, project_id, library_files]
+      }
+
+    // save cell type information for each library
+    save_celltypes(libraries_ch)
+
+    cell_type_files_ch = save_celltypes.out
+      .groupTuple(by: 0) // group by project id
+      .map{ project_id, celltype_files -> tuple(
+        project_id,
+        celltype_files.flatten() // get rid of nested tuple that occurs when more than one library maps to a sample
+      )}
+
+    // assign consensus cell types by project
+    assign_consensus(cell_type_files_ch)
+
+  emit:
+    assign_consensus.out // [project_id, consensus_output_file]
+}
diff --git a/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R b/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
@@ -0,0 +1,115 @@
+#!/usr/bin/env Rscript
+
+# This script is used to combine all TSV files containing cell types into a single TSV file
+# The output TSV file will include the following added columns:
+# panglao_ontology: CL term assigned to panglao term
+# panglao_annotation: human readable value associated with the CL term for panglao term
+# blueprint_annotation_cl: human readable value associated with the CL term for singler_celltype_ontology
+# consensus_annotation: human readable name associated with the consensus label
+# consensus_ontology: CL ontology term for the consensus cell type
+
+library(optparse)
+
+option_list <- list(
+  make_option(
+    opt_str = c("--input_tsv_files"),
+    type = "character",
+    help = "Comma separated list of input file paths corresponding to the TSV files with cell type annotations.
+      All TSV files in this list will be combined into a single file."
+  ),
+  make_option(
+    opt_str = c("--panglao_ref_file"),
+    type = "character",
+    help = "Path to file with panglao assignments and associated cell ontology ids"
+  ),
+  make_option(
+    opt_str = c("--consensus_ref_file"),
+    type = "character",
+    help = "Path to file containing the reference for assigning consensus cell type labels"
+  ),
+  make_option(
+    opt_str = c("--output_file"),
+    type = "character",
+    help = "Path to file where combined TSV file will be saved.
+      File name must end in either `.tsv` or `.tsv.gz` to save a compressed TSV file"
+  )
+)
+
+# Parse options
+opt <- parse_args(OptionParser(option_list = option_list))
+
+# Prep ref files ---------------------------------------------------------------
+
+# make sure reference files exist
+stopifnot(
+  "List of input files containing cell type assignments is missing" = !is.null(opt$input_tsv_files),
+  "panglao reference file does not exist" = file.exists(opt$panglao_ref_file),
+  "cell type consensus reference file does not exist" = file.exists(opt$consensus_ref_file),
+  "output file must end in `.tsv` or `.tsv.gz`" = stringr::str_detect(opt$output_file, ".tsv|.tsv.gz")
+)
+
+# list of paths to tsv files
+input_sce_files <- unlist(stringr::str_split(opt$input_tsv_files, ","))
+# check if any are empty, if so remove them
+missing_files <- file.size(input_sce_files) > 0
+all_files <- all_files[!missing_files]
+
+# read in ref files
+# change names for panglao ref to match what's in the consensus file
+panglao_ref_df <- readr::read_tsv(opt$panglao_ref_file) |>
+  dplyr::rename(
+    panglao_ontology = ontology_id,
+    panglao_annotation = human_readable_value,
+    original_panglao_name = panglao_cell_type
+  )
+
+consensus_ref_df <- readr::read_tsv(opt$consensus_ref_file) |>
+  # select columns to use for joining and consensus assigmments
+  dplyr::select(
+    panglao_ontology,
+    original_panglao_name,
+    blueprint_ontology,
+    consensus_annotation,
+    consensus_ontology
+  )
+
+# grab singler ref from celldex
+blueprint_ref <- celldex::BlueprintEncodeData()
+
+# grab obo file, we need this to map the ontologies from blueprint
+cl_ont <- ontologyIndex::get_ontology("http://purl.obolibrary.org/obo/cl/releases/2024-09-26/cl-basic.obo")
+
+
+# get ontologies and human readable name into data frame for blueprint
+# in scpca-nf we don't include the cl name so this lets us add it in
+blueprint_df <- data.frame(
+  blueprint_ontology = blueprint_ref$label.ont,
+  blueprint_annotation_cl = cl_ont$name[blueprint_ref$label.ont]
+) |>
+  unique() |>
+  tidyr::drop_na()
+
+# Create combined TSV ----------------------------------------------------------
+
+# read in TSV files and combine into a single df
+all_cells_df <- all_files |>
+  purrr::map(readr::read_tsv) |>
+  dplyr::bind_rows() |>
+  # add columns for panglao ontology and consensus
+  # first add panglao ontology
+  dplyr::left_join(panglao_ref_df, by = c("cellassign_celltype_annotation" = "original_panglao_name")) |>
+  # now add in all the blueprint columns
+  dplyr::left_join(blueprint_df, by = c("singler_celltype_ontology" = "blueprint_ontology")) |>
+  # then add consensus labels
+  dplyr::left_join(consensus_ref_df,
+    by = c(
+      "singler_celltype_ontology" = "blueprint_ontology",
+      "cellassign_celltype_annotation" = "original_panglao_name",
+      "panglao_ontology"
+    )
+  ) |>
+  # use unknown for NA annotation but keep ontology ID as NA
+  dplyr::mutate(consensus_annotation = dplyr::if_else(is.na(consensus_annotation), "Unknown", consensus_annotation))
+
+# export file
+readr::write_tsv(all_cells_df, opt$output_file)
diff --git a/modules/cell-type-consensus/resources/usr/bin/save-coldata.R b/modules/cell-type-consensus/resources/usr/bin/save-coldata.R
@@ -0,0 +1,77 @@
+#!/usr/bin/env Rscript
+
+# This script is used to grab the cell type annotations from the
+# colData from a SCE object and save them to a TSV file
+
+library(optparse)
+
+option_list <- list(
+  make_option(
+    opt_str = c("--input_sce_file"),
+    type = "character",
+    help = "Path to RDS file containing a processed SingleCellExperiment object from scpca-nf"
+  ),
+  make_option(
+    opt_str = c("--output_file"),
+    type = "character",
+    help = "Path to file where colData will be saved, must end in `.tsv`"
+  )
+)
+
+# Parse options
+opt <- parse_args(OptionParser(option_list = option_list))
+
+# Set up -----------------------------------------------------------------------
+
+# make sure input files exist
+stopifnot(
+  "sce file does not exist" = file.exists(opt$input_sce_file)
+)
+
+# load SCE
+suppressPackageStartupMessages({
+  library(SingleCellExperiment)
+})
+
+# Extract colData --------------------------------------------------------------
+
+# read in sce
+sce <- readr::read_rds(opt$input_sce_file)
+
+# extract ids
+library_id <- metadata(sce)$library_id
+# account for multiplexed libraries that have multiple samples
+# for now just combine sample ids into a single string and don't worry about demultiplexing
+sample_id <- metadata(sce)$sample_id |>
+  paste0(collapse = ";")
+project_id <- metadata(sce)$project_id
+
+# check if cell line since cell lines don't have any cell type assignments
+# account for having more than one sample and a list of sample types
+# all sample types should be the same theoretically
+is_cell_line <- all(metadata(sce)$sample_type == "cell line")
+
+# only create and write table for non-cell line samples
+if (is_cell_line) {
+  # make an empty filtered file
+  file.create(opt$output_file)
+} else {
+  # get df with ids, barcodes, and cell type assignments
+  celltype_df <- colData(sce) |>
+    as.data.frame() |>
+    dplyr::mutate(
+      project_id = project_id,
+      sample_id = sample_id,
+      library_id = library_id
+    ) |>
+    dplyr::select(
+      project_id,
+      sample_id,
+      library_id,
+      barcodes,
+      contains("celltype") # get both singler and cellassign with ontology
+    )
+
+  # save tsv
+  readr::write_tsv(celltype_df, opt$output_file)
+}