From ebc48690b397ac3f594ef83541444a63aadc2da6 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 13:08:10 -0600
Subject: [PATCH 01/25] add scripts used for assigning cell types

---
 .../usr/bin/assign-consensus-label.R          | 108 ++++++++++++++++++
 .../resources/usr/bin/save-celltypes.R        |  77 +++++++++++++
 2 files changed, 185 insertions(+)
 create mode 100644 modules/cell-type-consensus/resources/usr/bin/assign-consensus-label.R
 create mode 100644 modules/cell-type-consensus/resources/usr/bin/save-celltypes.R

diff --git a/modules/cell-type-consensus/resources/usr/bin/assign-consensus-label.R b/modules/cell-type-consensus/resources/usr/bin/assign-consensus-label.R
new file mode 100644
index 0000000..937e4c2
--- /dev/null
+++ b/modules/cell-type-consensus/resources/usr/bin/assign-consensus-label.R
@@ -0,0 +1,108 @@
+#!/usr/bin/env Rscript
+
+# This script is used to combine all TSV files containing cell types into a single TSV file 
+# The output TSV file will include the following added columns: 
+# panglao_ontology: CL term assigned to panglao term
+# panglao_annotation: human readable value associated with the CL term for panglao term 
+# blueprint_annotation_cl: human readable value associated with the CL term for singler_celltype_ontology 
+# consensus_annotation: human readable name associated with the consensus label 
+# consensus_ontology: CL ontology term for the consensus cell type 
+
+library(optparse)
+
+option_list <- list(
+  make_option(
+    opt_str = c("--celltype_tsv_dir"),
+    type = "character",
+    help = "Path to directory containing TSV files with cell type annotations from single samples.
+      All TSV files in this directory will be combined into a single file."
+  ),
+  make_option(
+    opt_str = c("--panglao_ref_file"),
+    type = "character", 
+    help = "Path to file with panglao assignments and associated cell ontology ids"
+  ),
+  make_option(
+    opt_str = c("--consensus_ref_file"),
+    type = "character",
+    help = "Path to file containing the reference for assigning consensus cell type labels"
+  ), 
+  make_option(
+    opt_str = c("--output_file"),
+    type = "character",
+    help = "Path to file where combined TSV file will be saved. 
+      File name must end in either `.tsv` or `.tsv.gz` to save a compressed TSV file"
+  )
+)
+
+# Parse options
+opt <- parse_args(OptionParser(option_list = option_list))
+
+# Prep ref files ---------------------------------------------------------------
+
+# make sure reference files exist 
+stopifnot(
+  "panglao reference file does not exist" = file.exists(opt$panglao_ref_file),
+  "cell type consensus reference file does not exist" = file.exists(opt$consensus_ref_file),
+  "output file must end in `.tsv` or `.tsv.gz`" = stringr::str_detect(opt$output_file, ".tsv|.tsv.gz")
+)
+
+# read in ref files 
+# change names for panglao ref to match what's in the consensus file 
+panglao_ref_df <- readr::read_tsv(opt$panglao_ref_file) |>
+  dplyr::rename(
+    panglao_ontology = ontology_id,
+    panglao_annotation = human_readable_value,
+    original_panglao_name = panglao_cell_type
+  )
+
+consensus_ref_df <- readr::read_tsv(opt$consensus_ref_file) |>
+  # select columns to use for joining and consensus assigmments 
+  dplyr::select(
+    panglao_ontology, 
+    original_panglao_name,
+    blueprint_ontology,
+    consensus_annotation,
+    consensus_ontology
+  )
+
+# grab singler ref from celldex
+blueprint_ref <- celldex::BlueprintEncodeData()
+
+# grab obo file, we need this to map the ontologies from blueprint
+cl_ont <- ontologyIndex::get_ontology("http://purl.obolibrary.org/obo/cl/releases/2024-09-26/cl-basic.obo") 
+
+
+# get ontologies and human readable name into data frame for blueprint
+# in scpca-nf we don't include the cl name so this lets us add it in 
+blueprint_df <- data.frame(
+  blueprint_ontology = blueprint_ref$label.ont,
+  blueprint_annotation_cl = cl_ont$name[blueprint_ref$label.ont]
+) |>
+  unique() |> 
+  tidyr::drop_na()
+
+# get list of all TSV files 
+all_files <- list.files(path = opt$celltype_tsv_dir,
+                        pattern = "*.tsv", 
+                        full.names = TRUE) 
+
+# read in TSV files and combine into a single df 
+all_cells_df <- all_files |> 
+  purrr::map(readr::read_tsv) |> 
+  dplyr::bind_rows() |> 
+  # add columns for panglao ontology and consensus
+  # first add panglao ontology 
+  dplyr::left_join(panglao_ref_df, by = c("cellassign_celltype_annotation" = "original_panglao_name")) |>
+  # now add in all the blueprint columns
+  dplyr::left_join(blueprint_df, by = c("singler_celltype_ontology" = "blueprint_ontology")) |> 
+  # then add consensus labels
+  dplyr::left_join(consensus_ref_df, 
+                   by = c("singler_celltype_ontology" = "blueprint_ontology",
+                          "cellassign_celltype_annotation" = "original_panglao_name",
+                          "panglao_ontology")) |>
+  # use unknown for NA annotation but keep ontology ID as NA
+  dplyr::mutate(consensus_annotation = dplyr::if_else(is.na(consensus_annotation), "Unknown", consensus_annotation))
+
+# export file 
+readr::write_tsv(all_cells_df, opt$output_file)
diff --git a/modules/cell-type-consensus/resources/usr/bin/save-celltypes.R b/modules/cell-type-consensus/resources/usr/bin/save-celltypes.R
new file mode 100644
index 0000000..91081c8
--- /dev/null
+++ b/modules/cell-type-consensus/resources/usr/bin/save-celltypes.R
@@ -0,0 +1,77 @@
+#!/usr/bin/env Rscript
+
+# This script is used to grab the cell type annotations from the 
+# colData from a SCE object and save them to a TSV file
+
+library(optparse)
+
+option_list <- list(
+  make_option(
+    opt_str = c("--sce_file"),
+    type = "character",
+    help = "Path to RDS file containing a processed SingleCellExperiment object from scpca-nf"
+  ),
+  make_option(
+    opt_str = c("--output_file"),
+    type = "character",
+    help = "Path to file where colData will be saved, must end in `.tsv`"
+  )
+)
+
+# Parse options
+opt <- parse_args(OptionParser(option_list = option_list))
+
+# Set up -----------------------------------------------------------------------
+
+# make sure input files exist
+stopifnot(
+  "sce file does not exist" = file.exists(opt$sce_file)
+)
+
+# load SCE
+suppressPackageStartupMessages({
+  library(SingleCellExperiment)
+})
+
+# Extract colData --------------------------------------------------------------
+
+# read in sce 
+sce <- readr::read_rds(opt$sce_file)
+
+# extract ids 
+library_id <- metadata(sce)$library_id
+# account for multiplexed libraries that have multiple samples 
+# for now just combine sample ids into a single string and don't worry about demultiplexing 
+sample_id <- metadata(sce)$sample_id |> 
+  paste0(collapse = ";")
+project_id <- metadata(sce)$project_id
+
+# check if cell line since cell lines don't have any cell type assignments 
+# account for having more than one sample and a list of sample types
+# all sample types should be the same theoretically 
+is_cell_line <- all(metadata(sce)$sample_type == "cell line")
+
+# only create and write table for non-cell line samples
+if(!is_cell_line){
+  
+  # get df with ids, barcodes, and cell type assignments
+  celltype_df <- colData(sce) |> 
+    as.data.frame() |> 
+    dplyr::mutate(
+      project_id = project_id,
+      sample_id = sample_id,
+      library_id = library_id
+    ) |> 
+    dplyr::select(
+      project_id,
+      sample_id,
+      library_id,
+      barcodes, 
+      contains("celltype") # get both singler and cellassign with ontology 
+    )
+  
+  # save tsv 
+  readr::write_tsv(celltype_df, opt$output_file)
+  
+}
+

From 6db6af4c2b12ec04eab784e050fe8aaf1ba9c268 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 13:08:18 -0600
Subject: [PATCH 02/25] initiate readme for module

---
 modules/cell-type-consensus/README.md | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 modules/cell-type-consensus/README.md

diff --git a/modules/cell-type-consensus/README.md b/modules/cell-type-consensus/README.md
new file mode 100644
index 0000000..66a7419
--- /dev/null
+++ b/modules/cell-type-consensus/README.md
@@ -0,0 +1,8 @@
+This module assigns a consensus cell type based on cell types assigned by `SingleR` and `CellAssign`. 
+
+Scripts are derived from the the `cell-type-consensus` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository.
+
+Links to specific original files used in this module:
+
+- `save-celltypes.R`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/40d6db1bcd2e4bdca8d840e96ebae8fe19db5372/analyses/cell-type-consensus/scripts/03-save-coldata.R>
+- `assign-consensus-label.R`:<https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/40d6db1bcd2e4bdca8d840e96ebae8fe19db5372/analyses/cell-type-consensus/scripts/04-combine-celltype-tables.R>

From 2e4f68ca632ef2a8ac7e3f5259f50a3107a93800 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 14:29:14 -0600
Subject: [PATCH 03/25] make scripts executable

---
 .../usr/bin/assign-consensus-label.R          | 74 ++++++++++---------
 .../resources/usr/bin/save-celltypes.R        | 37 +++++-----
 2 files changed, 56 insertions(+), 55 deletions(-)
 mode change 100644 => 100755 modules/cell-type-consensus/resources/usr/bin/assign-consensus-label.R
 mode change 100644 => 100755 modules/cell-type-consensus/resources/usr/bin/save-celltypes.R

diff --git a/modules/cell-type-consensus/resources/usr/bin/assign-consensus-label.R b/modules/cell-type-consensus/resources/usr/bin/assign-consensus-label.R
old mode 100644
new mode 100755
index 937e4c2..37e0280
--- a/modules/cell-type-consensus/resources/usr/bin/assign-consensus-label.R
+++ b/modules/cell-type-consensus/resources/usr/bin/assign-consensus-label.R
@@ -1,36 +1,36 @@
 #!/usr/bin/env Rscript
 
-# This script is used to combine all TSV files containing cell types into a single TSV file 
-# The output TSV file will include the following added columns: 
+# This script is used to combine all TSV files containing cell types into a single TSV file
+# The output TSV file will include the following added columns:
 # panglao_ontology: CL term assigned to panglao term
-# panglao_annotation: human readable value associated with the CL term for panglao term 
-# blueprint_annotation_cl: human readable value associated with the CL term for singler_celltype_ontology 
-# consensus_annotation: human readable name associated with the consensus label 
-# consensus_ontology: CL ontology term for the consensus cell type 
+# panglao_annotation: human readable value associated with the CL term for panglao term
+# blueprint_annotation_cl: human readable value associated with the CL term for singler_celltype_ontology
+# consensus_annotation: human readable name associated with the consensus label
+# consensus_ontology: CL ontology term for the consensus cell type
 
 library(optparse)
 
 option_list <- list(
   make_option(
-    opt_str = c("--celltype_tsv_dir"),
+    opt_str = c("--input_tsv_files"),
     type = "character",
-    help = "Path to directory containing TSV files with cell type annotations from single samples.
-      All TSV files in this directory will be combined into a single file."
+    help = "Comma separated list of input file paths corresponding to the TSV files with cell type annotations.
+      All TSV files in this list will be combined into a single file."
   ),
   make_option(
     opt_str = c("--panglao_ref_file"),
-    type = "character", 
+    type = "character",
     help = "Path to file with panglao assignments and associated cell ontology ids"
   ),
   make_option(
     opt_str = c("--consensus_ref_file"),
     type = "character",
     help = "Path to file containing the reference for assigning consensus cell type labels"
-  ), 
+  ),
   make_option(
     opt_str = c("--output_file"),
     type = "character",
-    help = "Path to file where combined TSV file will be saved. 
+    help = "Path to file where combined TSV file will be saved.
       File name must end in either `.tsv` or `.tsv.gz` to save a compressed TSV file"
   )
 )
@@ -40,15 +40,19 @@ opt <- parse_args(OptionParser(option_list = option_list))
 
 # Prep ref files ---------------------------------------------------------------
 
-# make sure reference files exist 
+# make sure reference files exist
 stopifnot(
+  "List of input files containing cell type assignments is missing" = !is.null(opt$input_tsv_files),
   "panglao reference file does not exist" = file.exists(opt$panglao_ref_file),
   "cell type consensus reference file does not exist" = file.exists(opt$consensus_ref_file),
   "output file must end in `.tsv` or `.tsv.gz`" = stringr::str_detect(opt$output_file, ".tsv|.tsv.gz")
 )
 
-# read in ref files 
-# change names for panglao ref to match what's in the consensus file 
+# list of paths to tsv files
+all_files <- unlist(stringr::str_split(opt$input_tsv_files, ","))
+
+# read in ref files
+# change names for panglao ref to match what's in the consensus file
 panglao_ref_df <- readr::read_tsv(opt$panglao_ref_file) |>
   dplyr::rename(
     panglao_ontology = ontology_id,
@@ -57,9 +61,9 @@ panglao_ref_df <- readr::read_tsv(opt$panglao_ref_file) |>
   )
 
 consensus_ref_df <- readr::read_tsv(opt$consensus_ref_file) |>
-  # select columns to use for joining and consensus assigmments 
+  # select columns to use for joining and consensus assigmments
   dplyr::select(
-    panglao_ontology, 
+    panglao_ontology,
     original_panglao_name,
     blueprint_ontology,
     consensus_annotation,
@@ -70,39 +74,39 @@ consensus_ref_df <- readr::read_tsv(opt$consensus_ref_file) |>
 blueprint_ref <- celldex::BlueprintEncodeData()
 
 # grab obo file, we need this to map the ontologies from blueprint
-cl_ont <- ontologyIndex::get_ontology("http://purl.obolibrary.org/obo/cl/releases/2024-09-26/cl-basic.obo") 
+cl_ont <- ontologyIndex::get_ontology("http://purl.obolibrary.org/obo/cl/releases/2024-09-26/cl-basic.obo")
 
 
 # get ontologies and human readable name into data frame for blueprint
-# in scpca-nf we don't include the cl name so this lets us add it in 
+# in scpca-nf we don't include the cl name so this lets us add it in
 blueprint_df <- data.frame(
   blueprint_ontology = blueprint_ref$label.ont,
   blueprint_annotation_cl = cl_ont$name[blueprint_ref$label.ont]
 ) |>
-  unique() |> 
+  unique() |>
   tidyr::drop_na()
 
-# get list of all TSV files 
-all_files <- list.files(path = opt$celltype_tsv_dir,
-                        pattern = "*.tsv", 
-                        full.names = TRUE) 
+# Create combined TSV ----------------------------------------------------------
 
-# read in TSV files and combine into a single df 
-all_cells_df <- all_files |> 
-  purrr::map(readr::read_tsv) |> 
-  dplyr::bind_rows() |> 
+# read in TSV files and combine into a single df
+all_cells_df <- all_files |>
+  purrr::map(readr::read_tsv) |>
+  dplyr::bind_rows() |>
   # add columns for panglao ontology and consensus
-  # first add panglao ontology 
+  # first add panglao ontology
   dplyr::left_join(panglao_ref_df, by = c("cellassign_celltype_annotation" = "original_panglao_name")) |>
   # now add in all the blueprint columns
-  dplyr::left_join(blueprint_df, by = c("singler_celltype_ontology" = "blueprint_ontology")) |> 
+  dplyr::left_join(blueprint_df, by = c("singler_celltype_ontology" = "blueprint_ontology")) |>
   # then add consensus labels
-  dplyr::left_join(consensus_ref_df, 
-                   by = c("singler_celltype_ontology" = "blueprint_ontology",
-                          "cellassign_celltype_annotation" = "original_panglao_name",
-                          "panglao_ontology")) |>
+  dplyr::left_join(consensus_ref_df,
+    by = c(
+      "singler_celltype_ontology" = "blueprint_ontology",
+      "cellassign_celltype_annotation" = "original_panglao_name",
+      "panglao_ontology"
+    )
+  ) |>
   # use unknown for NA annotation but keep ontology ID as NA
   dplyr::mutate(consensus_annotation = dplyr::if_else(is.na(consensus_annotation), "Unknown", consensus_annotation))
 
-# export file 
+# export file
 readr::write_tsv(all_cells_df, opt$output_file)
diff --git a/modules/cell-type-consensus/resources/usr/bin/save-celltypes.R b/modules/cell-type-consensus/resources/usr/bin/save-celltypes.R
old mode 100644
new mode 100755
index 91081c8..7dff090
--- a/modules/cell-type-consensus/resources/usr/bin/save-celltypes.R
+++ b/modules/cell-type-consensus/resources/usr/bin/save-celltypes.R
@@ -1,13 +1,13 @@
 #!/usr/bin/env Rscript
 
-# This script is used to grab the cell type annotations from the 
+# This script is used to grab the cell type annotations from the
 # colData from a SCE object and save them to a TSV file
 
 library(optparse)
 
 option_list <- list(
   make_option(
-    opt_str = c("--sce_file"),
+    opt_str = c("--input_sce_file"),
     type = "character",
     help = "Path to RDS file containing a processed SingleCellExperiment object from scpca-nf"
   ),
@@ -35,43 +35,40 @@ suppressPackageStartupMessages({
 
 # Extract colData --------------------------------------------------------------
 
-# read in sce 
+# read in sce
 sce <- readr::read_rds(opt$sce_file)
 
-# extract ids 
+# extract ids
 library_id <- metadata(sce)$library_id
-# account for multiplexed libraries that have multiple samples 
-# for now just combine sample ids into a single string and don't worry about demultiplexing 
-sample_id <- metadata(sce)$sample_id |> 
+# account for multiplexed libraries that have multiple samples
+# for now just combine sample ids into a single string and don't worry about demultiplexing
+sample_id <- metadata(sce)$sample_id |>
   paste0(collapse = ";")
 project_id <- metadata(sce)$project_id
 
-# check if cell line since cell lines don't have any cell type assignments 
+# check if cell line since cell lines don't have any cell type assignments
 # account for having more than one sample and a list of sample types
-# all sample types should be the same theoretically 
+# all sample types should be the same theoretically
 is_cell_line <- all(metadata(sce)$sample_type == "cell line")
 
 # only create and write table for non-cell line samples
-if(!is_cell_line){
-  
+if (!is_cell_line) {
   # get df with ids, barcodes, and cell type assignments
-  celltype_df <- colData(sce) |> 
-    as.data.frame() |> 
+  celltype_df <- colData(sce) |>
+    as.data.frame() |>
     dplyr::mutate(
       project_id = project_id,
       sample_id = sample_id,
       library_id = library_id
-    ) |> 
+    ) |>
     dplyr::select(
       project_id,
       sample_id,
       library_id,
-      barcodes, 
-      contains("celltype") # get both singler and cellassign with ontology 
+      barcodes,
+      contains("celltype") # get both singler and cellassign with ontology
     )
-  
-  # save tsv 
+
+  # save tsv
   readr::write_tsv(celltype_df, opt$output_file)
-  
 }
-

From 925c41516da2b31c6ebb9ff3a6f7efea22fa7575 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 14:59:57 -0600
Subject: [PATCH 04/25] add to main workflow

---
 main.nf | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/main.nf b/main.nf
index ca58951..a408c0d 100644
--- a/main.nf
+++ b/main.nf
@@ -6,6 +6,7 @@ include { simulate_sce } from './modules/simulate-sce'
 include { merge_sce } from './modules/merge-sce'
 include { detect_doublets } from './modules/doublet-detection'
 include { seurat_conversion } from './modules/seurat-conversion'
+include { cell_type_consensus } from './modules/cell-type-consensus'
 
 // **** Parameter checks ****
 param_error = false
@@ -57,4 +58,7 @@ workflow {
 
   // Run the seurat conversion workflow
   seurat_conversion(sample_ch)
+
+  // Run the consensus cell type workflow
+  cell_type_consensus(sample_ch)
 }

From b0762df73b6270a25420b314458e9b353c9642ff Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 15:00:20 -0600
Subject: [PATCH 05/25] workflow for running consensus cell types

---
 modules/cell-type-consensus/main.nf | 97 +++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 modules/cell-type-consensus/main.nf

diff --git a/modules/cell-type-consensus/main.nf b/modules/cell-type-consensus/main.nf
new file mode 100644
index 0000000..765ccdd
--- /dev/null
+++ b/modules/cell-type-consensus/main.nf
@@ -0,0 +1,97 @@
+#!/usr/bin/env nextflow
+
+// Workflow to assign consensus cell type labels
+
+// module parameters
+params.panglao_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/40d6db1bcd2e4bdca8d840e96ebae8fe19db5372/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv')
+params.consensus_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/40d6db1bcd2e4bdca8d840e96ebae8fe19db5372/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv')
+
+process save_celltypes {
+  container params.consensus_cell_type_container
+  tag "${sample_id}"
+  input:
+    tuple val(sample_id),
+          val(project_id),
+          path(library_files)
+  output:
+    tuple val(project_id),
+          path(output_files)
+  script:
+    output_files = library_files
+      .collect{
+        it.name.replaceAll(/(?i).rds$/, "__original-cell-types.tsv")
+      }
+    """
+    for file in ${library_files}; do
+      save-celltypes.R \
+        --input_sce_file \$file \
+        --output_file \$(basename \${file%.rds}__original-cell-types.tsv)
+    done
+    """
+
+  stub:
+    output_files = library_files
+      .collect{
+        it.name.replaceAll(/(?i).rds$/, "_original-cell-types.tsv")
+      }
+    """
+    for file in ${library_files}; do
+      touch \$(basename \${file%.rds}_original-cell-types.tsv)
+    done
+    """
+}
+
+process assign_consensus {
+  container params.consensus_cell_type_container
+  tag "${project_id}"
+  label 'mem_8'
+  publishDir "${params.results_bucket}/${params.release_prefix}/cell-type-consensus", mode: 'copy'
+  input:
+    tuple val(project_id),
+          path(cell_type_files)
+  output:
+    path consensus_output_file
+  script:
+    input_files = cell_type_files.join(',')
+    consensus_output_file = "${project_id}_consensus-cell-types.tsv.gz"
+    """
+    assign-consensus-label.R \
+      --input_tsv_files ${input_files} \
+      --panglao_ref_file ${params.panglao_ref_file} \
+      --consensus_ref_file ${consensus_ref_file} \
+      --output_file ${consensus_output_file}
+    """
+
+  stub:
+    input_files = cell_type_files.join(',')
+    consensus_output_file = "${project_id}_consensus-cell-types.tsv.gz"
+    """
+    touch ${consensus_output_file}
+    """
+}
+
+
+
+workflow cell_type_consensus {
+  take:
+    sample_ch  // [sample_id, project_id, sample_path]
+  main:
+    // create [sample_id, project_id, [list of processed files]]
+    libraries_ch = sample_ch
+      .map{sample_id, project_id, sample_path ->
+        def library_files = Utils.getLibraryFiles(sample_path, format: "sce", process_level: "processed")
+        return [sample_id, project_id, library_files]
+      }
+
+    // save cell type information for each library
+    save_celltypes(libraries_ch)
+
+    cell_type_files_ch = save_celltypes.out
+      .groupTuple(by: 0) // group by project id
+
+    // assign consensus cell types by project
+    assign_consensus(cell_type_files_ch)
+
+  emit:
+    assign_consensus.out // [project_id, consensus_output_file]
+}

From e9e00edc29afb1678aaf54683e16013a162b67d2 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 15:00:39 -0600
Subject: [PATCH 06/25] consensus cell type container

---
 config/containers.config | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/config/containers.config b/config/containers.config
index aafd69c..9483bef 100644
--- a/config/containers.config
+++ b/config/containers.config
@@ -18,4 +18,6 @@ params{
   // seurat-conversion module
   seurat_conversion_container = 'public.ecr.aws/openscpca/seurat-conversion:v0.2.0'
 
+  // cell-type-consensus module
+  consensus_cell_type_container = 'public.ecr.aws/openscpca/cell-type-consensus:latest'
 }

From 823a959371fd3944fa19a0913f66b73e001a8c9f Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 15:04:00 -0600
Subject: [PATCH 07/25] use original script names

---
 modules/cell-type-consensus/main.nf                       | 8 ++++----
 ...assign-consensus-label.R => combine-celltype-tables.R} | 0
 .../usr/bin/{save-celltypes.R => save-coldata.R}          | 0
 3 files changed, 4 insertions(+), 4 deletions(-)
 rename modules/cell-type-consensus/resources/usr/bin/{assign-consensus-label.R => combine-celltype-tables.R} (100%)
 rename modules/cell-type-consensus/resources/usr/bin/{save-celltypes.R => save-coldata.R} (100%)

diff --git a/modules/cell-type-consensus/main.nf b/modules/cell-type-consensus/main.nf
index 765ccdd..3be7858 100644
--- a/modules/cell-type-consensus/main.nf
+++ b/modules/cell-type-consensus/main.nf
@@ -3,8 +3,8 @@
 // Workflow to assign consensus cell type labels
 
 // module parameters
-params.panglao_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/40d6db1bcd2e4bdca8d840e96ebae8fe19db5372/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv')
-params.consensus_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/40d6db1bcd2e4bdca8d840e96ebae8fe19db5372/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv')
+params.panglao_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv')
+params.consensus_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv')
 
 process save_celltypes {
   container params.consensus_cell_type_container
@@ -23,7 +23,7 @@ process save_celltypes {
       }
     """
     for file in ${library_files}; do
-      save-celltypes.R \
+      save-coldata.R \
         --input_sce_file \$file \
         --output_file \$(basename \${file%.rds}__original-cell-types.tsv)
     done
@@ -55,7 +55,7 @@ process assign_consensus {
     input_files = cell_type_files.join(',')
     consensus_output_file = "${project_id}_consensus-cell-types.tsv.gz"
     """
-    assign-consensus-label.R \
+    combine-celltype-tables.R \
       --input_tsv_files ${input_files} \
       --panglao_ref_file ${params.panglao_ref_file} \
       --consensus_ref_file ${consensus_ref_file} \
diff --git a/modules/cell-type-consensus/resources/usr/bin/assign-consensus-label.R b/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
similarity index 100%
rename from modules/cell-type-consensus/resources/usr/bin/assign-consensus-label.R
rename to modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
diff --git a/modules/cell-type-consensus/resources/usr/bin/save-celltypes.R b/modules/cell-type-consensus/resources/usr/bin/save-coldata.R
similarity index 100%
rename from modules/cell-type-consensus/resources/usr/bin/save-celltypes.R
rename to modules/cell-type-consensus/resources/usr/bin/save-coldata.R

From 154277b4eec8b8b8ef6857af767c9857306612e2 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 15:04:23 -0600
Subject: [PATCH 08/25] update permalinks in readme

---
 modules/cell-type-consensus/README.md | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/modules/cell-type-consensus/README.md b/modules/cell-type-consensus/README.md
index 66a7419..5ba49a3 100644
--- a/modules/cell-type-consensus/README.md
+++ b/modules/cell-type-consensus/README.md
@@ -1,8 +1,13 @@
-This module assigns a consensus cell type based on cell types assigned by `SingleR` and `CellAssign`. 
+This module assigns a consensus cell type based on cell types assigned by `SingleR` and `CellAssign`.
 
 Scripts are derived from the the `cell-type-consensus` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository.
 
-Links to specific original files used in this module:
+Links to specific original scripts used in this module:
 
-- `save-celltypes.R`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/40d6db1bcd2e4bdca8d840e96ebae8fe19db5372/analyses/cell-type-consensus/scripts/03-save-coldata.R>
-- `assign-consensus-label.R`:<https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/40d6db1bcd2e4bdca8d840e96ebae8fe19db5372/analyses/cell-type-consensus/scripts/04-combine-celltype-tables.R>
+- `save-coldata.R`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/scripts/03-save-coldata.R>
+- `combine-celltype-tables.R`:<https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/main/analyses/cell-type-consensus/scripts/04-combine-celltype-tables.R>
+
+This module also uses the following reference files found in the `OpenScPCA-analysis` repository:
+
+- `panglao-cell-type-ontologies.tsv`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv>
+- `consensus-cell-type-reference.tsv`: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv>

From d3b0ad141028698a8d597c2d163a38acd95a36f8 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 15:33:24 -0600
Subject: [PATCH 09/25] comment out other modules for faster testing

---
 main.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index a408c0d..be536ab 100644
--- a/main.nf
+++ b/main.nf
@@ -51,13 +51,13 @@ workflow {
     .filter{ run_all || it[1] in project_ids }
 
   // Run the merge workflow
-  merge_sce(sample_ch)
+  //merge_sce(sample_ch)
 
   // Run the doublet detection workflow
-  detect_doublets(sample_ch)
+  //detect_doublets(sample_ch)
 
   // Run the seurat conversion workflow
-  seurat_conversion(sample_ch)
+  //seurat_conversion(sample_ch)
 
   // Run the consensus cell type workflow
   cell_type_consensus(sample_ch)

From 1f7944cd5d6ba59070ae15ceaeb24e9c90471bca Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 15:37:43 -0600
Subject: [PATCH 10/25] use correct input name

---
 modules/cell-type-consensus/resources/usr/bin/save-coldata.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/cell-type-consensus/resources/usr/bin/save-coldata.R b/modules/cell-type-consensus/resources/usr/bin/save-coldata.R
index 7dff090..70601d9 100755
--- a/modules/cell-type-consensus/resources/usr/bin/save-coldata.R
+++ b/modules/cell-type-consensus/resources/usr/bin/save-coldata.R
@@ -25,7 +25,7 @@ opt <- parse_args(OptionParser(option_list = option_list))
 
 # make sure input files exist
 stopifnot(
-  "sce file does not exist" = file.exists(opt$sce_file)
+  "sce file does not exist" = file.exists(opt$input_sce_file)
 )
 
 # load SCE

From b3caf00b97feb0a98417c2d678c63354a98d16e4 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 15:38:23 -0600
Subject: [PATCH 11/25] temporarily terminate on failure

---
 modules/cell-type-consensus/main.nf | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/cell-type-consensus/main.nf b/modules/cell-type-consensus/main.nf
index 3be7858..e89936d 100644
--- a/modules/cell-type-consensus/main.nf
+++ b/modules/cell-type-consensus/main.nf
@@ -9,6 +9,7 @@ params.consensus_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-ana
 process save_celltypes {
   container params.consensus_cell_type_container
   tag "${sample_id}"
+  errorStrategy 'terminate'
   input:
     tuple val(sample_id),
           val(project_id),
@@ -43,6 +44,7 @@ process save_celltypes {
 
 process assign_consensus {
   container params.consensus_cell_type_container
+  errorStrategy 'terminate'
   tag "${project_id}"
   label 'mem_8'
   publishDir "${params.results_bucket}/${params.release_prefix}/cell-type-consensus", mode: 'copy'

From bf419b99ef10d29e14bfdcc2ac2e9c9e23c2135b Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 15:47:04 -0600
Subject: [PATCH 12/25] another argument misnamed

---
 modules/cell-type-consensus/resources/usr/bin/save-coldata.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/cell-type-consensus/resources/usr/bin/save-coldata.R b/modules/cell-type-consensus/resources/usr/bin/save-coldata.R
index 70601d9..c754977 100755
--- a/modules/cell-type-consensus/resources/usr/bin/save-coldata.R
+++ b/modules/cell-type-consensus/resources/usr/bin/save-coldata.R
@@ -36,7 +36,7 @@ suppressPackageStartupMessages({
 # Extract colData --------------------------------------------------------------
 
 # read in sce
-sce <- readr::read_rds(opt$sce_file)
+sce <- readr::read_rds(opt$input_sce_file)
 
 # extract ids
 library_id <- metadata(sce)$library_id

From baa20a699b301a7d5c1e8768a39f0d490e5f58bf Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 16:05:31 -0600
Subject: [PATCH 13/25] account for empty files because of cell lines

---
 .../resources/usr/bin/combine-celltype-tables.R              | 5 ++++-
 modules/cell-type-consensus/resources/usr/bin/save-coldata.R | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R b/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
index 37e0280..1f3b5ff 100755
--- a/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
+++ b/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
@@ -49,7 +49,10 @@ stopifnot(
 )
 
 # list of paths to tsv files
-all_files <- unlist(stringr::str_split(opt$input_tsv_files, ","))
+input_sce_files <- unlist(stringr::str_split(opt$input_tsv_files, ","))
+# check if any are empty, if so remove them
+missing_files <- file.size(input_sce_files) > 0
+all_files <- all_files[!missing_files]
 
 # read in ref files
 # change names for panglao ref to match what's in the consensus file
diff --git a/modules/cell-type-consensus/resources/usr/bin/save-coldata.R b/modules/cell-type-consensus/resources/usr/bin/save-coldata.R
index c754977..a98d548 100755
--- a/modules/cell-type-consensus/resources/usr/bin/save-coldata.R
+++ b/modules/cell-type-consensus/resources/usr/bin/save-coldata.R
@@ -52,7 +52,10 @@ project_id <- metadata(sce)$project_id
 is_cell_line <- all(metadata(sce)$sample_type == "cell line")
 
 # only create and write table for non-cell line samples
-if (!is_cell_line) {
+if (is_cell_line) {
+  # make an empty output file; empty files are dropped when tables are combined
+  file.create(opt$output_file)
+} else {
   # get df with ids, barcodes, and cell type assignments
   celltype_df <- colData(sce) |>
     as.data.frame() |>

From 1bf5b2ea3f0f3acaac6e55dcb19dce3d55c4e82a Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 16:21:53 -0600
Subject: [PATCH 14/25] add missing params

---
 modules/cell-type-consensus/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/cell-type-consensus/main.nf b/modules/cell-type-consensus/main.nf
index e89936d..56aeb01 100644
--- a/modules/cell-type-consensus/main.nf
+++ b/modules/cell-type-consensus/main.nf
@@ -60,7 +60,7 @@ process assign_consensus {
     combine-celltype-tables.R \
       --input_tsv_files ${input_files} \
       --panglao_ref_file ${params.panglao_ref_file} \
-      --consensus_ref_file ${consensus_ref_file} \
+      --consensus_ref_file ${params.consensus_ref_file} \
       --output_file ${consensus_output_file}
     """
 

From ce05d63b0d07602ba1432691ad8e6fb57f738e54 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 14 Jan 2025 16:40:02 -0600
Subject: [PATCH 15/25] account for more than one library per sample

---
 modules/cell-type-consensus/main.nf | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/modules/cell-type-consensus/main.nf b/modules/cell-type-consensus/main.nf
index 56aeb01..c985bc6 100644
--- a/modules/cell-type-consensus/main.nf
+++ b/modules/cell-type-consensus/main.nf
@@ -20,13 +20,13 @@ process save_celltypes {
   script:
     output_files = library_files
       .collect{
-        it.name.replaceAll(/(?i).rds$/, "__original-cell-types.tsv")
+        it.name.replaceAll(/(?i).rds$/, "_original-cell-types.tsv")
       }
     """
     for file in ${library_files}; do
       save-coldata.R \
         --input_sce_file \$file \
-        --output_file \$(basename \${file%.rds}__original-cell-types.tsv)
+        --output_file \$(basename \${file%.rds}_original-cell-types.tsv)
     done
     """
 
@@ -90,6 +90,10 @@ workflow cell_type_consensus {
 
     cell_type_files_ch = save_celltypes.out
       .groupTuple(by: 0) // group by project id
+      .map{ project_id, celltype_files -> tuple(
+        project_id,
+        celltype_files.flatten() // get rid of nested tuple that occurs when more than one library maps to a sample
+      )}
 
     // assign consensus cell types by project
     assign_consensus(cell_type_files_ch)

From 4f22c416d2d88ccf218eebd4eab6d27556e7985d Mon Sep 17 00:00:00 2001
From: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com>
Date: Wed, 15 Jan 2025 09:31:04 -0600
Subject: [PATCH 16/25] Apply suggestions from code review

Co-authored-by: Joshua Shapiro <josh.shapiro@ccdatalab.org>
---
 modules/cell-type-consensus/main.nf | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/modules/cell-type-consensus/main.nf b/modules/cell-type-consensus/main.nf
index c985bc6..a3bcc79 100644
--- a/modules/cell-type-consensus/main.nf
+++ b/modules/cell-type-consensus/main.nf
@@ -51,6 +51,8 @@ process assign_consensus {
   input:
     tuple val(project_id),
           path(cell_type_files)
+    path panglao_ref
+    path consensus_ref
   output:
     path consensus_output_file
   script:
@@ -59,8 +61,8 @@ process assign_consensus {
     """
     combine-celltype-tables.R \
       --input_tsv_files ${input_files} \
-      --panglao_ref_file ${params.panglao_ref_file} \
-      --consensus_ref_file ${params.consensus_ref_file} \
+      --panglao_ref_file ${panglao_ref} \
+      --consensus_ref_file ${consensus_ref} \
       --output_file ${consensus_output_file}
     """
 
@@ -96,7 +98,7 @@ workflow cell_type_consensus {
       )}
 
     // assign consensus cell types by project
-    assign_consensus(cell_type_files_ch)
+    assign_consensus(cell_type_files_ch, file(params.panglao_ref_file), file(params.consensus_ref_file))
 
   emit:
     assign_consensus.out // [project_id, consensus_output_file]

From 7dead7ce0b3f1f319c5f66efe8960f0559dc87c0 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Wed, 15 Jan 2025 09:48:01 -0600
Subject: [PATCH 17/25] fix typo with all files variable

---
 .../resources/usr/bin/combine-celltype-tables.R                 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R b/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
index 1f3b5ff..87dcf86 100755
--- a/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
+++ b/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
@@ -52,7 +52,7 @@ stopifnot(
 input_sce_files <- unlist(stringr::str_split(opt$input_tsv_files, ","))
 # check if any are empty, if so remove them
 missing_files <- file.size(input_sce_files) > 0
-all_files <- all_files[!missing_files]
+all_files <- input_sce_files[!missing_files]
 
 # read in ref files
 # change names for panglao ref to match what's in the consensus file

From 636374195342d91b4a5a6e41646ce459e594b2c1 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com>
Date: Wed, 15 Jan 2025 11:03:21 -0600
Subject: [PATCH 18/25] Apply suggestions from code review

Co-authored-by: Joshua Shapiro <josh.shapiro@ccdatalab.org>
---
 modules/cell-type-consensus/main.nf | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/modules/cell-type-consensus/main.nf b/modules/cell-type-consensus/main.nf
index a3bcc79..a11bca4 100644
--- a/modules/cell-type-consensus/main.nf
+++ b/modules/cell-type-consensus/main.nf
@@ -15,7 +15,8 @@ process save_celltypes {
           val(project_id),
           path(library_files)
   output:
-    tuple val(project_id),
+    tuple val(sample_id),
+          val(project_id),
           path(output_files)
   script:
     output_files = library_files
@@ -91,8 +92,8 @@ workflow cell_type_consensus {
     save_celltypes(libraries_ch)
 
     cell_type_files_ch = save_celltypes.out
-      .groupTuple(by: 0) // group by project id
-      .map{ project_id, celltype_files -> tuple(
+      .groupTuple(by: 1) // group by project id
+      .map{sample_ids, project_id, celltype_files -> tuple(
         project_id,
         celltype_files.flatten() // get rid of nested tuple that occurs when more than one library maps to a sample
       )}

From 621960720468272e72f174eab177c92e283941aa Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Wed, 15 Jan 2025 11:10:39 -0600
Subject: [PATCH 19/25] use raw github link

---
 modules/cell-type-consensus/main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/cell-type-consensus/main.nf b/modules/cell-type-consensus/main.nf
index a11bca4..6c226f0 100644
--- a/modules/cell-type-consensus/main.nf
+++ b/modules/cell-type-consensus/main.nf
@@ -3,8 +3,8 @@
 // Workflow to assign consensus cell type labels
 
 // module parameters
-params.panglao_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv')
-params.consensus_ref_file = file('https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv')
+params.panglao_ref_file = file('https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv')
+params.consensus_ref_file = file('https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv')
 
 process save_celltypes {
   container params.consensus_cell_type_container

From c20291bb5a986f7c14a9585a724ab620a4c73bab Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Wed, 15 Jan 2025 11:11:52 -0600
Subject: [PATCH 20/25] fully fix link

---
 modules/cell-type-consensus/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/cell-type-consensus/main.nf b/modules/cell-type-consensus/main.nf
index 6c226f0..85ab4da 100644
--- a/modules/cell-type-consensus/main.nf
+++ b/modules/cell-type-consensus/main.nf
@@ -3,7 +3,7 @@
 // Workflow to assign consensus cell type labels
 
 // module parameters
-params.panglao_ref_file = file('https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/blob/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv')
+params.panglao_ref_file = file('https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv')
 params.consensus_ref_file = file('https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv')
 
 process save_celltypes {

From 18f227d5d2f32747e9a2cbf9e4bdf7ec05259dd7 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Wed, 15 Jan 2025 11:17:42 -0600
Subject: [PATCH 21/25] add module params config

---
 config/module_params.config         | 14 ++++++++++++++
 modules/cell-type-consensus/main.nf |  4 ----
 modules/merge-sce/main.nf           |  5 -----
 nextflow.config                     |  3 +++
 4 files changed, 17 insertions(+), 9 deletions(-)
 create mode 100644 config/module_params.config

diff --git a/config/module_params.config b/config/module_params.config
new file mode 100644
index 0000000..458ff0d
--- /dev/null
+++ b/config/module_params.config
@@ -0,0 +1,14 @@
+// Module-specific parameters are defined here
+params{
+
+  // merge sce parameters
+  reuse_merge = false
+  max_merge_libraries = 75 // maximum number of libraries to merge (current number is a guess, based on 59 working, but 104 not)
+  num_hvg = 2000 // number of HVGs to select
+
+
+  // cell type consensus
+  panglao_ref_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv'
+  consensus_ref_file = 'https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv'
+
+}
diff --git a/modules/cell-type-consensus/main.nf b/modules/cell-type-consensus/main.nf
index 85ab4da..54e9afc 100644
--- a/modules/cell-type-consensus/main.nf
+++ b/modules/cell-type-consensus/main.nf
@@ -2,10 +2,6 @@
 
 // Workflow to assign consensus cell type labels
 
-// module parameters
-params.panglao_ref_file = file('https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/panglao-cell-type-ontologies.tsv')
-params.consensus_ref_file = file('https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/b870a082bc9acd3536c5f8d2d52550d8fe8a4239/analyses/cell-type-consensus/references/consensus-cell-type-reference.tsv')
-
 process save_celltypes {
   container params.consensus_cell_type_container
   tag "${sample_id}"
diff --git a/modules/merge-sce/main.nf b/modules/merge-sce/main.nf
index 968e8cf..21796c3 100644
--- a/modules/merge-sce/main.nf
+++ b/modules/merge-sce/main.nf
@@ -3,11 +3,6 @@
 // Workflow to merge SCE objects into a single object.
 // This workflow does NOT perform integration, i.e. batch correction.
 
-// module parameters
-params.reuse_merge = false
-params.max_merge_libraries = 75 // maximum number of libraries to merge (current number is a guess, based on 59 working, but 104 not)
-params.num_hvg = 2000 // number of HVGs to select
-
 // merge workflow variables
 def module_name = "merge-sce"
 def publish_merge_base = "${params.results_bucket}/${params.release_prefix}/${module_name}"
diff --git a/nextflow.config b/nextflow.config
index 0aacaec..27caa44 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -27,6 +27,9 @@ includeConfig 'config/process_base.config'
 // Load container definitions
 includeConfig 'config/containers.config'
 
+// include module specific parameters
+includeConfig 'config/module_params.config'
+
 profiles {
   standard {
     process {

From 08306e9845f568826ae7fd448f8f1f0d01bfdebb Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Wed, 15 Jan 2025 11:57:07 -0600
Subject: [PATCH 22/25] switch logical for missing files

---
 .../resources/usr/bin/combine-celltype-tables.R                 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R b/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
index 87dcf86..f562fd6 100755
--- a/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
+++ b/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
@@ -51,7 +51,7 @@ stopifnot(
 # list of paths to tsv files
 input_sce_files <- unlist(stringr::str_split(opt$input_tsv_files, ","))
 # check if any are empty, if so remove them
-missing_files <- file.size(input_sce_files) > 0
+missing_files <- file.size(input_sce_files) == 0
 all_files <- input_sce_files[!missing_files]
 
 # read in ref files

From 4cf30cb49f58ce6dc05a398f488fb7e4b5ddc20c Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Wed, 15 Jan 2025 12:35:06 -0600
Subject: [PATCH 23/25] account for entire projects with cell lines

---
 .../usr/bin/combine-celltype-tables.R         | 50 +++++++++++--------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R b/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
index f562fd6..dc79f5f 100755
--- a/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
+++ b/modules/cell-type-consensus/resources/usr/bin/combine-celltype-tables.R
@@ -91,25 +91,31 @@ blueprint_df <- data.frame(
 
 # Create combined TSV ----------------------------------------------------------
 
-# read in TSV files and combine into a single df
-all_cells_df <- all_files |>
-  purrr::map(readr::read_tsv) |>
-  dplyr::bind_rows() |>
-  # add columns for panglao ontology and consensus
-  # first add panglao ontology
-  dplyr::left_join(panglao_ref_df, by = c("cellassign_celltype_annotation" = "original_panglao_name")) |>
-  # now add in all the blueprint columns
-  dplyr::left_join(blueprint_df, by = c("singler_celltype_ontology" = "blueprint_ontology")) |>
-  # then add consensus labels
-  dplyr::left_join(consensus_ref_df,
-    by = c(
-      "singler_celltype_ontology" = "blueprint_ontology",
-      "cellassign_celltype_annotation" = "original_panglao_name",
-      "panglao_ontology"
-    )
-  ) |>
-  # use unknown for NA annotation but keep ontology ID as NA
-  dplyr::mutate(consensus_annotation = dplyr::if_else(is.na(consensus_annotation), "Unknown", consensus_annotation))
-
-# export file
-readr::write_tsv(all_cells_df, opt$output_file)
+# account for all samples being cell lines and no cell type annotations being present
+if (length(all_files) == 0) {
+  # make an empty output file
+  file.create(opt$output_file)
+} else {
+  # read in TSV files and combine into a single df
+  all_cells_df <- all_files |>
+    purrr::map(readr::read_tsv) |>
+    dplyr::bind_rows() |>
+    # add columns for panglao ontology and consensus
+    # first add panglao ontology
+    dplyr::left_join(panglao_ref_df, by = c("cellassign_celltype_annotation" = "original_panglao_name")) |>
+    # now add in all the blueprint columns
+    dplyr::left_join(blueprint_df, by = c("singler_celltype_ontology" = "blueprint_ontology")) |>
+    # then add consensus labels
+    dplyr::left_join(consensus_ref_df,
+      by = c(
+        "singler_celltype_ontology" = "blueprint_ontology",
+        "cellassign_celltype_annotation" = "original_panglao_name",
+        "panglao_ontology"
+      )
+    ) |>
+    # use unknown for NA annotation but keep ontology ID as NA
+    dplyr::mutate(consensus_annotation = dplyr::if_else(is.na(consensus_annotation), "Unknown", consensus_annotation))
+
+  # export file
+  readr::write_tsv(all_cells_df, opt$output_file)
+}

From 9a838b92c0688c8b0dc92f6351079b1eaf8dbce3 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Wed, 15 Jan 2025 13:14:02 -0600
Subject: [PATCH 24/25] Revert "comment out other modules for faster testing"

This reverts commit d3b0ad141028698a8d597c2d163a38acd95a36f8.
---
 main.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index be536ab..a408c0d 100644
--- a/main.nf
+++ b/main.nf
@@ -51,13 +51,13 @@ workflow {
     .filter{ run_all || it[1] in project_ids }
 
   // Run the merge workflow
-  //merge_sce(sample_ch)
+  merge_sce(sample_ch)
 
   // Run the doublet detection workflow
-  //detect_doublets(sample_ch)
+  detect_doublets(sample_ch)
 
   // Run the seurat conversion workflow
-  //seurat_conversion(sample_ch)
+  seurat_conversion(sample_ch)
 
   // Run the consensus cell type workflow
   cell_type_consensus(sample_ch)

From 333f444df15b99131996da6cea9a1f7ef9f143eb Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Wed, 15 Jan 2025 13:14:15 -0600
Subject: [PATCH 25/25] Revert "temporarily terminate if fail"

This reverts commit b3caf00b97feb0a98417c2d678c63354a98d16e4.
---
 modules/cell-type-consensus/main.nf | 2 --
 1 file changed, 2 deletions(-)

diff --git a/modules/cell-type-consensus/main.nf b/modules/cell-type-consensus/main.nf
index 54e9afc..b10408d 100644
--- a/modules/cell-type-consensus/main.nf
+++ b/modules/cell-type-consensus/main.nf
@@ -5,7 +5,6 @@
 process save_celltypes {
   container params.consensus_cell_type_container
   tag "${sample_id}"
-  errorStrategy 'terminate'
   input:
     tuple val(sample_id),
           val(project_id),
@@ -41,7 +40,6 @@ process save_celltypes {
 
 process assign_consensus {
   container params.consensus_cell_type_container
-  errorStrategy 'terminate'
   tag "${project_id}"
   label 'mem_8'
   publishDir "${params.results_bucket}/${params.release_prefix}/cell-type-consensus", mode: 'copy'