Skip to content

Commit

Permalink
Merge pull request #3 from aertslab/feature/split-fragments-allow-bar…
Browse files Browse the repository at this point in the history
…codes-to-map-to-multiple-types

Feature/split fragments allow barcodes to map to multiple types
  • Loading branch information
SeppeDeWinter authored Feb 9, 2024
2 parents a804ee8 + 3fc6ee0 commit 004a265
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 25 deletions.
7 changes: 5 additions & 2 deletions rust/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,13 @@ fn split_fragments_by_cell_barcode(
verbose: bool,
) -> PyResult<()> {
// Invert cell_type_to_cell_barcodes
let mut cell_barcode_to_cell_type: HashMap<String, String> = HashMap::new();
let mut cell_barcode_to_cell_type: HashMap<String, Vec<String>> = HashMap::new();
for (cell_type, cell_barcodes) in cell_type_to_cell_barcodes.iter() {
for cell_barcode in cell_barcodes.iter() {
cell_barcode_to_cell_type.insert(cell_barcode.to_string(), cell_type.to_string());
cell_barcode_to_cell_type
.entry(cell_barcode.to_string())
.or_insert(Vec::new())
.push(cell_type.to_string());
}
}
split_fragments::split_fragments_by_cell_barcode(
Expand Down
18 changes: 12 additions & 6 deletions rust/src/split_fragments.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ fn sanitize_string_for_filename(s: String) -> String {
pub fn split_fragments_by_cell_barcode(
path_to_fragments: &String,
path_to_output_folder: &String,
cell_barcode_to_cell_type: HashMap<String, String>,
cell_barcode_to_cell_type: HashMap<String, Vec<String>>,
chromsizes: HashMap<String, u64>,
number_of_threads: u32,
verbose: bool,
Expand All @@ -101,7 +101,11 @@ pub fn split_fragments_by_cell_barcode(
)
});
let mut cell_type_to_writer: HashMap<&String, LazyBgzfWriter> = HashMap::new();
let unique_cell_types: Vec<&String> = cell_barcode_to_cell_type.values().unique().collect();
let unique_cell_types: Vec<&String> = cell_barcode_to_cell_type
.values()
.flatten()
.unique()
.collect();
for cell_type in unique_cell_types {
let cell_type_name = sanitize_string_for_filename(cell_type.clone().to_string());
let path_to_output = format!(
Expand Down Expand Up @@ -145,10 +149,12 @@ pub fn split_fragments_by_cell_barcode(
// loop over reads
while not_at_end {
let read_cb = read_as_str.split('\t').nth(3).unwrap().to_string();
if let Some(cell_type) = cell_barcode_to_cell_type.get(&read_cb) {
let writer = cell_type_to_writer.get_mut(cell_type).unwrap();
writer.write(&read).unwrap();
writer.write(b"\n").unwrap();
if let Some(cell_types) = cell_barcode_to_cell_type.get(&read_cb) {
for cell_type in cell_types {
let writer = cell_type_to_writer.get_mut(cell_type).unwrap();
writer.write(&read).unwrap();
writer.write(b"\n").unwrap();
}
}
read.clear();
not_at_end = tbx_reader.read(&mut read).unwrap();
Expand Down
1 change: 0 additions & 1 deletion tests/split/cell_type_annotation.tsv
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
sample cell_type cell_barcode
A type_1 TTAGCTTAGGAGAACA-1
A type_1 TTAGCTTAGGAGAACA-1
A type_1 ATATTCCTCTTGTACT-1
A type_2 TGTGACAGTACAACGG-1
A type_2 CATGCCTTCTCTGACC-1
Expand Down
30 changes: 30 additions & 0 deletions tests/split/cell_type_annotation_one_bc_multiple_types.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
sample cell_type cell_barcode
A type_1 TTAGCTTAGGAGAACA-1
A type_1 ATATTCCTCTTGTACT-1
A type_2 TTAGCTTAGGAGAACA-1
A type_2 CATGCCTTCTCTGACC-1
A type_2 ATCGAGTAGGTTCGAG-1
A type_3 CTCTCAGGTCCCTTTG-1
A type_3 TTCGGTCTCACGTGTA-1
A type_3 GTGACATCATTGTTCT-1
A type_4 AAGGAGCCATCGACCG-1
A type_4 ACCAAACTCTTAAGCG-1
A type_4 CATTGGATCTCTTCCT-1
A type_5 AGGCGAAAGGTCTTTG-1
A type_5 AACGAGGCATCATGTG-1
A type_5 CTACTTAGTCATGAGG-1
B type_1 ATTACCTGTGTGCTTA-1
B type_1 CATAACGTCGGTTGTA-1
B type_1 ATGTCTTTCGGTCCGA-1
B type_2 CAATCCCGTAGCGTTT-1
B type_2 GCCATAATCATCGCTC-1
B type_2 CAACGTAAGGCAAGGG-1
B type_3 TATCGAGGTTGCCGCA-1
B type_3 CATTCCGAGCTAACAA-1
B type_3 CCTCCCTGTAAAGCTA-1
B type_4 CAAGCTACACGTTGTA-1
B type_4 CTGCTCATCACAAGCT-1
B type_4 CAGTATGGTTCTTTGT-1
B type_5 TTGCACCCATTAGCAC-1
B type_5 CAGCCTTCATCTCTCG-1
B type_5 TTGCGGGTCGTCAACA-1
68 changes: 52 additions & 16 deletions tests/split/test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,48 +6,77 @@

TEST_DIRECTORY = pathlib.Path(__file__).parent.absolute()

# Fixture file names for the "all barcodes map to a single type" scenario.
# The tests join each value onto TEST_DIRECTORY to locate the file on disk.
FILES_ALL_BARCODES_MAPPING_TO_SINGLE_TYPE = dict(
    [
        ("a.fragments", "a.fragments.tsv.gz"),
        ("a.fragment_index", "a.fragments.tsv.gz.tbi"),
        ("b.fragments", "b.fragments.tsv.gz"),
        ("b.fragment_index", "b.fragments.tsv.gz.tbi"),
        ("sample_to_fragment", "sample_to_fragment.tsv"),
        ("cell_type_annotation", "cell_type_annotation.tsv"),
        ("chrom_sizes", "hg38.chrom.sizes"),
    ]
)

# Same fixtures, except for the annotation table: this one lists at least one
# barcode under more than one cell type. Overriding an existing key in the
# unpacked copy keeps the key order identical to the dict above.
FILES_SOME_BARCODES_MAPPING_TO_MULTIPLE_TYPES = {
    **FILES_ALL_BARCODES_MAPPING_TO_SINGLE_TYPE,
    "cell_type_annotation": "cell_type_annotation_one_bc_multiple_types.tsv",
}

def test_entrypoint():
    """Check that `scatac_fragment_tools split`, run with no further arguments, exits with status 0."""
    assert os.system("scatac_fragment_tools split") == 0

def run_split_command(tmp_path, output_folder):
os.system(f"cp {TEST_DIRECTORY}/a.fragments.tsv.gz {tmp_path}")
os.system(f"cp {TEST_DIRECTORY}/a.fragments.tsv.gz.tbi {tmp_path}")
os.system(f"cp {TEST_DIRECTORY}/b.fragments.tsv.gz {tmp_path}")
os.system(f"cp {TEST_DIRECTORY}/b.fragments.tsv.gz.tbi {tmp_path}")
os.system(f"cp {TEST_DIRECTORY}/sample_to_fragment.tsv {tmp_path}")
os.system(f"cp {TEST_DIRECTORY}/cell_type_annotation.tsv {tmp_path}")
os.system(f"cp {TEST_DIRECTORY}/hg38.chrom.sizes {tmp_path}")
def run_split_command(tmp_path, output_folder, file_dict):
path_to_a_fragments = os.path.join(TEST_DIRECTORY, file_dict["a.fragments"])
path_to_a_fragment_index = os.path.join(TEST_DIRECTORY, file_dict["a.fragment_index"])
path_to_b_fragments = os.path.join(TEST_DIRECTORY, file_dict["b.fragments"])
path_to_b_fragment_index = os.path.join(TEST_DIRECTORY, file_dict["b.fragment_index"])
path_to_sample_to_fragment = os.path.join(TEST_DIRECTORY, file_dict["sample_to_fragment"])
path_to_cell_type_annotation = os.path.join(TEST_DIRECTORY, file_dict["cell_type_annotation"])
path_to_chrom_sizes = os.path.join(TEST_DIRECTORY, file_dict["chrom_sizes"])
os.system(f"cp {path_to_a_fragments} {tmp_path}")
os.system(f"cp {path_to_a_fragment_index} {tmp_path}")
os.system(f"cp {path_to_b_fragments} {tmp_path}")
os.system(f"cp {path_to_b_fragment_index} {tmp_path}")
os.system(f"cp {path_to_sample_to_fragment} {tmp_path}")
os.system(f"cp {path_to_cell_type_annotation} {tmp_path}")
os.system(f"cp {path_to_chrom_sizes} {tmp_path}")

COMMAND = f"""cd {tmp_path} && \
scatac_fragment_tools split \
-f {tmp_path}/sample_to_fragment.tsv \
-b {tmp_path}/cell_type_annotation.tsv \
-c {tmp_path}/hg38.chrom.sizes \
-f {path_to_sample_to_fragment} \
-b {path_to_cell_type_annotation} \
-c {path_to_chrom_sizes} \
-o {output_folder} \
-t {tmp_path} \
"""
return os.system(COMMAND)

def test_split_command(tmp_path):
def split_command_test_helper(tmp_path, file_dict):
output_folder = os.path.join(tmp_path, "output")
os.makedirs(output_folder, exist_ok=True)
exit_status = run_split_command(tmp_path, output_folder)
exit_status = run_split_command(tmp_path, output_folder, file_dict)
assert exit_status == 0

a_fragments = pl.read_csv(
TEST_DIRECTORY.joinpath("a.fragments.tsv.gz"),
TEST_DIRECTORY.joinpath(file_dict["a.fragments"]),
separator = "\t",
has_header = False
)
b_fragments = pl.read_csv(
TEST_DIRECTORY.joinpath("b.fragments.tsv.gz"),
TEST_DIRECTORY.joinpath(file_dict["b.fragments"]),
separator = "\t",
has_header = False
)
cell_annotations = pl.read_csv(
TEST_DIRECTORY.joinpath("cell_type_annotation.tsv"),
TEST_DIRECTORY.joinpath(file_dict["cell_type_annotation"]),
separator = "\t"
)

for row in cell_annotations \
.select(pl.col("cell_type")) \
.unique() \
Expand All @@ -72,3 +101,10 @@ def test_split_command(tmp_path):
generated_fragments_cell_type
)


def test_split_command_bc_single_type(tmp_path):
    """Run the shared split check against the fixture set named for single-type barcodes."""
    split_command_test_helper(
        tmp_path=tmp_path,
        file_dict=FILES_ALL_BARCODES_MAPPING_TO_SINGLE_TYPE,
    )

def test_split_command_barcode_mapping_multiple_types(tmp_path):
    """Run the shared split check against the fixture set named for multi-type barcodes."""
    split_command_test_helper(
        tmp_path=tmp_path,
        file_dict=FILES_SOME_BARCODES_MAPPING_TO_MULTIPLE_TYPES,
    )

0 comments on commit 004a265

Please sign in to comment.