Potential fix for #378.

lima1 · Dec 11, 2024 · 7f40f7c · 7f40f7c
1 parent 2394ee7
commit 7f40f7c
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 3 deletions.
diff --git a/NEWS b/NEWS
@@ -1,3 +1,10 @@
+Changes in version 2.14.0
+-------------------------
+
+BUGFIXES
+    o Fixed large runtime of readAllelicCounts. Thanks luyh-xp (#378).
+
+
 Changes in version 2.12.0
 -------------------------
 

diff --git a/R/readAllelicCountsFile.R b/R/readAllelicCountsFile.R
@@ -68,15 +68,20 @@ readAllelicCountsFile <- function(file, format, zero=NULL) {
     if (!is.null(zero)) flog.warn("zero ignored for GATK4 files.")
     con <- file(file, open = "r")
     header <- .parseGATKHeader(con)
-    inputCounts <- read.delim(con, header = FALSE, stringsAsFactors = FALSE)
+    inputCounts <- try(read.delim(con, header = FALSE, stringsAsFactors = FALSE))
+    if (is(inputCounts, "try-error")) {
+        .stopUserError("Error reading AllelicCountsFile ", file)
+    }
     colnames(inputCounts) <- strsplit(header$last_line, "\t")[[1]]
     close(con)
     gr <- GRanges(seqnames = inputCounts$CONTIG, IRanges(start = inputCounts$POSITION, end = inputCounts$POSITION))
     vcf <- VCF(gr, 
                 colData = DataFrame(Samples = 1, row.names = header$sid),
                 exptData = list(header = VCFHeader(samples = header$sid)))
     ref(vcf) <- DNAStringSet(inputCounts$REF_NUCLEOTIDE)
-    alt(vcf) <- DNAStringSetList(lapply(inputCounts$ALT_NUCLEOTIDE, DNAStringSet))
+    #alt(vcf) <- DNAStringSetList(split(inputCounts$ALT_NUCLEOTIDE, seq(length(vcf))))
+    alt(vcf) <- DNAStringSetList(as.list(inputCounts$ALT_NUCLEOTIDE))
+
     info(header(vcf)) <- DataFrame(
         Number = "0",
         Type = "Flag",

diff --git a/inst/extdata/example_allelic_counts_empty.tsv b/inst/extdata/example_allelic_counts_empty.tsv
@@ -0,0 +1,28 @@
+@HD	VN:1.6
+@SQ	SN:chr1	LN:249250621
+@SQ	SN:chr2	LN:243199373
+@SQ	SN:chr3	LN:198022430
+@SQ	SN:chr4	LN:191154276
+@SQ	SN:chr5	LN:180915260
+@SQ	SN:chr6	LN:171115067
+@SQ	SN:chr7	LN:159138663
+@SQ	SN:chr8	LN:146364022
+@SQ	SN:chr9	LN:141213431
+@SQ	SN:chr10	LN:135534747
+@SQ	SN:chr11	LN:135006516
+@SQ	SN:chr12	LN:133851895
+@SQ	SN:chr13	LN:115169878
+@SQ	SN:chr14	LN:107349540
+@SQ	SN:chr15	LN:102531392
+@SQ	SN:chr16	LN:90354753
+@SQ	SN:chr17	LN:81195210
+@SQ	SN:chr18	LN:78077248
+@SQ	SN:chr19	LN:59128983
+@SQ	SN:chr20	LN:63025520
+@SQ	SN:chr21	LN:48129895
+@SQ	SN:chr22	LN:51304566
+@SQ	SN:chrX	LN:155270560
+@SQ	SN:chrY	LN:59373566
+@SQ	SN:chrM	LN:16571
+@RG	ID:PureCN	SM:LIB-02240e4
+CONTIG	POSITION	REF_COUNT	ALT_COUNT	REF_NUCLEOTIDE	ALT_NUCLEOTIDE
diff --git a/tests/testthat/test_readAllelicCountsFile.R b/tests/testthat/test_readAllelicCountsFile.R
@@ -2,6 +2,7 @@ context("readAllelicCountsFile")
 
 vcf.file <- system.file("extdata", "example.vcf.gz", package = "PureCN")
 ac.file <- system.file("extdata", "example_allelic_counts.tsv", package = "PureCN")
+ac.empty.file <- system.file("extdata", "example_allelic_counts_empty.tsv", package = "PureCN")
 vcf <- readVcf(vcf.file, "hg19")
 data(purecn.example.output)
 normal.coverage.file <- system.file('extdata', 'example_normal.txt.gz', 
@@ -12,6 +13,7 @@ tumor.coverage.file <- system.file('extdata', 'example_tumor.txt.gz',
 test_that("example parses correctly", {
     vcf_ac <- readAllelicCountsFile(ac.file)
     expect_equal(as.character(ref(vcf_ac)), as.character(ref(head(vcf,20))))
+    expect_error(readAllelicCountsFile(ac.empty.file), "Error reading AllelicCountsFile")
 })
 
 test_that("parsing -> writing -> parsing works", {
@@ -29,4 +31,3 @@ test_that("parsing -> writing -> parsing works", {
     expect_true(length(ret$results) > 0)
     file.remove(output.file)
 })
-