Merge pull request #86 from thibautjombart/SNPbin-fix

SNPbin fix
thibautjombart · Aug 18, 2015 · da6043a · da6043a
2 parents dff44a6 + 0ec7ff4
commit da6043a
Show file tree

Hide file tree

Showing 3 changed files with 135 additions and 25 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -13,8 +13,10 @@ BUG FIXES
 	instead, computations are done using all available loci
 
 	o adegenetTutorial now opens up-to-date tutorials
-
-
+
+	o subsetting genlight objects now treats missing data appropriately 
+	when given logical, character, or negative subscripts. 
+	(See issue #83 (https://github.com/thibautjombart/adegenet/issues/83))
 
 			CHANGES IN ADEGENET VERSION 2.0.0
 CHANGES IN GENIND/GENPOP/GENLIGHT OBJECTS

diff --git a/R/glHandle.R b/R/glHandle.R
@@ -17,32 +17,81 @@
     return(x)
 }
 
+# Zhian N. Kamvar
+# Mon Aug 17 09:39:12 2015 ------------------------------
+# 
+# This function takes two steps:
+#   1. Subset the missing positions
+#   2. Subset the vectors of raw SNPs
+# 
+# Both steps are not exactly straighforward. Because the missing vector only
+# represents the positions of missing data, it must be subset by value as
+# opposed to position.
 .SNPbinset <- function(x, i){
-    if (missing(i)) i <- TRUE
-    n.loc     <- x@n.loc
-    if (length(x@NA.posi) > 0){
-        namatches <- match(i, x@NA.posi, nomatch = 0)
+  if (missing(i)) i <- TRUE
+
+  # Create a logical value indicating whether or not subsetting is necessary.
+  we_take_all <- length(i) == 1 && is.logical(i) && i
+  n.loc <- x@n.loc
+  if (length(x@NA.posi) > 0){
+    if (is.logical(i)){
+      if (we_take_all){
+        # Keep all of the data
+        return(x)
+      } else {
+        # If the positons are logical, perhaps the best way to address this is
+        # to match the TRUE positions to the NA.posi vector. Adding nomatch = 0 
+        # avoids introducing NAs.
+        namatches <- match(which(i), x@NA.posi, nomatch = 0)
         nas.kept  <- x@NA.posi[namatches]
-        if (length(nas.kept) > 0){
-            old.posi  <- 1:n.loc
-            x@NA.posi <- match(nas.kept, old.posi[i])
-        } else {
-            x@NA.posi <- nas.kept
-        }
+      }
+    } else if (is.character(i)){
+      stop("Cannot subset a SNPbin object with a character vector", call. = FALSE)
+    } else if (all(i < 0)){
+      # For negative subscripts, find which ones they match and then
+      # negate those. Luckily -0 is allowed.
+      namatches <- match(abs(i), x@NA.posi, nomatch = 0)
+      # Unfortunately, if nothing matches, then the default are zeroes. When you
+      # subset a vector in R with only zero, you will get an empty vector. This
+      # conditional makes sure that NA positions are retained.
+      if (all(namatches == 0)){
+        nas.kept  <- x@NA.posi
+      } else {
+        nas.kept  <- x@NA.posi[-namatches]
+      }
+
+    } else if (all(i > 0)){
+      # Positive subscripts are much easier. First you find where the subscripts
+      # match and then your subset with those positions. 
+      namatches <- match(i, x@NA.posi, nomatch = 0)
+      nas.kept  <- x@NA.posi[namatches]
+    } else {
+      stop("Cannot subset a SNPbin with mixed subscripts.", call. = FALSE)
     }
-    if (length(i) == 1 && is.logical(i) && i){
-        return(x)
-    } else if (all(is.logical(i))){
-        n.loc <- sum(i)
-    } else if (any(i < 0)){
-        n.loc <- n.loc - length(i)
+    # After we find out which missing positions we need to keep, we reset the 
+    # missing positions to the subset data.
+    if (length(nas.kept) > 0){
+      old.posi  <- 1:n.loc
+      x@NA.posi <- match(nas.kept, old.posi[i])
     } else {
-        n.loc <- length(i)
+      x@NA.posi <- nas.kept
     }
-    x@snp     <- lapply(x@snp, .subsetbin, i)
-    x@n.loc   <- n.loc
-
+  }
+  # Here we calculate the number of loci we will have left in the data.
+  if (we_take_all){
     return(x)
+  } else if (all(is.logical(i))){
+    n.loc <- sum(i)
+  } else if (any(i < 0)){
+    n.loc <- n.loc - length(i)
+  } else {
+    n.loc <- length(i)
+  }
+  # Now we loop over all chromosomes and subset.
+  x@snp   <- lapply(x@snp, .subsetbin, i)
+  # Set the new value of the number of loci and return.
+  x@n.loc <- n.loc
+  return(x)
 }
 
 ###############
@@ -123,12 +172,15 @@ setMethod("[", signature(x="genlight", i="ANY", j="ANY", drop="ANY"), function(x
     ## SUBSET LOCI ##
 
     ## handle ind.names, loc.names, chromosome, position, and alleles
+    if (is.character(j)){
+      j <- match(j, x@loc.names, nomatch = 0)
+    }
     x@loc.names   <- x@loc.names[j]
     x@chromosome  <- chr(x)[j]
     x@position    <- position(x)[j]
     x@loc.all     <- alleles(x)[j]
-    x@gen       <- lapply(x@gen, function(e) e[j])
-    x@n.loc      <- x@gen[[1]]@n.loc
+    x@gen         <- lapply(x@gen, function(e) e[j])
+    x@n.loc       <- x@gen[[1]]@n.loc
 
     return(x)
 }) # end [] for genlight

diff --git a/tests/testthat/test_genlight.R b/tests/testthat/test_genlight.R
@@ -33,4 +33,60 @@ test_that("population accessors work", {
   expect_that(popNames(x), equals(c("replacement", "pop2")))
   expect_error(popNames(x) <- NULL)
   expect_error(popNames(x)[2] <- NA)
-})
+})
+
+x <- "
+X13049 X13050 X13051 X13052 X13053
+AA36881      2     NA      2      2      2
+AA36883      2      2      2      2      2
+AA36884      2      2      2      2      2
+AA36802     NA      2      2      2      2
+AA36803      2      2      2      2      2
+AA36804      2     NA      2      2      2
+AA36181      2     NA      2      2      2
+AA36183      2      2      2      2      2"
+
+xxdf <- read.table(text = x)
+xx   <- new("genlight", xxdf, parallel = FALSE)
+pop(xx) <- rep(LETTERS[1:2], each = 4)
+
+test_that("missing data is properly subset with logical subscripts", {
+  skip_on_cran()
+  Apop <- pop(xx) == "A"
+  Bpop <- pop(xx) == "B"
+  expect_identical(NA.posi(xx), NA.posi(xx[]))
+  expect_identical(xxdf[Apop, ], as.data.frame(xx[Apop, ]))
+  expect_identical(xxdf[Bpop, ], as.data.frame(xx[Bpop, ]))
+  keepers <- c(FALSE, rep(TRUE, 4))
+  expect_identical(xxdf[keepers], as.data.frame(xx[, keepers]))
+})
+
+test_that("missing data is properly subset with positive subscripts", {
+  skip_on_cran()
+  rl <- sample(5)
+  # Can subset single locus
+  expect_identical(xxdf[, 1, drop = FALSE], as.data.frame(xx[, 1]))
+  # Can subset range of loci
+  expect_identical(xxdf[, 1:3, drop = FALSE], as.data.frame(xx[, 1:3]))
+  # Can subset by position
+  expect_identical(xxdf[, rl, drop = FALSE], as.data.frame(xx[, rl]))
+})
+
+test_that("missing data is properly subset with negative subscripts", {
+  skip_on_cran()
+  expect_identical(xxdf[, -1], as.data.frame(xx[, -1]))
+  expect_identical(xxdf[, -c(1, 3)], as.data.frame(xx[, -c(1, 3)]))
+})
+
+test_that("missing data is properly subset with a character vector", {
+  skip_on_cran()
+  lnames <- locNames(xx)
+  rl     <- sample(lnames)
+  expect_identical(xxdf[, rl], as.data.frame(xx[, rl]))
+  expect_identical(xxdf[, lnames[1:2]], as.data.frame(xx[, lnames[1:2]]))
+})
+
+test_that("genlight objects do not take a mixture of positive and negative subscripts", {
+  skip_on_cran()
+  expect_error(xx[, c(2, -1)], "subscripts.")
+})