zrxing
diff --git a/‎Makefile
100644100755 b/‎Makefile
100644100755
diff --git a/‎R/WaveQTL_preprocess_funcs.R
+277 b/‎R/WaveQTL_preprocess_funcs.R
+277
diff --git a/‎R/combine_two_strands_with_correction_for_mappability.R
+58 b/‎R/combine_two_strands_with_correction_for_mappability.R
+58
@@ -0,0 +1,277 @@
+## `WaveQTL_preprocess_funcs.R' contains functions to preprocess functional data for WaveQTL
+## software. 
+## Copyright (C) 2014 Heejung Shim
+##
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+require("wavethresh")
+
+
+##' Filter low count WCs.
+##'
+##'
+##' This function flags wavelet coefficients (WCs) that are computed from very
+##' low counts. A WC is considered low count when the total count over the
+##' interval it is computed from, averaged across individuals, is less than or
+##' equal to `meanR.thresh'.
+##' @param Data matrix (or a vector when N = 1) with N (# of samples) by T (# of bps in a region);
+##' This matrix contains original data before a wavelet transform; T must be a power of 2.
+##' @param meanR.thresh If average reads across individuals <= meanR.thresh,
+##' those WCs are filtered out.
+##' @return filtered.WCs a vector of length T; t-th element indicates whether t-th WC in
+##' output from the function \code{\link{FWT}} is filtered (0) or not (1).
+fiter.WCs <- function(Data, meanR.thresh){
+
+  if(is.vector(Data)){
+    dim(Data) <- c(1, length(Data))
+  }
+  numWCs <- dim(Data)[2]
+  J <- log2(numWCs)
+  if(numWCs < 1 || !isTRUE(all.equal(J, round(J)))){
+    stop("Error: number of columns of Data must be power of 2")
+  }
+  J <- as.integer(round(J))
+
+  # Mean (across individuals) of the total count behind each WC, in FWT order.
+  Mean_R <- rep(NA_real_, numWCs)
+
+  # The scaling coefficient and the coarsest WC both use the whole region.
+  Mean_R[seq_len(min(numWCs, 2))] <- mean(rowSums(Data))
+
+  # At scale ss the region is split into 2^ss equal intervals, one WC each.
+  # seq_len() (rather than 1:(J-1)) skips the loop entirely when T is 1 or 2.
+  posi <- 3
+  for(ss in seq_len(max(J - 1L, 0L))){
+    num_loc <- 2^ss
+    size_int <- numWCs/num_loc
+    st <- (0:(num_loc - 1))*size_int + 1
+    en <- st + size_int - 1
+
+    for(ll in seq_len(num_loc)){
+      # drop = FALSE keeps a matrix when N = 1, so sums stay per individual.
+      Mean_R[posi] <- mean(rowSums(Data[, st[ll]:en[ll], drop = FALSE]))
+      posi <- posi + 1
+    }
+  }
+
+  # 1 = kept, 0 = filtered. which() ignores NA comparisons, so those stay 0.
+  filtered.WCs <- rep(0, numWCs)
+  filtered.WCs[which(Mean_R > meanR.thresh)] <- 1
+
+  return(filtered.WCs)
+}
+
+
+
+
+##' Perform a wavelet transform.
+##'
+##'
+##' This function performs a wavelet transform using the \code{\link{wavethresh}} R package
+##' and returns WCs in the order that corresponds to output from the function
+##' \code{\link{fiter.WCs}}. For now, the function doesn't allow users to specify the level of
+##' wavelet decomposition and uses the maximum level decomposition.
+##' @param Data matrix (or a vector when N = 1) with N (# of samples) by T (# of bps in a region);
+##' This matrix contains original data to be decomposed; T must be a power of 2 (and at least 2).
+##' @param filter.number default=1; argument to the function \code{\link{wd}} in the R package
+##' \code{\link{wavethresh}}; See their manual for details.
+##' @param family default="DaubExPhase"; argument to the function \code{\link{wd}} in the R package
+##' \code{\link{wavethresh}}; See their manual for details.
+##' @return a list with a single element `WCs': a matrix with N (# of samples) by T (# of bps
+##' in a region); n-th row contains WCs for n-th sample; WCs are ordered from low resolution WC
+##' to high resolution WC; For example, with a Haar wavelet transform, the first column contains
+##' WC (precisely speaking, scaling coefficient) that corresponds to sum of data in the region.
+##' The second column contains WC that contrasts the data in the first half vs second half of
+##' the region. The last column contains WC that contrasts the data in the (T-1)-th bp vs T-th bp.
+FWT <- function(Data, filter.number=1, family="DaubExPhase"){
+
+  if(is.vector(Data)){
+    dim(Data) <- c(1, length(Data))
+  }
+  numINDs <- dim(Data)[1]
+  numBPs <- dim(Data)[2]
+  J <- log2(numBPs)
+  if(numBPs < 2 || !isTRUE(all.equal(J, round(J)))){
+    stop("Error: number of columns of Data must be power of 2 (and at least 2)")
+  }
+  J <- as.integer(round(J))
+
+  # Per-sample transform: keep all T-1 detail coefficients plus the last
+  # entry of the C vector returned by wd().
+  dat_D <- matrix(data = NA, nrow = numINDs, ncol = (numBPs - 1))
+  dat_C <- rep(NA, numINDs)
+  for(j in seq_len(numINDs)){
+    each_WT <- wd(Data[j,], filter.number = filter.number, family = family)
+    dat_D[j,] <- each_WT$D
+    dat_C[j] <- each_WT$C[length(each_WT$C)]
+  }
+
+  # Reorder from the layout of wd()'s D vector (coarsest coefficient last) to
+  # low-resolution-first. Detail coefficients are negated, as in the original
+  # ordering convention of this function.
+  dat_W <- matrix(data = NA, nrow = numINDs, ncol = numBPs)
+  dat_W[,1] <- dat_C
+  dat_W[,2] <- -dat_D[,(numBPs - 1)]
+
+  # Scale k holds 2^k coefficients: output columns (2^k + 1):(2^(k+1)) come
+  # from D columns (T - 2^(k+1) + 1):(T - 2^k). seq_len() makes this a no-op
+  # when T = 2, where 1:(J-1) would wrongly iterate over c(1, 0).
+  for(k in seq_len(J - 1L)){
+    out_cols <- (2^k + 1):(2^(k + 1))
+    in_cols <- (numBPs - 2^(k + 1) + 1):(numBPs - 2^k)
+    dat_W[, out_cols] <- -dat_D[, in_cols]
+  }
+
+  return(list(WCs = dat_W))
+
+}
+
+
+
+
+##' Quantile-transform data to a standard normal distribution.
+##'
+##'
+##' This function quantile-transforms data to a standard normal distribution. It randomly
+##' assigns ranks for ties, so results for tied inputs depend on the state of the random
+##' number generator.
+##' @param x a vector containing data to be quantile-transformed.
+##' @return a vector of the same length as x containing the quantile-transformed data.
+QT_randomTie <- function(x) {
+
+  # Break ties at random so that every observation gets a distinct rank.
+  x.rank <- rank(x, ties.method = "random")
+
+  # qqnorm() is used only for its theoretical normal quantiles; FALSE is
+  # spelled out because the shorthand F is an ordinary, reassignable variable.
+  return(qqnorm(x.rank, plot.it = FALSE)$x)
+}
+
+
+##' Correct for covariates.
+##'
+##'
+##' This function regresses the data on the covariates with a linear model (including an
+##' intercept) and returns the residuals.
+##' @param x a vector of length N (# of samples) containing data.
+##' @param Covariates a matrix (or a vector if M = 1) with N by M (# of covariates)
+##' containing covariates to correct for.
+##' @return a vector of length N; covariates corrected data (residuals of the fit).
+corrected_forCovariates <- function(x, Covariates){
+  fit <- lm(x ~ Covariates)
+  fit$residuals
+}
+
+
+
+##' Normalize WCs.
+##'
+##'
+##' This function quantile-transforms WCs to a standard normal distribution.
+##' If covariates are provided, it corrects the quantile-transformed WCs for the covariates
+##' and quantile-transforms the covariates-corrected WCs to a standard normal distribution.
+##' @param WCs a matrix with N (# of samples) by T (# of bps in a region or # of WCs);
+##' n-th row contains WCs for n-th sample.
+##' @param Covariates default = NULL; a matrix (or a vector if M = 1) with N by M (# of covariates)
+##' containing covariates to correct for.
+##' @return a list with a single element `QT_WCs': a matrix with N (# of samples) by T (# of bps
+##' in a region or # of WCs); It contains normalized WCs (Quantile-transformed and
+##' covariate-corrected WCs).
+Normalize.WCs <- function(WCs, Covariates=NULL){
+
+  # apply() collapses its result to a plain vector when N = 1; restore the
+  # N by T shape in that case so later steps and callers always see a matrix.
+  keep_shape <- function(m){
+    if(is.null(dim(m))){
+      dim(m) <- dim(WCs)
+    }
+    m
+  }
+
+  # QT each WC (column) to a standard normal distribution
+  QT.dat <- keep_shape(apply(WCs, 2, QT_randomTie))
+
+  # correct for covariates and QT to a standard normal distribution again.
+  if(!is.null(Covariates)){
+    corrected_QT.dat <- keep_shape(apply(QT.dat, 2, corrected_forCovariates, Covariates))
+    QT.dat <- keep_shape(apply(corrected_QT.dat, 2, QT_randomTie))
+  }
+
+  return(list(QT_WCs = QT.dat))
+
+}
+
+
+
+
+
+
+##' Preprocess functional data for the WaveQTL software.
+##'
+##'
+##' This function preprocesses functional data for the wavelet-based approach implemented in
+##' the WaveQTL software. It (1) flags WCs computed from low counts using
+##' \code{\link{fiter.WCs}}, (2) optionally divides each sample's data by its library read
+##' depth, (3) performs a wavelet transform using \code{\link{FWT}}, and (4) when there is
+##' more than one sample, normalizes the WCs using \code{\link{Normalize.WCs}}
+##' (quantile transform, with optional correction for covariates).
+##' @param Data matrix (or a vector when N = 1) with N (# of samples) by T (# of bps in a region);
+##' read count at t-th bp (t = 1, ..., T) for i-th sample (i = 1, ..., N); T must be a power of 2.
+##' @param library.read.depth default = NULL; a vector of length N; library read depth for each
+##' sample. If provided, the n-th row of Data is divided by its n-th element before the
+##' wavelet transform.
+##' @param Covariates default = NULL; a matrix (or a vector if M = 1) with N by M (# of covariates)
+##' containing covariates to correct for. Ignored when N = 1.
+##' @param meanR.thresh default = 2; if average reads across individuals <= meanR.thresh,
+##' those WCs are filtered out. If NULL, no filtering information is computed.
+##' @return a list with two elements: `WCs', a matrix with N by T (# of WCs) containing
+##' (normalized, when N > 1) WCs; and `filtered.WCs', a vector of length T with 0/1 indicating
+##' whether each WC is filtered (0) or not (1), or NULL when meanR.thresh is NULL.
+WaveQTL_preprocess <- function(Data, library.read.depth = NULL, Covariates = NULL, meanR.thresh = 2){
+
+  if(is.vector(Data)){dim(Data) <- c(1, length(Data))} #change Data to matrix
+
+  N <- dim(Data)[1]
+  numBPs <- dim(Data)[2]
+  J <- log2(numBPs)
+  if(!isTRUE(all.equal(J, round(J)))){stop("Error: number of columns of Data must be power of 2")}
+
+  if(N == 1){
+    # with only one observation there is nothing to regress on
+    Covariates <- NULL
+  }else if(!is.null(Covariates)){
+    # a single covariate given as a vector becomes an N by 1 matrix
+    if(is.vector(Covariates)){dim(Covariates) <- c(length(Covariates), 1)}
+    if(nrow(Covariates) != N){
+      stop("Error: Covariates must have one row per row of Data")
+    }
+  }
+
+  ### generate filtered.WCs (computed on the raw counts, before depth correction)
+  if(!is.null(meanR.thresh)){
+    filtered.WCs <- fiter.WCs(Data, meanR.thresh)
+  }else{
+    filtered.WCs <- NULL
+  }
+
+  ### corrected for read depth
+  if(!is.null(library.read.depth)){
+    if(!(length(library.read.depth) %in% c(1, N))){
+      stop("Error: library.read.depth must have one element per row of Data")
+    }
+    # a length-N vector recycles down the columns, so row n is divided by
+    # library.read.depth[n]
+    DataC <- Data/library.read.depth
+  }else{
+    DataC <- Data
+  }
+
+  ### Wavelet Transform
+  WCs <- FWT(DataC)$WCs
+
+  ### Quantile Transform to a standard normal distribution
+  ### (skipped for a single sample, where the raw WCs are returned)
+  if(N > 1){
+    WCs <- Normalize.WCs(WCs, Covariates)$QT_WCs
+  }
+
+  return(list(WCs = WCs, filtered.WCs = filtered.WCs))
+
+}
+
+
+
+
+
+
@@ -0,0 +1,58 @@
+## `combine_two_strands_with_correction_for_mappability.R' shows how to combine DNase-seq data from two strands while taking mappability into account as we did in Shim and Stephens (2014).
+## Copyright (C) 2014 Heejung Shim
+##
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#######################################################
+## Read DNase-seq data from two strands.
+## You will probably need to change the path to point to your copy of the data.
+#######################################################
+path = "/mnt/lustre/home/shim/WaveQTL/data/dsQTL/DNase.chr17.10160989.10162012.dat"
+DNase.dat = read.table(path)
+dim(DNase.dat) # expected: 70 by 2048; each row corresponds to one individual; the first (second) 1024 columns contain DNase-seq read counts from the + (-) strand at each position;
+
+#######################################################
+## Read mappability.
+## You will probably need to change the path to point to your copy of the data.
+#######################################################
+path = "/mnt/lustre/home/shim/WaveQTL/data/dsQTL/DNase.mappability.chr17.10160989.10162012.dat"
+map.dat = read.table(path)
+dim(map.dat) # expected: 1 by 2048; the first (second) 1024 columns indicate mappability on the + (-) strand at each position; `1' indicates a uniquely mappable base.
+
+#############################################################
+## Combine the two strands while taking mappability into account.
+#############################################################
+numBPs = dim(DNase.dat)[2]/2 # columns hold both strands, so half of them is the number of positions
+numINDs = dim(DNase.dat)[1] # one row per individual
+
+# take mappability into account: keep read counts only at positions flagged as mappable (== 1)
+map = rep(0, numBPs*2)
+wh = (map.dat[1,] == 1)
+map[wh] = 1
+dat = matrix(data = 0, nr = numINDs, nc = numBPs*2)
+dat[,wh] = as.matrix(DNase.dat[,wh]) # columns not flagged as mappable stay 0
+
+# combine two strands: add the + and - strand halves, then divide by the number of mappable strands at each position
+all.dat = dat[,1:numBPs] + dat[,(numBPs+1):(numBPs+numBPs)]
+all.map = map[1:numBPs] + map[(numBPs+1):(numBPs+numBPs)]
+pheno.dat = matrix(data = 0, nr = numINDs, nc = numBPs)
+wh2 = which(all.map > 0)
+pheno.dat[,wh2] = t(t(all.dat[,wh2])/all.map[wh2]) # positions mappable on neither strand stay 0
+
+# sanity check: "pheno.dat" should be the same as "/mnt/lustre/home/shim/WaveQTL/data/dsQTL/chr17.10160989.10162012.pheno.dat"
+A = as.matrix(read.table("/mnt/lustre/home/shim/WaveQTL/data/dsQTL/chr17.10160989.10162012.pheno.dat"))
+sum(pheno.dat != A)
+# expected output: 0 (no entries differ)
+
+