Skip to content

Commit 0a7a13e

Browse files
committed
major updates
1 parent 96b9392 commit 0a7a13e

18 files changed

+633
-2
lines changed

Makefile

100644100755
File mode changed.

R/WaveQTL_preprocess_funcs.R

+277
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
## `WaveQTL_preprocess_funcs.R' contains functions to preprocess functional data for WaveQTL
2+
## software.
3+
## Copyright (C) 2014 Heejung Shim
4+
##
5+
## This program is free software: you can redistribute it and/or modify
6+
## it under the terms of the GNU General Public License as published by
7+
## the Free Software Foundation, either version 3 of the License, or
8+
## (at your option) any later version.
9+
##
10+
## This program is distributed in the hope that it will be useful,
11+
## but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
## GNU General Public License for more details.
14+
##
15+
## You should have received a copy of the GNU General Public License
16+
## along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
18+
19+
require("wavethresh")
20+
21+
22+
##' Filter low count WCs.
23+
##'
24+
##'
25+
##' This function filters out some WCs that are computed based on very
26+
##' low counts. This function considers WCs as low count if the total
27+
##' counts used in their computation were less than `meanR.thresh'
28+
##' per individual on average.
29+
##' @param Data matrix (or a vector when N = 1) with N (# of samples) by T (# of bps in a region); This matrix contains original data before a wavelet transform; Here, T should be a power of 2.
30+
##' @param meanR.thresh If average reads across individuals <= meanR.thresh,
31+
##' those WCs are filtered out.
32+
##' @return filtered.WCs a vector of length T; t-th element indicates whether t-th WC in output from the fuction \code{\link{FWT}} is filtered (0) or not (1).
33+
fiter.WCs <- function(Data, meanR.thresh){
34+
35+
if(is.vector(Data)){
36+
dim(Data) <- c(1, length(Data))
37+
}
38+
numWCs = dim(Data)[2]
39+
J = log2(numWCs)
40+
41+
Mean_R = rep(NA, numWCs)
42+
Mean_R[1] = mean(apply(Data, 1, sum))
43+
Mean_R[2] = Mean_R[1]
44+
45+
posi = 3
46+
for(ss in 1:(J-1)){
47+
num_loc = 2^ss
48+
size_int = numWCs/num_loc
49+
st = (0:(num_loc-1))*size_int + 1
50+
en = st + size_int -1
51+
52+
for(ll in 1:num_loc){
53+
Mean_R[posi] = mean(apply(Data[,st[ll]:en[ll]], 1, sum))
54+
posi = posi + 1
55+
}
56+
}
57+
58+
filtered.WCs = rep(0, numWCs)
59+
wh = which(Mean_R > meanR.thresh)
60+
61+
if(length(wh) > 0){
62+
filtered.WCs[wh] = rep(1, length(wh))
63+
}
64+
65+
return(filtered.WCs)
66+
}
67+
68+
69+
70+
71+
##' Perform a wavelet transform.
72+
##'
73+
##'
74+
##' This function performs a wavelet transform using a \code{\link{wavethresh}} R package
75+
##' and returns WCs in the order that corresponds to output from the function
76+
##' \code{\link{fiter.WCs}}. For now, the function doesn't allow users to specify the level of wavelet
77+
##' decomposition and uses the maximum level decomposition.
78+
##' @param Data matrix (or a vector when N = 1) with N (# of samples) by T (# of bps in a region);
79+
##' This matrix contains original data to be decomposed; Here, T should be a power of 2.
80+
##' @param filter.number default=1; argument to the function \code{\link{wd}} in the R package
81+
##' \code{\link{wavethresh}}; See their manual for details.
82+
##' @param family default="DaubExPhase"; argument to the function \code{\link{wd}} in the R package
83+
##' \code{\link{wavethresh}}; See their manual for details.
84+
##' @return WCs a matrix with N (# of samples) by T (# of bps in a region); n-th row contains WCs
85+
##' for n-th sample; WCs are ordered from low resolution WC to high resolution WC; For example,
86+
##' with a Haar wavelet transform, the first column contains WC (precisely speaking, scaling
87+
##' coefficient) that corresponds to sum of data in the region. The second column contains WC
88+
##' that contrasts the data in the first half vs second half of the region. The last column
89+
##' contains WC that contrasts the data in the (T-1)-th bp vs T-th bp.
90+
FWT <- function(Data, filter.number=1, family="DaubExPhase"){
91+
92+
if(is.vector(Data)){
93+
dim(Data) <- c(1, length(Data))
94+
}
95+
T = dim(Data)[2]
96+
J = log2(T)
97+
N = dim(Data)[1]
98+
99+
dat_D = matrix(data=NA, nr = N, nc = (T - 1))
100+
dat_C = rep(NA, N)
101+
102+
dat_W = matrix(data=NA, nr = N, nc = T)
103+
104+
for(j in 1:N){
105+
each_WT = wd(Data[j,], filter.number=filter.number ,family=family)
106+
dat_D[j,] = each_WT$D
107+
dat_C[j] = each_WT$C[length(each_WT$C)]
108+
}
109+
110+
dat_W[,1] = dat_C
111+
dat_W[,2] = -dat_D[,(T -1)]
112+
113+
st_input = 3
114+
en_posi = T - 2
115+
for(k in 1:(J-1)){
116+
st_posi = en_posi - 2^k + 1
117+
en_input = st_input + 2^k - 1
118+
dat_W[,st_input:en_input] = -dat_D[,st_posi:en_posi]
119+
en_posi = st_posi - 1
120+
st_input = en_input + 1
121+
}
122+
123+
return(list(WCs = dat_W))
124+
125+
}
126+
127+
128+
129+
130+
##' Quantile-transform data to a standard normal distribution.
131+
##'
132+
##'
133+
##' This function quantile-transforms data to a standard normal distribution. It randomly assign
134+
##' ranks for ties.
135+
##' @param x a vector containing data to be quantile-transformed.
136+
##' @return quantile-transformed data.
137+
QT_randomTie <- function(x) {
138+
139+
x.rank = rank(x, ties.method="random")
140+
return(qqnorm(x.rank,plot.it = F)$x)
141+
}
142+
143+
144+
##' Correct for covariates.
145+
##'
146+
##'
147+
##' This function corrects for covariates.
148+
##' @param x a vector of length N (# of samples) containing data.
149+
##' @param Covariates a matrix (or a vector if M = 1) with N by M (# of covariates)
150+
##' containing covariates to correct for.
151+
##' @return a vector of length N; covariates corrected data.
152+
corrected_forCovariates <- function(x, Covariates){
153+
return(lm(x~Covariates)$residuals)
154+
}
155+
156+
157+
158+
##' Normalize WCs.
159+
##'
160+
##'
161+
##' This function quantile-transforms WCs to a standard normal ditribution.
162+
##' If covarites are provided, it corrects the quantile-transformed WCs for the covariates
163+
##' and quantile-transforms the covariates-corrected WCs to a standard normal distribution.
164+
##' @param WCs a matrix with N (# of samples) by T (# of bps in a region or # of WCs);
165+
##' n-th row contains WCs for n-th sample.
166+
##' @param Covariates default = NULL; a matrix (or a vector if M = 1) with N by M (# of covariates)
167+
##' containing covariates to correct for.
168+
##' @return QT_WCs a matrix with N (# of samples) by T (# of bps in a region or # of WCs);
169+
##' It contains normalized WCs (Quantile-transformed and covariate-corrected WCs).
170+
Normalize.WCs <- function(WCs, Covariates=NULL){
171+
172+
# QT to a standard normal distribution
173+
QT_dat = apply(WCs, 2, QT_randomTie)
174+
175+
# correct for covariates and QT to a standard normal distribution.
176+
if(!is.null(Covariates)){
177+
corrected_QT.dat = apply(QT_dat, 2, corrected_forCovariates, C)
178+
QT.dat = apply(corrected_QT.dat, 2, QT_randomTie)
179+
}
180+
181+
return(list(QT_WCs = QT.dat))
182+
183+
}
184+
185+
186+
187+
188+
189+
190+
##'
191+
##'
192+
##' ##' Preprocess functional data for a WaveQTL software.
193+
##'
194+
##'
195+
##' This function preprocesses functiona data for a wavelet-based approach implmented in
196+
##' a WaveQTL software.
197+
##'
198+
##'
199+
##' performs a wavelet transform using a \code{\link{wavethresh}} R package
200+
##' and returns WCs in the order that corresponds to output from the function
201+
##' \code{\link{fiter.WCs}}. For now, the function doesn't allow users to specify the level of wavelet
202+
##' decomposition and uses the maximum level decomposition.
203+
##' @param Data matrix (or a vector when N = 1) with N (# of samples) by T (# of bps in a region);
204+
##' This matrix contains original data to be decomposed; Here, T should be a power of 2.
205+
##' @param filter.number default=1; argument to the function \code{\link{wd}} in the R package
206+
##' \code{\link{wavethresh}}; See their manual for details.
207+
##' @param family default="DaubExPhase"; argument to the function \code{\link{wd}} in the R package
208+
##' \code{\link{wavethresh}}; See their manual for details.
209+
##' @return WCs a matrix with N (# of samples) by T (# of bps in a region); n-th row contains WCs
210+
##' for n-th sample; WCs are ordered from low resolution WC to high resolution WC; For example,
211+
##' with a Haar wavelet transform, the first column contains WC (precisely speaking, scaling
212+
##' coefficient) that corresponds to sum of data in the region. The second column contains WC
213+
##' that contrasts the data in the first half vs second half of the region. The last column
214+
##' contains WC that contrasts the data in the (T-1)-th bp vs T-th bp.
215+
216+
217+
# input
218+
# 1. Data : N (# of samples) by T (# of bps in a region); read count at t-th bp (t = 1, ..., T) for i-th sample (i = 1, ..., N)
219+
# 2. Read.depth (=NULL) : a vector of length N; read.depth for each individual
220+
# 3. C (=NULL) : N by M (# of covariates); covariates to correct for
221+
# 4. meanR.thresh (=2) : average reads across individuals < meanR.thresh, we will filter those WCs.
222+
# output
223+
# 1. WCs : N by T (# of WCs)
224+
# 2. filtered.WCs : a vector of length T (either 0 or 1 indicating whether it's filtered (0) or not (1)
225+
226+
227+
228+
229+
WaveQTL_preprocess <- function(Data, library.read.depth = NULL, Covariates = NULL, meanR.thresh = 2){
230+
231+
232+
if(is.vector(Data)){dim(Data)<- c(1,length(Data))} #change Data to matrix
233+
if(nrow(Data)==1){C = NULL} #if only one observation, don't correct for covariates
234+
235+
if(!is.null(C)){
236+
if(is.vector(C)){dim(C)<- c(1,length(C))} #change C to matrix
237+
}
238+
239+
240+
241+
T = dim(Data)[2]
242+
J = log2(T)
243+
if(!isTRUE(all.equal(J,trunc(J)))){stop("Error: number of columns of Data must be power of 2")}
244+
N = dim(Data)[1]
245+
246+
247+
### generate filtered.WCs
248+
if(!is.null(meanR.thresh)){
249+
filtered.WCs = fiter.WCs(Data, meanR.thresh)
250+
}else{
251+
filtered.WCs = NULL
252+
}
253+
254+
### corrected for read depth
255+
if(!is.null(library.read.depth)){
256+
DataC = Data/library.read.depth
257+
}else{
258+
DataC = Data
259+
}
260+
261+
### Wavelet Transform
262+
WCs = FWT(DataC)$WCs
263+
264+
### Quantile Transform to a standard normal distribution
265+
if(N > 1){
266+
WCs = Quantile.Transform(WCs, Covariates)
267+
}
268+
269+
return(list(WCs = WCs$QT_WCs, filtered.WCs = filtered.WCs))
270+
271+
}
272+
273+
274+
275+
276+
277+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
## `combine_two_strands_with_correction_for_mappability.R' shows how to combine DNase-seq data from two strands while taking mappability into account as we did in Shim and Stephens (2014).
2+
## Copyright (C) 2014 Heejung Shim
3+
##
4+
## This program is free software: you can redistribute it and/or modify
5+
## it under the terms of the GNU General Public License as published by
6+
## the Free Software Foundation, either version 3 of the License, or
7+
## (at your option) any later version.
8+
##
9+
## This program is distributed in the hope that it will be useful,
10+
## but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
## GNU General Public License for more details.
13+
##
14+
## You should have received a copy of the GNU General Public License
15+
## along with this program. If not, see <http://www.gnu.org/licenses/>.
16+
17+
#######################################################
18+
## Read DNase-seq data from two strands.
19+
## You probably need to correct a path to the data.
20+
#######################################################
21+
path = "/mnt/lustre/home/shim/WaveQTL/data/dsQTL/DNase.chr17.10160989.10162012.dat"
22+
DNase.dat = read.table(path)
23+
dim(DNase.dat) # 70 by 2048; each column corresponds to each individual; the first (second) 1024 rows contain DNass-seq read count from +(-) strand in each positions;
24+
25+
#######################################################
26+
## Read mappability.
27+
## You probably need to correct a path to the data.
28+
#######################################################
29+
path = "/mnt/lustre/home/shim/WaveQTL/data/dsQTL/DNase.mappability.chr17.10160989.10162012.dat"
30+
map.dat = read.table(path)
31+
dim(map.dat) # 1 by 2048; the first (second) 1024 rows indicates mappability from +(-) strand in each positions; `1' indicates uniquly mappabile base.
32+
33+
#############################################################
34+
## combine two strands while taking mappability into account
35+
#############################################################
36+
numBPs = dim(DNase.dat)[2]/2
37+
numINDs = dim(DNase.dat)[1]
38+
39+
# take mappability into account
40+
map = rep(0, numBPs*2)
41+
wh = (map.dat[1,] == 1)
42+
map[wh] = 1
43+
dat = matrix(data = 0, nr = numINDs, nc = numBPs*2)
44+
dat[,wh] = as.matrix(DNase.dat[,wh])
45+
46+
# combine two strands
47+
all.dat = dat[,1:numBPs] + dat[,(numBPs+1):(numBPs+numBPs)]
48+
all.map = map[1:numBPs] + map[(numBPs+1):(numBPs+numBPs)]
49+
pheno.dat = matrix(data = 0, nr = numINDs, nc = numBPs)
50+
wh2 = which(all.map > 0)
51+
pheno.dat[,wh2] = t(t(all.dat[,wh2])/all.map[wh2])
52+
53+
# we can check if "pheno.dat" is the same as "/mnt/lustre/home/shim/WaveQTL/data/dsQTL/chr17.10160989.10162012.pheno.dat"
54+
A = as.matrix(read.table("/mnt/lustre/home/shim/WaveQTL/data/dsQTL/chr17.10160989.10162012.pheno.dat"))
55+
sum(pheno.dat != A)
56+
# 0
57+
58+

0 commit comments

Comments
 (0)