|
| 1 | +## `WaveQTL_preprocess_funcs.R' contains functions to preprocess functional data for WaveQTL |
| 2 | +## software. |
| 3 | +## Copyright (C) 2014 Heejung Shim |
| 4 | +## |
| 5 | +## This program is free software: you can redistribute it and/or modify |
| 6 | +## it under the terms of the GNU General Public License as published by |
| 7 | +## the Free Software Foundation, either version 3 of the License, or |
| 8 | +## (at your option) any later version. |
| 9 | +## |
| 10 | +## This program is distributed in the hope that it will be useful, |
| 11 | +## but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 13 | +## GNU General Public License for more details. |
| 14 | +## |
| 15 | +## You should have received a copy of the GNU General Public License |
| 16 | +## along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 17 | + |
| 18 | + |
| 19 | +require("wavethresh") |
| 20 | + |
| 21 | + |
| 22 | +##' Filter low count WCs. |
| 23 | +##' |
| 24 | +##' |
| 25 | +##' This function filters out some WCs that are computed based on very |
| 26 | +##' low counts. This function considers WCs as low count if the total |
| 27 | +##' counts used in their computation were less than `meanR.thresh' |
| 28 | +##' per individual on average. |
| 29 | +##' @param Data matrix (or a vector when N = 1) with N (# of samples) by T (# of bps in a region); This matrix contains original data before a wavelet transform; Here, T should be a power of 2. |
| 30 | +##' @param meanR.thresh If average reads across individuals <= meanR.thresh, |
| 31 | +##' those WCs are filtered out. |
| 32 | +##' @return filtered.WCs a vector of length T; t-th element indicates whether t-th WC in output from the fuction \code{\link{FWT}} is filtered (0) or not (1). |
| 33 | +fiter.WCs <- function(Data, meanR.thresh){ |
| 34 | + |
| 35 | + if(is.vector(Data)){ |
| 36 | + dim(Data) <- c(1, length(Data)) |
| 37 | + } |
| 38 | + numWCs = dim(Data)[2] |
| 39 | + J = log2(numWCs) |
| 40 | + |
| 41 | + Mean_R = rep(NA, numWCs) |
| 42 | + Mean_R[1] = mean(apply(Data, 1, sum)) |
| 43 | + Mean_R[2] = Mean_R[1] |
| 44 | + |
| 45 | + posi = 3 |
| 46 | + for(ss in 1:(J-1)){ |
| 47 | + num_loc = 2^ss |
| 48 | + size_int = numWCs/num_loc |
| 49 | + st = (0:(num_loc-1))*size_int + 1 |
| 50 | + en = st + size_int -1 |
| 51 | + |
| 52 | + for(ll in 1:num_loc){ |
| 53 | + Mean_R[posi] = mean(apply(Data[,st[ll]:en[ll]], 1, sum)) |
| 54 | + posi = posi + 1 |
| 55 | + } |
| 56 | + } |
| 57 | + |
| 58 | + filtered.WCs = rep(0, numWCs) |
| 59 | + wh = which(Mean_R > meanR.thresh) |
| 60 | + |
| 61 | + if(length(wh) > 0){ |
| 62 | + filtered.WCs[wh] = rep(1, length(wh)) |
| 63 | + } |
| 64 | + |
| 65 | + return(filtered.WCs) |
| 66 | +} |
| 67 | + |
| 68 | + |
| 69 | + |
| 70 | + |
| 71 | +##' Perform a wavelet transform. |
| 72 | +##' |
| 73 | +##' |
| 74 | +##' This function performs a wavelet transform using a \code{\link{wavethresh}} R package |
| 75 | +##' and returns WCs in the order that corresponds to output from the function |
| 76 | +##' \code{\link{fiter.WCs}}. For now, the function doesn't allow users to specify the level of wavelet |
| 77 | +##' decomposition and uses the maximum level decomposition. |
| 78 | +##' @param Data matrix (or a vector when N = 1) with N (# of samples) by T (# of bps in a region); |
| 79 | +##' This matrix contains original data to be decomposed; Here, T should be a power of 2. |
| 80 | +##' @param filter.number default=1; argument to the function \code{\link{wd}} in the R package |
| 81 | +##' \code{\link{wavethresh}}; See their manual for details. |
| 82 | +##' @param family default="DaubExPhase"; argument to the function \code{\link{wd}} in the R package |
| 83 | +##' \code{\link{wavethresh}}; See their manual for details. |
| 84 | +##' @return WCs a matrix with N (# of samples) by T (# of bps in a region); n-th row contains WCs |
| 85 | +##' for n-th sample; WCs are ordered from low resolution WC to high resolution WC; For example, |
| 86 | +##' with a Haar wavelet transform, the first column contains WC (precisely speaking, scaling |
| 87 | +##' coefficient) that corresponds to sum of data in the region. The second column contains WC |
| 88 | +##' that contrasts the data in the first half vs second half of the region. The last column |
| 89 | +##' contains WC that contrasts the data in the (T-1)-th bp vs T-th bp. |
| 90 | +FWT <- function(Data, filter.number=1, family="DaubExPhase"){ |
| 91 | + |
| 92 | + if(is.vector(Data)){ |
| 93 | + dim(Data) <- c(1, length(Data)) |
| 94 | + } |
| 95 | + T = dim(Data)[2] |
| 96 | + J = log2(T) |
| 97 | + N = dim(Data)[1] |
| 98 | + |
| 99 | + dat_D = matrix(data=NA, nr = N, nc = (T - 1)) |
| 100 | + dat_C = rep(NA, N) |
| 101 | + |
| 102 | + dat_W = matrix(data=NA, nr = N, nc = T) |
| 103 | + |
| 104 | + for(j in 1:N){ |
| 105 | + each_WT = wd(Data[j,], filter.number=filter.number ,family=family) |
| 106 | + dat_D[j,] = each_WT$D |
| 107 | + dat_C[j] = each_WT$C[length(each_WT$C)] |
| 108 | + } |
| 109 | + |
| 110 | + dat_W[,1] = dat_C |
| 111 | + dat_W[,2] = -dat_D[,(T -1)] |
| 112 | + |
| 113 | + st_input = 3 |
| 114 | + en_posi = T - 2 |
| 115 | + for(k in 1:(J-1)){ |
| 116 | + st_posi = en_posi - 2^k + 1 |
| 117 | + en_input = st_input + 2^k - 1 |
| 118 | + dat_W[,st_input:en_input] = -dat_D[,st_posi:en_posi] |
| 119 | + en_posi = st_posi - 1 |
| 120 | + st_input = en_input + 1 |
| 121 | + } |
| 122 | + |
| 123 | + return(list(WCs = dat_W)) |
| 124 | + |
| 125 | +} |
| 126 | + |
| 127 | + |
| 128 | + |
| 129 | + |
| 130 | +##' Quantile-transform data to a standard normal distribution. |
| 131 | +##' |
| 132 | +##' |
| 133 | +##' This function quantile-transforms data to a standard normal distribution. It randomly assign |
| 134 | +##' ranks for ties. |
| 135 | +##' @param x a vector containing data to be quantile-transformed. |
| 136 | +##' @return quantile-transformed data. |
| 137 | +QT_randomTie <- function(x) { |
| 138 | + |
| 139 | + x.rank = rank(x, ties.method="random") |
| 140 | + return(qqnorm(x.rank,plot.it = F)$x) |
| 141 | +} |
| 142 | + |
| 143 | + |
| 144 | +##' Correct for covariates. |
| 145 | +##' |
| 146 | +##' |
| 147 | +##' This function corrects for covariates. |
| 148 | +##' @param x a vector of length N (# of samples) containing data. |
| 149 | +##' @param Covariates a matrix (or a vector if M = 1) with N by M (# of covariates) |
| 150 | +##' containing covariates to correct for. |
| 151 | +##' @return a vector of length N; covariates corrected data. |
| 152 | +corrected_forCovariates <- function(x, Covariates){ |
| 153 | + return(lm(x~Covariates)$residuals) |
| 154 | +} |
| 155 | + |
| 156 | + |
| 157 | + |
| 158 | +##' Normalize WCs. |
| 159 | +##' |
| 160 | +##' |
| 161 | +##' This function quantile-transforms WCs to a standard normal ditribution. |
| 162 | +##' If covarites are provided, it corrects the quantile-transformed WCs for the covariates |
| 163 | +##' and quantile-transforms the covariates-corrected WCs to a standard normal distribution. |
| 164 | +##' @param WCs a matrix with N (# of samples) by T (# of bps in a region or # of WCs); |
| 165 | +##' n-th row contains WCs for n-th sample. |
| 166 | +##' @param Covariates default = NULL; a matrix (or a vector if M = 1) with N by M (# of covariates) |
| 167 | +##' containing covariates to correct for. |
| 168 | +##' @return QT_WCs a matrix with N (# of samples) by T (# of bps in a region or # of WCs); |
| 169 | +##' It contains normalized WCs (Quantile-transformed and covariate-corrected WCs). |
| 170 | +Normalize.WCs <- function(WCs, Covariates=NULL){ |
| 171 | + |
| 172 | + # QT to a standard normal distribution |
| 173 | + QT_dat = apply(WCs, 2, QT_randomTie) |
| 174 | + |
| 175 | + # correct for covariates and QT to a standard normal distribution. |
| 176 | + if(!is.null(Covariates)){ |
| 177 | + corrected_QT.dat = apply(QT_dat, 2, corrected_forCovariates, C) |
| 178 | + QT.dat = apply(corrected_QT.dat, 2, QT_randomTie) |
| 179 | + } |
| 180 | + |
| 181 | + return(list(QT_WCs = QT.dat)) |
| 182 | + |
| 183 | +} |
| 184 | + |
| 185 | + |
| 186 | + |
| 187 | + |
| 188 | + |
| 189 | + |
| 190 | +##' |
| 191 | +##' |
| 192 | +##' ##' Preprocess functional data for a WaveQTL software. |
| 193 | +##' |
| 194 | +##' |
| 195 | +##' This function preprocesses functiona data for a wavelet-based approach implmented in |
| 196 | +##' a WaveQTL software. |
| 197 | +##' |
| 198 | +##' |
| 199 | +##' performs a wavelet transform using a \code{\link{wavethresh}} R package |
| 200 | +##' and returns WCs in the order that corresponds to output from the function |
| 201 | +##' \code{\link{fiter.WCs}}. For now, the function doesn't allow users to specify the level of wavelet |
| 202 | +##' decomposition and uses the maximum level decomposition. |
| 203 | +##' @param Data matrix (or a vector when N = 1) with N (# of samples) by T (# of bps in a region); |
| 204 | +##' This matrix contains original data to be decomposed; Here, T should be a power of 2. |
| 205 | +##' @param filter.number default=1; argument to the function \code{\link{wd}} in the R package |
| 206 | +##' \code{\link{wavethresh}}; See their manual for details. |
| 207 | +##' @param family default="DaubExPhase"; argument to the function \code{\link{wd}} in the R package |
| 208 | +##' \code{\link{wavethresh}}; See their manual for details. |
| 209 | +##' @return WCs a matrix with N (# of samples) by T (# of bps in a region); n-th row contains WCs |
| 210 | +##' for n-th sample; WCs are ordered from low resolution WC to high resolution WC; For example, |
| 211 | +##' with a Haar wavelet transform, the first column contains WC (precisely speaking, scaling |
| 212 | +##' coefficient) that corresponds to sum of data in the region. The second column contains WC |
| 213 | +##' that contrasts the data in the first half vs second half of the region. The last column |
| 214 | +##' contains WC that contrasts the data in the (T-1)-th bp vs T-th bp. |
| 215 | + |
| 216 | + |
| 217 | +# input |
| 218 | +# 1. Data : N (# of samples) by T (# of bps in a region); read count at t-th bp (t = 1, ..., T) for i-th sample (i = 1, ..., N) |
| 219 | +# 2. Read.depth (=NULL) : a vector of length N; read.depth for each individual |
| 220 | +# 3. C (=NULL) : N by M (# of covariates); covariates to correct for |
| 221 | +# 4. meanR.thresh (=2) : average reads across individuals < meanR.thresh, we will filter those WCs. |
| 222 | +# output |
| 223 | +# 1. WCs : N by T (# of WCs) |
| 224 | +# 2. filtered.WCs : a vector of length T (either 0 or 1 indicating whether it's filtered (0) or not (1) |
| 225 | + |
| 226 | + |
| 227 | + |
| 228 | + |
| 229 | +WaveQTL_preprocess <- function(Data, library.read.depth = NULL, Covariates = NULL, meanR.thresh = 2){ |
| 230 | + |
| 231 | + |
| 232 | + if(is.vector(Data)){dim(Data)<- c(1,length(Data))} #change Data to matrix |
| 233 | + if(nrow(Data)==1){C = NULL} #if only one observation, don't correct for covariates |
| 234 | + |
| 235 | + if(!is.null(C)){ |
| 236 | + if(is.vector(C)){dim(C)<- c(1,length(C))} #change C to matrix |
| 237 | + } |
| 238 | + |
| 239 | + |
| 240 | + |
| 241 | + T = dim(Data)[2] |
| 242 | + J = log2(T) |
| 243 | + if(!isTRUE(all.equal(J,trunc(J)))){stop("Error: number of columns of Data must be power of 2")} |
| 244 | + N = dim(Data)[1] |
| 245 | + |
| 246 | + |
| 247 | + ### generate filtered.WCs |
| 248 | + if(!is.null(meanR.thresh)){ |
| 249 | + filtered.WCs = fiter.WCs(Data, meanR.thresh) |
| 250 | + }else{ |
| 251 | + filtered.WCs = NULL |
| 252 | + } |
| 253 | + |
| 254 | + ### corrected for read depth |
| 255 | + if(!is.null(library.read.depth)){ |
| 256 | + DataC = Data/library.read.depth |
| 257 | + }else{ |
| 258 | + DataC = Data |
| 259 | + } |
| 260 | + |
| 261 | + ### Wavelet Transform |
| 262 | + WCs = FWT(DataC)$WCs |
| 263 | + |
| 264 | + ### Quantile Transform to a standard normal distribution |
| 265 | + if(N > 1){ |
| 266 | + WCs = Quantile.Transform(WCs, Covariates) |
| 267 | + } |
| 268 | + |
| 269 | + return(list(WCs = WCs$QT_WCs, filtered.WCs = filtered.WCs)) |
| 270 | + |
| 271 | +} |
| 272 | + |
| 273 | + |
| 274 | + |
| 275 | + |
| 276 | + |
| 277 | + |
0 commit comments