-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfit_cov_ebnmf.R
148 lines (130 loc) · 6.98 KB
/
fit_cov_ebnmf.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
### fit Empirical Bayes Matrix Factorization to single cell RNA-seq data based on covariance decomposition, with a nonnegative prior on GEP memberships
### Y: a cell by gene matrix of normalized and log-transformed gene expression data
### Kmax: a positive integer (at least 2) specifying an upper bound of the number of GEPs for initialization, which is approximately but often not exactly the final number of GEPs
### prior: a nonnegative prior for GEP memberships, such as generalized binary prior, which must be a function defined in the ebnm package
### thres: a positive number in (0,1) to specify threshold on Pearson correlation between L and L-tilde and to use concordant ones only
### extrapolate: a logical indicator specifying whether to use extrapolation to accelerate backfitting GEP memberships; see the flashier package for details
### maxiter: a positive integer specifying the maximum number of iterations to backfit GEP memberships
### verbose: an integer specifying whether and how to display progress updates, as described in the flashier package
flash_fit_cov_ebnmf <-
function (Y, Kmax, prior = ebnm::ebnm_generalized_binary, thres = 0.8,
extrapolate = TRUE, maxiter = 500, verbose = 1) {
### calculate the covariance matrix from the cell by gene matrix of gene expression data
dat <- Y %*% t(Y)/ncol(Y)
### fit EBMF with point laplace prior to covariance matrix without considering the diagonal component
fit.cov <- flash_init(dat, var_type = 0) %>%
flash_greedy(Kmax = 1, ebnm_fn = ebnm_unimodal_nonnegative, verbose = 1) %>%
flash_greedy(Kmax = Kmax - 1, ebnm_fn = ebnm_point_laplace, verbose = 1) %>%
flash_backfit(maxiter = maxiter, verbose = 1)
### fit EBMF with point laplace prior to covariance matrix with the diagonal component
fit.cov <- fit_ebmf_to_YY(dat = dat, fl = fit.cov, prior = ebnm::ebnm_point_laplace, maxiter = maxiter, verbose = 1)$fl
### initialize EB-NMF fit from the EBMF fit with point laplace prior
cov.init <- init_cov_ebnmf(fit.cov)
kmax <- which.max(colSums(cov.init[[1]]))
fit.cov <- flash_init(dat, var_type = 0) %>%
flash_factors_init(
init = lapply(cov.init, function(x) x[, kmax, drop = FALSE]),
ebnm_fn = ebnm_unimodal_nonnegative
) %>%
flash_factors_init(
init = lapply(cov.init, function(x) x[, -c(kmax), drop = FALSE]),
ebnm_fn = prior
) %>%
flash_backfit(extrapolate = extrapolate, maxiter = 25, verbose = verbose)
### fit EB-NMF with a nonnegative prior to covariance matrix with the diagonal component
fit.cov <- fit_ebmf_to_YY(dat = dat, fl = fit.cov, prior = prior, extrapolate = extrapolate, maxiter = 25, verbose = verbose)$fl
### keep at most Kmax factors based on proportion of variance explained and refit EB-NMF to covariance matrix
kset <- (length(fit.cov$pve) - rank(fit.cov$pve) < 1.5*Kmax) & (fit.cov$pve > 0)
# kset <- (fit.cov$pve > 0)
kall <- 1:fit.cov$n_factors
if(!all(kset))
fit.cov <- flash_factors_remove(fit.cov, kset=kall[!kset])
fit.cov <- fit_ebmf_to_YY(dat = dat, fl = fit.cov, prior = prior, extrapolate = extrapolate, maxiter = maxiter, verbose = verbose)$fl
### scale GEP membership estimates to 0-1 scale, and calculate Pearson correlations between L and L-tilde
k.order <- order(fit.cov$pve, decreasing = TRUE)
fit.L <- fit.cov$L_pm[, k.order]
fit.Ltilde <- fit.cov$F_pm[, k.order]
corr <- diag(cor(fit.L, fit.Ltilde))
fit.L <- t(t(fit.L)/apply(fit.L, 2, max))
### estimate GEP signatures by fitting EB-SNMF to gene expression data matrix with fixed L estimated from covariance decomposition above
init.F <- t(solve(crossprod(fit.L), crossprod(fit.L, Y)))
fit.snmf <- flash_init(Y, S = 1/sqrt(nrow(Y)), var_type = 2) %>%
flash_factors_init(
init = list(as.matrix(fit.L), as.matrix(init.F)),
ebnm_fn = c(prior, ebnm_point_laplace)
) %>%
flash_factors_fix(kset = 1:ncol(fit.L), which_dim = "loadings") %>%
flash_backfit(extrapolate = FALSE, verbose = verbose)
### calculate the z-score and lfsr of GEP signatures by running linear regression followed by ash
J <- ncol(Y)
genes <- colnames(Y)
F.est <- matrix(0, J, ncol(fit.snmf$L_pm))
rownames(F.est) <- genes
colnames(F.est) <- colnames(fit.snmf$L_pm)
F.se <- F.est
F.z.ash <- F.est
F.lfsr.ash <- F.est
for (j in 1:J) {
y <- Y[,j]
dat <- as.data.frame(cbind(y, fit.snmf$L_pm))
fit <- lm(y ~ 0 + ., dat)
coefs <- summary(fit)$coefficients
F.est[j,] <- coefs[, "Estimate"]
F.se[j,] <- coefs[, "Std. Error"]
}
for(k in 1:ncol(F.est)){
fit <- ash(F.est[,k], F.se[,k], mixcompdist = "normal", method = "shrink")
F.z.ash[,k] <- fit$result$PosteriorMean/fit$result$PosteriorSD
F.lfsr.ash[,k] <- fit$result$lfsr
}
### return the estimated memberships and signatures only for GEPs whose L and Ltilde from covariance decomposition are highly concordant
k.idx <- which(corr > thres)
L.pm <- fit.snmf$L_pm[, k.idx]
colnames(L.pm) <- c("Baseline", paste0("GEP", 1:(ncol(L.pm)-1)))
F.lfc <- fit.snmf$F_pm[, k.idx]/log(2)
F.z <- F.z.ash[, k.idx]
F.lfsr <- F.lfsr.ash[, k.idx]
colnames(F.lfc) <- colnames(L.pm)
colnames(F.z) <- colnames(L.pm)
colnames(F.lfsr) <- colnames(L.pm)
return(list(L = L.pm, F = list(lfc = F.lfc, z_score = F.z, lfsr = F.lfsr)))
}
### initialize the EB-NMF fit to covariance matrix YY' s.t. E[YY'] = LL'+ D from an estimate of L without nonnegative constraints
init_cov_ebnmf <- function(fl, kset = 1:ncol(fl$flash_fit$EF[[1]])) {
LL <- fl$flash_fit$EF[[1]][, kset, drop = FALSE]
FF <- fl$flash_fit$EF[[2]][, kset, drop = FALSE]
LL <- cbind(pmax(LL, 0), pmax(-LL, 0))
LL <- cbind(fl$flash_fit$EF[[1]][, -kset, drop = FALSE], LL)
FF <- cbind(pmax(FF, 0), pmax(-FF, 0))
FF <- cbind(fl$flash_fit$EF[[2]][, -kset, drop = FALSE], FF)
to.keep <- (colSums(LL) > .Machine$double.eps) & (colSums(FF) > .Machine$double.eps)
LL <- LL[, to.keep, drop = FALSE]
FF <- FF[, to.keep, drop = FALSE]
return(list(LL, FF))
}
### fit EBMF to covariance matrix YY' s.t. E[YY'] = LL'+ D, where D = s2*I and I is an identity matrix
fit_ebmf_to_YY <- function(dat, fl, prior, extrapolate = TRUE, maxiter = 500, epsilon = 0, verbose = 1){
s2 <- max(0, mean(diag(dat) - diag(fitted(fl))))
s2_diff <- Inf
### alternate between estimating s2 and backfitting until convergence
while(s2 > 0 && abs(s2_diff - 1) > 1e-3) {
dat_minuss2 <- dat - diag(rep(s2, ncol(dat)))
kset <- which(fl$pve > epsilon)
kmax <- which.max(fl$pve)
kset <- kset[-c(kmax)]
fl <- flash_init(dat_minuss2) %>%
flash_factors_init(
init = lapply(fl$flash_fit$EF, function(x) x[, kmax, drop = FALSE]),
ebnm_fn = ebnm_unimodal_nonnegative
) %>%
flash_factors_init(
init = lapply(fl$flash_fit$EF, function(x) x[, kset, drop = FALSE]),
ebnm_fn = prior
) %>%
flash_backfit(extrapolate = extrapolate, maxiter = maxiter, verbose = verbose)
old_s2 <- s2
s2 <- max(0, mean(diag(dat) - diag(fitted(fl))))
s2_diff <- s2 / old_s2
}
return(list(dat=dat, fl=fl, s2=s2))
}