-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathbig_test.R
133 lines (117 loc) · 5.18 KB
/
big_test.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# This is a test script for the EAGLE, the Environment-Ase Generalized Linear modEl
# Data sampled from the generative model and then we run inference to attempt to
# recover the true "hits"
require(eagle) # load the package
set.seed(1) # for reproducibility
#-------------- generate some synthetic data -------------------------
n.loci=300 # number of loci
n.samples=200 # number of individuals
# to run on your own data you need to generate the following matrices:
alt=matrix(NA,n.samples,n.loci) # matrix of alternative counts
totalreads=matrix(NA,n.samples,n.loci) # matrix of total counts
het=matrix(NA,n.samples,n.loci) # matrix of which individuals are hets at each locus
environmentVar=runif(n.samples) # values of the environment variable, e.g. age
eqtlGenotypes=matrix(NA,n.samples,n.loci) # genotypes (0,1,2) of the lead eQTL for each gene
# make "true" regression coefficients, most of which are 0
#trueBeta=ifelse(runif(n.loci) < 0.05, rgamma(n.loci,shape=5,rate=5), 0.0)
trueBeta=ifelse(runif(n.loci) < 0.05, runif(n.loci), 0.0)
logistic=function(x) 1/(1+exp(-x))
for (i in 1:n.loci){
maf=runif(1)*.4+.1 # MAF between .1 and .5 for this locus
# sample which individuals are heterozygous at this locus assuming HWE
hap1=runif(n.samples)<maf
hap2=runif(n.samples)<maf
het[,i]=xor(hap1,hap2)
totalreads[,i]=rpois(n.samples,100*rgamma(n.samples,shape=2,rate=2)) # sample read depth from overdispersed Poisson (NB?)
logOdds=.3+trueBeta[i]*environmentVar+runif(1,min=.1,max=.5)*rnorm(n.samples)
if (runif(1)<.5){ # add eQTL
eSNPmaf=runif(1)*.4+.1
ehap1=runif(n.samples)<eSNPmaf
ehap2=runif(n.samples)<eSNPmaf
eqtlGenotypes[,i]=ehap1+ehap2
logOdds=logOdds+.3*(eqtlGenotypes[,i]==1)
}
p=logistic(logOdds) # sample underlying probabilities, with overdispersion
alt[,i]=rbinom(n.samples,totalreads[,i],p) # sample alternative counts
}
alt[het!=1]=NA
ref=totalreads-alt
#------------------ filters -------------------
alt.list <- list()
n.list <- list()
x.null <- list()
x.full <- list()
original.index <- list()
# for defining monoallelic expression
count.cutoff=3 # monoallelic if fewer reads than this...
prop.cutoff=0.01 # ...or lower proportion than this mapping to one allele
prop.mono.cutoff=.5 # maximum proportion of hets who are allowed to how monoallelic expression
minSampleSize=20
minGroupSize=10 # minimum testable individuals in an environmental group (e.g. smokers)
minModel=T # whether to use the symmetrized version of the likeihood
non.problematic.counter=1
# iterate over exonic SNPs
for (snp.index in 1:n.loci) {
if (snp.index %% 1000 == 0) print(snp.index)
valid=het[,snp.index]==1 & totalreads[,snp.index]>5
# check we have at least 20 valid samples
if (sum(valid)<minSampleSize)
next
a=alt[valid,snp.index]
r=ref[valid,snp.index]
heteq= eqtlGenotypes[valid,snp.index]==1
x=environmentVar[valid]
# check not too many are in one group (e.g. female, non-smokers)
if ((length(x)-max(table(x)))<minGroupSize)
next
n=a+r
# check there isn't too much mono-allelic expression
min.a.r=apply(cbind(a,r),1,min)
is.mono=(min.a.r<count.cutoff)|((as.double(min.a.r)/n)<prop.cutoff)
if (mean(is.mono)>prop.mono.cutoff)
next
alt.list[[non.problematic.counter]]=if (minModel) pmin(a,n-a) else a
n.list[[non.problematic.counter]]=n
original.index[[non.problematic.counter]]=snp.index
num.samples=length(x)
x.full[[non.problematic.counter]]=cbind(env=x,intercept=numeric(num.samples)+1.0)
x.null[[non.problematic.counter]]=cbind(intercept=numeric(num.samples)+1.0)
# add eQTL heterozygosity
if (!any(is.na(heteq))) if ((sd(heteq)>0) & (min(table(heteq))>5)){
x.full[[non.problematic.counter]]=cbind(x.full[[non.problematic.counter]],eqtl=heteq)
x.null[[non.problematic.counter]]=cbind(x.null[[non.problematic.counter]],eqtl=heteq)
# lots of conditions to decide whether to add an interaction terms
if (min(table(heteq))>=10) { # only if at least 10 hets and 10 non-hets
e1=x[heteq==1.0]
e0=x[heteq==0.0]
if ((length(e1)-max(table(e1)))>=10 & (length(e0)-max(table(e0)))>=10){
if (length(unique(x))>2 | min(table(x,heteq))>=5){
temp=cbind(x.full[[non.problematic.counter]],interaction=x*heteq)
if (det(t(temp) %*% temp) > 1e-8)
x.full[[non.problematic.counter]]=temp
}
}
}
}
non.problematic.counter=non.problematic.counter+1
}
original.index=unlist(original.index)
trueBetaAtTested=trueBeta[original.index]
#---------------- run the model --------------------------
s=eagle.settings()
s$debug=F
#s$rev.model=as.integer(3) # local regression
s$rev.model=2
s$normalised.depth=scale(log10(unlist(lapply(n,mean)))) # only required for rev.model=3
s$max.iterations=10000
s$convergence.tolerance=.001
s$coeff.regulariser=0.1
s$learn.rev=T
# learnt parameters from the DGN dataset
s$rep.global.shape=1.0
s$rep.global.rate=0.0033
s$traceEvery=1
system.time( res <- eagle.helper(alt.list,n.list,x.full,x.null,s) )
cat("p-values for true hits:",res$p.values[trueBetaAtTested!=0],"\n")
cat("true hit betas: ",trueBetaAtTested[trueBetaAtTested!=0],"\n")
hist( res$p.values[ trueBetaAtTested==0.0 ], xlab="p-value", main="p-values for null sites")