-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjin2017.R
149 lines (109 loc) · 6.52 KB
/
jin2017.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# Import and process data from Jin et al. 2017
library(data.table)
library(mixOmics)
# Import traits and join with metadata (doesn't seem like metadata will be that useful)
traits <- fread('project/data/jin2017/Traits - Jin et al. 2017.csv', skip = 2)
trait_names <- c('plantID', 'top_second_leaf_length', 'top_second_leaf_width', 'main_stem_height', 'main_stem_width', 'panicle_length', 'panicle_diameter', 'fringe_neck_length', 'panicle_weight', 'grain_weight', 'hundred_kernel_weight', 'spikelet_number', 'grain_number')
setnames(traits, trait_names)
metadata <- fread('project/data/jin2017/Metadata - Jin et al. 2017.csv', skip = 1)
setnames(metadata, c('plantID', 'cultivar', 'location', 'compartment', 'number'))
metadata[cultivar == '', cultivar := NA]
metadata[, cultivar := zoo::na.locf(cultivar)]
traits <- merge(traits, metadata, by = 'plantID', all.x = TRUE)
# Import OTU table
otu <- fread('project/data/jin2017/OTU table - Jin et al. 2017.csv', skip = 1)
# Subset the OTU table for only the individuals that we have trait data for
otu_use <- otu[, mget(names(otu)[names(otu) %in% c('#OTU ID', 'taxonomy', traits$plantID)])]
otu_mat <- as.matrix(otu_use[, mget(setdiff(names(otu_use), c('#OTU ID', 'taxonomy')))])
dimnames(otu_mat) <- list(otu_use[['#OTU ID']], setdiff(names(otu_use), c('#OTU ID', 'taxonomy')))
# Transpose to a plant by taxon matrix (site by species)
otu_mat <- t(otu_mat)
# Perform CLR transform on the OTU data and then do a dimension reduction with sPCA
otu_clr <- logratio.transfo(otu_mat, logratio = 'CLR', offset = 1)
otu_spca_10x50 <- spca(otu_clr, ncomp = 10, keepX = rep(50, 10)) # run the method
plotIndiv(otu_spca_10x50) # plot the samples
plotVar(otu_spca_10x50) # plot the variables
# extract the variables used to construct the first PC
selectVar(otu_spca_10x50, comp = 1)$name
# depict weight assigned to each of these variables
plotLoadings(otu_spca_10x50, method = 'mean', contrib = 'max')
# Join the OTU PCA and trait data together
otu_pca_loading_df <- data.frame(plantID = row.names(otu_spca_10x50$x), otu_spca_10x50$x)
traits_otu <- traits[otu_pca_loading_df, on = 'plantID']
fwrite(traits_otu, 'project/data/jin2017/traits_otupca.csv')
# Test relationship with traits -------------------------------------------
traits_otu <- fread('project/data/jin2017/traits_otupca.csv')
# Determine which traits have the highest correlation with the PCA axes
cormat <- cor(traits_otu[, mget(c(trait_names[-1], paste0('PC', 1:10)))], use = 'pairwise.complete.obs')[trait_names[-1], paste0('PC', 1:10)]
sort(apply(abs(cormat), 1, mean))
# Grain weight has one of the highest correlations
lm_test <- lm(log(grain_weight) ~ PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data=traits_otu)
# Try regularization. We do get some nonzero values.
library(glmnet)
set.seed(333)
glmnet_test <- cv.glmnet(x = as.matrix(traits_otu[, mget(paste0('PC', 1:10))]), y = log(traits_otu$grain_weight), alpha = 1, nfolds = 10, type.measure = 'mse',
lambda = seq(0.001, 0.1, by = 0.001))
# Analysis grouped by cultivar --------------------------------------------
# Use only data with >1 observation per cultivar
multiple_obs_cultivars <- unique(traits_otu$cultivar[duplicated(traits_otu$cultivar)])
traits_otu_multipleobs <- traits_otu[cultivar %in% multiple_obs_cultivars]
library(lme4)
lmer_test <- lmer(log(grain_weight) ~ PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10 + (1|cultivar), data=traits_otu_multipleobs)
# BRM model with no missing values ----------------------------------------
# Use similar horseshoe priors
library(brms)
library(rstan)
options(brms.backend = 'cmdstanr', brms.file_refit = 'on_change', mc.cores = 4)
n_PCs <- 10
# Construct priors programmatically because we have one for each taxon.
sd_X_priors <- lapply(1:n_PCs, function(i) prior_string('gamma(1, 1)', class = 'sd', resp = paste0('PC', i)))
sd_X_priors <- do.call(c, sd_X_priors)
sigma_X_priors <- lapply(1:n_PCs, function(i) prior_string('gamma(1, 1)', class = 'sigma', resp = paste0('PC', i)))
sigma_X_priors <- do.call(c, sigma_X_priors)
# Also construct formula programmatically.
X_formula <- paste0('mvbind(', paste(paste0('PC',1:n_PCs), collapse = ','), ') ~ 0 + (1||cultivar)')
y_formula <- paste0('grain_weight ~ ', paste(paste0('PC',1:n_PCs), collapse = '+'), ' + (1||cultivar)')
fit_grainweight_nomiss <- brm(
bf(mvbind(PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10) ~ 0 + (1||cultivar)) + bf(grain_weight ~ PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10 + (1||cultivar)) + set_rescor(FALSE),
prior = c(
sd_X_priors,
sigma_X_priors,
prior(gamma(1, 1), class = sd, resp = grainweight),
prior(gamma(1, 1), class = sigma, resp = grainweight),
prior(horseshoe(df = 1, df_global = 1, scale_slab = 20, df_slab = 10, par_ratio = 1), class = b, resp = grainweight)
),
data = traits_otu_multipleobs,
chains = 4, iter = 7500, warmup = 5000,
init = 0, seed = 1004,
control = list(adapt_delta = 0.95),
file = 'project/fits/brm_grainweight_nomiss'
)
# BRM model with missing values -------------------------------------------
# Artificially generate the missing data
# 50% of the individuals within each cultivar will have missing trait data, and 50% will have missing microbiome data
set.seed(444)
# Function to randomly sample half
samplehalf <- function(n) sample(rep(1:2, times=ceiling(n/2)))[1:n]
traits_otu_withmiss <- copy(traits_otu_multipleobs)
traits_otu_withmiss[, grp := samplehalf(.N) , by = .(cultivar)]
traits_otu_withmiss[grp == 1, grain_weight := NA]
PCcols <- paste0('PC', 1:n_PCs)
traits_otu_withmiss[ , (PCcols) := lapply(.SD, function(x) fifelse(grp == 2, NA_real_, x)), .SDcols = PCcols ]
# Construct formulas programmatically.
Xmiss_formula <- paste0('mvbind(', paste(PCcols, collapse = ','), ') | mi() ~ 0 + (1||cultivar)')
ymiss_formula <- paste0('grain_weight | mi() ~ ', paste(paste0('mi(PC', 1:n_PCs, ')'), collapse = '+'), ' + (1||cultivar)')
fit_grainweight_miss <- brm(
bf(mvbind(PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10) | mi() ~ 0 + (1||cultivar)) + bf(grain_weight | mi() ~ mi(PC1)+mi(PC2)+mi(PC3)+mi(PC4)+mi(PC5)+mi(PC6)+mi(PC7)+mi(PC8)+mi(PC9)+mi(PC10) + (1||cultivar)) + set_rescor(FALSE),
prior = c(
sd_X_priors,
sigma_X_priors,
prior(gamma(1, 1), class = sd, resp = grainweight),
prior(gamma(1, 1), class = sigma, resp = grainweight),
prior(horseshoe(df = 1, df_global = 1, scale_slab = 20, df_slab = 10, par_ratio = 1), class = b, resp = grainweight)
),
data = traits_otu_withmiss,
chains = 4, iter = 7500, warmup = 5000,
init = 0, seed = 1104,
control = list(adapt_delta = 0.95),
file = 'project/fits/brm_grainweight_miss'
)