-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.R
58 lines (44 loc) · 1.37 KB
/
preprocess.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
library(readr)
library(dplyr)
library(tidyr)
library(purrr)
library(reshape)
library(MatrixGenerics)
library(ggplot2)
library(ggrepel)
library(patchwork)
library(ComplexHeatmap)
library(tibble)
library(doParallel)
library(stringr)
# Reference table read from data/mags.csv; it is joined on its Bin column
# further down.
df <- read_csv2("data/mags.csv")

# One table per input file data/1.csv ... data/9.csv, each tagged with the
# number of the file it was read from in a `sample` column.
files <- map(seq_len(9), function(sample_id) {
  sample_tbl <- read_csv2(paste0("data/", sample_id, ".csv"))
  mutate(sample_tbl, sample = sample_id)
})
# Combine the per-sample tables into one table, drop rows containing NA and
# remove the text ".fna" from the genome names.
# fixed = TRUE makes ".fna" a literal string: as a regular expression the "."
# matches any character, so e.g. "xfna" inside a name was removed as well.
# The result is left grouped by genome.
micro_df <- merge_all(files) %>%
  na.omit() %>%
  mutate(genome = gsub(".fna", "", genome, fixed = TRUE)) %>%
  group_by(genome)

# Coverage summary with one row per genome (micro_df is grouped by genome):
# mean, minimum and maximum of `coverage` over all rows of that genome.
micro_stats <- micro_df %>%
  summarise(
    MeanCov = mean(coverage),
    MinCov = min(coverage),
    MaxCov = max(coverage)
  )
# Attach the columns of the reference table to the per-genome coverage
# summary, matching micro_stats$genome against df$Bin. Every genome in
# micro_stats is kept; `df` is overwritten with the joined result.
df <- left_join(micro_stats, df, by = c("genome" = "Bin"))
# Matrix built from every column of df whose name starts with "RPKM".
rpkm_mat <- df %>%
  select(starts_with("RPKM")) %>%
  as.matrix()

# Replace zeros with a pseudo-count of half the smallest positive value so
# that later divisions by these values are defined.
# NA cells are skipped both when locating zeros (which) and when taking the
# minimum (na.rm); without na.rm a single NA cell would make the pseudo-count
# NA and turn every zero into NA.
rpkm_mat[which(rpkm_mat == 0)] <-
  min(rpkm_mat[rpkm_mat > 0], na.rm = TRUE) / 2

# NA-filled container with one column fewer than rpkm_mat, i.e. one column
# per pair of neighbouring RPKM columns.
rpkm_shift_mat <- matrix(nrow = nrow(rpkm_mat), ncol = ncol(rpkm_mat) - 1)
# Every column of df whose name starts with "nucdiv" (kept as a data frame,
# not converted to a matrix).
nucdiv_mat <- select(df, starts_with("nucdiv"))

# Fill missing cells with half of the smallest observed value.
nucdiv_missing <- is.na(nucdiv_mat)
nucdiv_mat[nucdiv_missing] <- min(nucdiv_mat[!nucdiv_missing]) / 2

# NA-filled container with one column fewer than nucdiv_mat.
# NOTE(review): nothing in the visible script fills or uses this matrix.
nucdiv_shift_mat <- matrix(
  nrow = nrow(nucdiv_mat),
  ncol = ncol(nucdiv_mat) - 1
)
# Ratio between neighbouring RPKM columns: column k of rpkm_shift_mat is
# RPKM column k + 1 divided by RPKM column k, for all rows at once.
# drop = FALSE keeps both operands matrices even with a single row or a
# single remaining column.
rpkm_shift_mat <- rpkm_mat[, -1, drop = FALSE] /
  rpkm_mat[, -ncol(rpkm_mat), drop = FALSE]

# Name the ratio columns Ratio_1, Ratio_2, ... according to how many ratios
# there actually are instead of assuming exactly eight. sprintf() returns an
# empty vector when there are no ratio columns.
colnames(rpkm_shift_mat) <- sprintf(
  "Ratio_%d",
  seq_len(ncol(rpkm_shift_mat))
)

# Append the ratio columns to the main table.
df <- cbind(df, as.data.frame(rpkm_shift_mat))