-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhbv.R
122 lines (97 loc) · 3.72 KB
/
hbv.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
library(GEOquery)
## Array quality in mice studies was assessed with the arrayQualityMetrics package of Bioconductor (http://www.bioconductor.org). Data were normalized and logged-2 transformed using the RMA algorithm.
hbv_raw_data <- getGEO(filename = './raw_data/GSE25097_family.soft')
# dir.create('./data')
# save.image('./data/data.rdata')
gsmlist <- GSMList(hbv_raw_data)
# rm(hbv_raw_data)
gc()
probe_ids <- as.vector(Table(gsmlist[[1]])$ID_REF)
sample_names <- sapply(1:length(gsmlist), function(ii){
names(gsmlist[ii])
})
matrix_data <- ldply(1: length(gsmlist), .progress = 'text', .fun = function(ii){
tab <- Table(gsmlist[[ii]])
id_match <- match(tab$ID_REF, probe_ids)
tab$VALUE[id_match] %>% as.numeric()
})
matrix_data <- t(matrix_data)
colnames(matrix_data) <- sample_names
rownames(matrix_data) <- probe_ids
# expriment design ----
experTable <- ldply(1: length(gsmlist), function(ii){
c(names(gsmlist[ii]), gsmlist[[ii]]@header$source_name_ch1)
})
phenoData()
# mapping gene ----
matrix_data_df <- matrix_data %>% as.data.frame()
# matrix_data_df$probe_id <- rownames(matrix_data)
# hcc & nohcc
hbv_data <- matrix_data_df[, c(290:557, 7:46)]
colnames(matrix_data_df)
# 差异共表达分析 DCGL
hbv_data <- log2(hbv_data)
library(DCGL)
# 方差对探针进行过滤
var_gene <- varianceBasedfilter(hbv_data, 0.01)
var_gene_name <- rownames(var_gene)
hbv_hcc <- hbv_data[, 1:268]
hbv_nohcc <- hbv_data[, 269: 308]
# 差异表达
# 添加差异表达探针
library(limma)
# browseVignettes('limma')
tinyCond <- c(rep(1,268),rep(2,40)) # 1是肝癌,2是肝硬化
design <- model.matrix(~ 0+factor(c(rep(1, 268),rep(2, 40))))
colnames(design) <- c("hcc", "nohcc")
fit <- lmFit(hbv_data, design)
contrast.matrix <- makeContrasts(hcc-nohcc, levels=design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
dc_raw <- topTable(fit2, adjust="BH", number = 100000)
dc_raw$gene <- rownames(dc_raw)
dc_raw_filter <- filter(dc_raw, adj.P.Val<0.01)
dc_raw_gene <- dc_raw_filter$gene
# 两者合并
raw_gene <- c(var_gene_name, dc_raw_gene) %>% unique
raw_gene_df <- data.frame(gene = raw_gene, stringsAsFactors = F)
hbv_data_1 <- hbv_data[raw_gene, ]
hbv_hcc <- hbv_data_1[, 1:268]
hbv_nohcc <- hbv_data_1[, 269: 308]
save.image('./data/hbv.rda')
DCp.res <- DCp(hbv_hcc, hbv_nohcc,
r.method = c("pearson", "spearman")[1],
link.method = c("qth", "rth", "percent")[1],
cutoff = 0.01,
N=100,
N.type = c("pooled", "gene_by_gene")[2],
q.method = c("BH", "holm", "hochberg", "hommel", "bonferroni","BY", "fdr")[1])
View(DCp.res)
save(DCp.res, file = './data/DCp_res_hbv.rdata')
DCe.res <- DCe(exprs_hcc, exprs_nohcc,
link.method = c("qth", "rth", "percent")[1],
cutoff = 0.01,
r.method = c("pearson", "spearman")[1],
q.method = c("BH", "holm", "hochberg", "hommel","bonferroni", "BY", "fdr")[1],
nbins = 20, p = 0.1)
# colnames(DCp.res)[3:4] <- colnames(DCp.res)[4:3]
# 修改包数据, 无需pvalue校正
DCp.res <- DCp.res[, c(1, 2, 4, 3)]
colnames(DCp.res)[3:4] <- colnames(DCp.res)[4:3]
DCsum.res <- DCsum(DCp.res, DCe.res,
DCpcutoff = 0.01,
DCecutoff = 0.01)
# 差异共表达基因与差异表达基因合并
dce <- DCsum.res$DCGs
dce_pvalue <- dce %>% left_join(dc_fc, by = c('DCG' = 'gene'))
DCsum.res$DCGs %>% View
DCsum.res$DCLs %>% nrow
# colnames(DCsum.res$DCGs)
dcg <- DCsum.res$DCGs
dc_link <- DCsum.res$DCLs
# 合并DCp与差异表达
DCp.res$DCG <- rownames(DCp.res)
dcp_de <- left_join(DCp.res, dc_fc, by = c('DCG' = 'gene'))
# 寻找 TF 调控关系
data(tf2target)
DRsort.res <- DRsort(DCsum.res$DCGs, DCsum.res$DCLs, tf2target, D2)