-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvdj_gene_comparison_functions.R
181 lines (150 loc) · 6.24 KB
/
vdj_gene_comparison_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
library(dplyr)
library(ggplot2)
library(readr)
library(tibble)
library(tidyr)
##-------
# VDJ gene comparison functions for CCR4 mouse TCR-seq project
##-------
get_gene_usage <- function(dir_path, md_path, gene = "V") {
  ## Takes iRepertoire raw sequencing data and produces a combined data frame
  ## of gene usage for all samples.
  ##
  ## dir_path: directory whose immediate sub-directories are scanned; each one
  ##           must contain at least two files whose names match the pattern
  ##           "<gene>_<digit>_usage.csv".
  ## md_path:  path to a metadata CSV; it is joined on its `samples` column.
  ## gene:     gene prefix of the usage files to load (default "V").
  ##
  ## Returns a data frame with the columns `gene`, `score`, `samples`
  ## (file-derived id prefixed with "s_"), `origin` (factor: 0 for the first
  ## matching file listed in a directory, 1 for the second), every metadata
  ## column, and `gene_family` (the gene name with each "-<suffix>" removed).
  ## Every NA remaining after the metadata join is replaced by 0.
  ##
  ## Stops when `dir_path` has no sub-directories or when a sub-directory
  ## holds fewer than two matching usage files.
  dirs_out <- list.dirs(dir_path, full.names = FALSE, recursive = FALSE)
  if (length(dirs_out) == 0) {
    stop("no sample directories found in '", dir_path, "'", call. = FALSE)
  }

  ## Reads one header-less usage file and tags it with its sample id and origin.
  read_usage <- function(sample_dir, file_name, origin_label) {
    read_csv(paste0(sample_dir, "/", file_name),
             col_names = FALSE, show_col_types = FALSE) %>%
      mutate(sample = gsub("_\\w+.csv", "", file_name),
             origin = as.factor(origin_label)) %>%
      setNames(nm = c("gene", "score", "samples", "origin"))
  }

  out_list0 <- vector("list", length = length(dirs_out))
  out_list1 <- out_list0
  ## seq_along() instead of 1:length(): never iterates over c(1, 0)
  for (i in seq_along(dirs_out)) {
    current_dir <- paste0(dir_path, "/", dirs_out[i])
    gene_files <- list.files(current_dir,
                             pattern = paste0(gene, "_\\d_usage.csv"))
    if (length(gene_files) < 2) {
      stop("expected at least 2 '", gene, "' usage files in '", current_dir,
           "' but found ", length(gene_files), call. = FALSE)
    }
    out_list0[[i]] <- read_usage(current_dir, gene_files[1], 0)
    out_list1[[i]] <- read_usage(current_dir, gene_files[2], 1)
  }

  ## All origin-0 tables first, then all origin-1 tables.
  combined_df <- do.call(rbind, c(out_list0, out_list1))
  combined_df$samples <- paste("s", combined_df$samples, sep = "_")

  ## Adding metadata
  metadata <- read_csv(md_path, show_col_types = FALSE)
  combined_df <- left_join(combined_df, metadata, by = "samples") %>%
    mutate(across(everything(), ~replace(.x, is.na(.x), 0)))
  ## Inside mutate() `gene` is the data column, not the `gene` argument.
  combined_df <- combined_df %>%
    mutate(gene_family = gsub("-\\w+", "", gene))
  return(combined_df)
}
score_gene_usage <- function(df, new_name, group_cols = "gene",
                             score_col = "score") {
  ## Flexible function to score gene usage across groups.
  ##
  ## df:         data frame holding the grouping and score columns.
  ## new_name:   name (string or symbol) of the column that receives the sum.
  ## group_cols: character vector of columns that define the groups.
  ## score_col:  name of the column that is summed within each group.
  ##
  ## Returns `df`, ungrouped, with one extra column `new_name` that repeats the
  ## per-group sum of `score_col` on every row of the group. The sum is written
  ## straight into `new_name`, so no temporary `sum_score` column is created
  ## and an existing `sum_score` column in `df` is left untouched. If
  ## `new_name` already exists in `df` it is replaced.

  ## sym() keeps the original input contract: a single string or a symbol.
  target <- as.character(sym(new_name))
  df <- df %>%
    group_by(across(all_of(group_cols))) %>%
    mutate("{target}" := sum(.data[[score_col]])) %>%
    ungroup()
  return(df)
}
# load_genes <- function(path){
# df <- read_csv(path, show_col_types= FALSE) %>%
# mutate(score= as.numeric(score))
# return(df)
# }
fisher_data_prep <- function(df, gene_col, names_from, values_from) {
  ## Reshapes long gene data into a wide table with one row per gene.
  ##
  ## df:          long-format data frame.
  ## gene_col:    name of the column whose values become the row names.
  ## names_from:  column(s) whose values become the new column names.
  ## values_from: column(s) whose values fill the new columns.
  ##
  ## Returns a data frame (row names taken from `gene_col`) in which every
  ## value has been multiplied by 10 and rounded to a whole number.
  ## NOTE(review): the x10 scaling presumably turns one-decimal scores into
  ## count-like integers for the downstream tests -- confirm.
  wanted <- c(gene_col, names_from, values_from)

  ## Keep only the relevant columns and drop repeated rows before widening.
  long_unique <- distinct(select(df, all_of(wanted)))

  wide <- pivot_wider(long_unique,
                      names_from = all_of(names_from),
                      values_from = all_of(values_from))
  wide <- column_to_rownames(wide, gene_col)

  fisher_data <- mutate(wide, across(everything(), ~round(.x*10)))
  fisher_data
}
lfc <- function(df, split_col, constant_col, score_col,
                constant_val, pseudo_zero = 1e-6) {
  ## Log fold change function for working with formatted differential gene
  ## expression data.
  ##
  ## df:           data frame with a `gene` column plus the columns named below.
  ## split_col:    name of the column holding the two conditions to compare;
  ##               it must contain exactly 2 unique values across `df`.
  ## constant_col: name of the column used to restrict the rows.
  ## score_col:    name of the numeric score column.
  ## constant_val: rows are kept only where `constant_col == constant_val`.
  ## pseudo_zero:  small constant added before log2() so that zero scores do
  ##               not go to -Inf.
  ##
  ## Returns a data frame with `gene`, `constant_col`, one `log_score_<value>`
  ## column per split value, and `lfc`: the log2 score of the split value that
  ## appears first in `df` minus that of the split value that appears second.
  ##
  ## This changes the shape of the df. If you want to add the logFC back to
  ## your main df, do pivot_longer and add back with a join.
  split_levels <- unique(df[, split_col, drop = TRUE])
  if (length(split_levels) != 2) {
    ## The check fails for fewer as well as for more than 2 values.
    stop("`split_col` must contain exactly 2 unique values, found ",
         length(split_levels), ".")
  }
  log_cols <- paste0("log_score_", split_levels)

  result <- df %>%
    as.data.frame() %>%
    select(gene, all_of(c(split_col, constant_col, score_col))) %>%
    distinct() %>%
    ## .data[[name]] looks a column up by its string name
    filter(.data[[constant_col]] == constant_val) %>%
    ## have to add a small constant to prevent zeros from going to infinity
    mutate(log_scores = log2(.data[[score_col]] + pseudo_zero)) %>%
    ## Making two columns for the different log values.
    pivot_wider(id_cols = all_of(c("gene", constant_col)),
                values_from = log_scores,
                names_from = all_of(split_col),
                names_prefix = "log_score_") %>%
    mutate(lfc = .data[[log_cols[1]]] - .data[[log_cols[2]]]) %>%
    as.data.frame()
  return(result)
}
VDJ_usage_DE_formatter <- function(df, score_col) {
  ## Reshapes long gene-usage data into one row per gene for differential
  ## expression style analyses.
  ##
  ## df:        long data frame with the columns `gene`, `cell_type`,
  ##            `phenotype`, `samples` and the column named by `score_col`.
  ## score_col: name (string) of the column whose values fill the wide table.
  ##
  ## Returns a wide data frame whose first column `X` holds the gene and whose
  ## remaining columns are named from cell_type, phenotype and samples
  ## (joined by pivot_wider's default "_" separator), with "s_" then removed
  ## from the column names.
  wide_df <- df %>%
    pivot_wider(id_cols = "gene",
                names_from = c("cell_type", "phenotype", "samples"),
                ## all_of(): `score_col` is a string held in a variable, not
                ## a column of `df`
                values_from = all_of(score_col)) %>%
    rename(X = gene)
  ## NOTE(review): gsub() removes every "s_" occurrence, not only a leading
  ## sample prefix (e.g. "cells_a" becomes "cella") -- confirm this is intended.
  colnames(wide_df) <- gsub("s_", "", colnames(wide_df))
  return(wide_df)
}
widyr_wilcox <- function(df, group1, group2) {
  ## Row-wise Wilcoxon test on a wide data frame.
  ##
  ## df:     wide data frame, one row per feature to test.
  ## group1: string; columns whose names contain it form the first group.
  ## group2: string; columns whose names contain it form the second group.
  ##
  ## Returns a numeric vector with one Benjamini-Hochberg adjusted p-value per
  ## row of `df` (length 0 for a zero-row `df`).
  rows <- nrow(df)

  ## The matching columns are the same for every row, so resolve them once
  ## instead of re-running the selection inside the loop.
  x_cols <- df %>% select(contains(group1))
  y_cols <- df %>% select(contains(group2))

  ## seq_len() yields an empty sequence for zero rows (1:0 would give 1, 0).
  p_vals <- vapply(seq_len(rows), function(row) {
    x <- as.numeric(x_cols[row, , drop = FALSE])
    y <- as.numeric(y_cols[row, , drop = FALSE])
    ## wilcox.test() warnings are silenced, as before
    suppressWarnings(wilcox.test(x = x, y = y)$p.value)
  }, numeric(1))

  p_vals <- p.adjust(p = p_vals, method = "BH", n = rows)
  return(p_vals)
}
widyr_fisher <- function(df, group1, group2) {
  ## Row-wise Fisher test (simulated p-value, B = 2000) on a wide data frame.
  ##
  ## df:     wide data frame, one row per feature to test.
  ## group1: string; columns whose names contain it form the first group.
  ## group2: string; columns whose names contain it form the second group.
  ##
  ## Returns a numeric vector with one Benjamini-Hochberg adjusted p-value per
  ## row of `df` (length 0 for a zero-row `df`). Rows where either group holds
  ## a single unique value are not tested and stay NA. The adjustment uses
  ## n = nrow(df), so those skipped rows still count as comparisons.
  ## Results vary between runs unless the caller fixes the random seed,
  ## because the p-values are simulated.
  rows <- nrow(df)

  ## The matching columns are the same for every row, so resolve them once
  ## instead of re-running the selection inside the loop.
  x_cols <- df %>% select(contains(group1))
  y_cols <- df %>% select(contains(group2))

  ## seq_len() yields an empty sequence for zero rows (1:0 would give 1, 0).
  p_vals <- vapply(seq_len(rows), function(row) {
    x <- as.numeric(x_cols[row, , drop = FALSE])
    y <- as.numeric(y_cols[row, , drop = FALSE])
    if (length(unique(x)) == 1 || length(unique(y)) == 1) {
      ## fisher.test doesn't work if there aren't 2 unique values for each group
      return(NA_real_)
    }
    fisher.test(x = x, y = y, simulate.p.value = TRUE, B = 2000)$p.value
  }, numeric(1))

  p_vals <- p.adjust(p = p_vals, method = "BH", n = rows)
  return(p_vals)
}
get_gene_reads <- function(dir_path, md_path) {
  ## Gets all of the raw data that we need for differential expression from
  ## all TCR-seq samples.
  ##
  ## dir_path: directory whose immediate sub-directories are scanned; each one
  ##           must contain exactly one file matching "_CDR3_list_2.csv".
  ## md_path:  path to a metadata CSV; it is joined on its `samples` column.
  ##
  ## Returns a data frame with the columns `X`, `v_gene`, `j_gene`, `reads`,
  ## `samples` (file-derived id prefixed with "s_"), every metadata column and
  ## `v_subgroup` (derived from `v_gene`). Every NA remaining after the
  ## metadata join is replaced by 0.
  ##
  ## Stops when `dir_path` has no sub-directories or when a sub-directory does
  ## not hold exactly one matching CDR3 file.
  dirs_out <- list.dirs(dir_path, full.names = FALSE, recursive = FALSE)
  if (length(dirs_out) == 0) {
    stop("no sample directories found in '", dir_path, "'", call. = FALSE)
  }

  out_list <- vector("list", length = length(dirs_out))
  ## seq_along() instead of 1:length(): never iterates over c(1, 0)
  for (i in seq_along(dirs_out)) {
    current_dir <- paste0(dir_path, "/", dirs_out[i])
    gene_file <- list.files(current_dir, pattern = "_CDR3_list_2.csv")
    if (length(gene_file) != 1) {
      stop("expected exactly 1 CDR3 list file in '", current_dir,
           "' but found ", length(gene_file), call. = FALSE)
    }
    out_list[[i]] <- read_csv(paste0(current_dir, "/", gene_file),
                              col_names = FALSE, show_col_types = FALSE) %>%
      setNames(nm = c("X", "v_gene", "j_gene", "reads")) %>%
      mutate(samples = gsub("_\\w+.csv", "", gene_file))
  }

  combined_df <- do.call(rbind, out_list)
  combined_df$samples <- paste("s", combined_df$samples, sep = "_")

  ## Adding metadata
  metadata <- read_csv(md_path, show_col_types = FALSE)
  combined_df <- left_join(combined_df, metadata, by = "samples") %>%
    mutate(across(everything(), ~replace(.x, is.na(.x), 0)))
  combined_df <- combined_df %>%
    mutate(v_subgroup = gsub("(\\w+\\d+).*", "\\1", v_gene))
  ## There are no IMGT mTRAJ subgroups.
  return(combined_df)
}