-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrna-seq_analysis.r
125 lines (95 loc) · 5.88 KB
/
scrna-seq_analysis.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#' scRNA-seq analysis of GSE181919 dataset for Stefan Stoiber, 17.5.2023
#' @author Karolina Trachtova
#'
library(Seurat)
library(ComplexHeatmap)
library(gridExtra)
library(cowplot)
library(gplots)
project_dir <- "~/stefan_scRNA-seq_re-analysis"
################################################################################
# Load input data
################################################################################
# counts downloaded from GEO
# to run the following code, first download files (GSE181919_UMI_counts.txt and GSE181919_Barcode_metadata.txt)
# from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE181919
# place these files to the ./data folder
# unzip counts matrix -> might take some time or do it manually in the terminal
scrnaseq_counts <- read.table(paste0(project_dir,"/data/GSE181919_UMI_counts.txt"), sep="\t", header=T)
# metadata, downloaded from GEO
metadata <- read.table(paste0(project_dir,"/data/GSE181919_Barcode_metadata.txt"), sep="\t", heade=T)
# Tumor Initiating Cells -> list of marker genes, derived from the picture from Schramek Lab
TICgenes <- c("CD44", "NGFR", "TP63", "ETS2", "ITGA6", "WNT10A", "MYC", "JUN", "FOS", "KRT5", "KRT14", "KRT10", "KRT1", "MKI67", "CCNA2", "CCNB1", "CCNB2", "KLF4", "PDK1")
SHHgenes <- c("BMP2", "BMP4", "BMP5", "BMP6", "BMP7", "BMP8A", "BMP8B", "BTRC", "CSNK1A1", "CSNK1A1L", "CSNK1D", "CSNK1E", "CSNK1G1", "CSNK1G2", "CSNK1G3", "DHH", "FBXW11", "GAS1", "GLI1", "GLI2", "GLI3", "GSK3B", "HHIP", "IHH", "LRP2", "PRKACA", "PRKACB", "PRKACG", "PRKX", "PTCH1", "PTCH2", "RAB23", "SHH", "SMO" , "STK36" , "SUFU" , "WNT1" , "WNT10A" , "WNT10B" , "WNT11" , "WNT16", "WNT2", "WNT2B", "WNT3", "WNT3A", "WNT4", "WNT5A", "WNT5B", "WNT6", "WNT7A", "WNT7B", "WNT8A", "WNT8B", "WNT9A", "WNT9B", "ZIC2")
# check which SHH genes are not in the GSE181919 dataset
SHHgenes[!SHHgenes %in% rownames(scrnaseq)]
# [1] "BMP8A" "IHH" "LRP2" "PRKACG" "SHH" "WNT1" "WNT8A" "WNT8B" "WNT9B"
################################################################################
# Data preparation
################################################################################
# filter out tissue specimens we do not want -> keep just primary cancer cells ("CA")
metadata_CA <- metadata[metadata$tissue.type %in% c("CA"),]
rownames(metadata_CA) <- gsub("-","\\.",rownames(metadata_CA))
dim(metadata_CA) # 23088 cells are "CA"
# subset also the initial expression matrix
scrnaseq_CA <- scrnaseq[,colnames(scrnaseq) %in% rownames(metadata_CA)]
dim(scrnaseq_CA)
# create Seurat object
scrnaseq_obj <- CreateSeuratObject(scrnaseq_CA, project = "SeuratProject", assay = "RNA",
min.cells = 0, min.features = 0, names.field = 1,
names.delim = ".", meta.data = metadata_CA)
################################################################################
# FirstQC + pre-filtering
################################################################################
# create slot for MT RNA
scrnaseq_obj[["percent.mt"]] <- PercentageFeatureSet(scrnaseq_obj, pattern = "^MT-")
VlnPlot(scrnaseq_obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
plot1 <- FeatureScatter(scrnaseq_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(scrnaseq_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
# Pre-filtering -> if I wanted to filter out more cells it would be done like this:
#hnscc <- subset(scrnaseq_obj, subset = percent.mt < 10)
################################################################################
# Normalization
################################################################################
hnscc_allFeatures <- SCTransform(scrnaseq_obj, vars.to.regress = "percent.mt", verbose = FALSE, return.only.var.genes = F)
################################################################################
# UMAP + heatmap plotting
################################################################################
# calculate SHH enrichment
hnscc_allFeatures <- AddModuleScore(hnscc_allFeatures,
features = list(SHHgenes),
name="SHH_enriched", ctrl = 10)
dim <- 10
res <- 0.05
hnscc_sb <- subset(x = hnscc_allFeatures, features = TICgenes)
hnscc_sb <- RunPCA(hnscc_sb, verbose = FALSE, approx=FALSE)
hnscc_sb <- RunUMAP(hnscc_sb, dims = 1:dim, verbose = FALSE)
hnscc_sb <- FindNeighbors(hnscc_sb, dims = 1:dim, verbose = FALSE)
hnscc_sb <- FindClusters(hnscc_sb, verbose = FALSE, resolution = res)
# Plot UMAP of TIC genes only
p1 <- DimPlot(hnscc_sb, label = TRUE) + ggtitle(paste0("dim 1:", dim, ", resolution=",res))
df_mean <- list()
for(i in 0:(length(levels(hnscc_sb@meta.data$seurat_clusters))-1)){
clstr <- hnscc_sb@assays$SCT[,colnames(hnscc_sb@assays$SCT) %in% rownames(hnscc_sb@meta.data[hnscc_sb@meta.data$seurat_clusters == i,])]
tmp <- as.data.frame(rowMeans(clstr))
colnames(tmp) <- paste0("c",i)
df_mean <- c(df_mean, list(tmp))
}
df <- do.call("cbind", df_mean)
# plot heatmap of clusters expression
p2 <- grid.grabExpr(draw(ComplexHeatmap::pheatmap(df, scale="row",
cluster_cols = F,
cluster_rows = F,
color = bluered(201),
border_color = NA,
gaps_row = c(13),
column_names_side = c("top"), angle_col = c("0"),
main = "TIC genes (average)")))
print(plot_grid(p1, p2, nrow=1))
# plot SHH enrichment
p3 <- FeaturePlot(hnscc_sb,
features = "SHH_enriched1", label = TRUE, repel = TRUE,
order = T,
min.cutoff = 0)
print(plot_grid(p1, p2, p3, nrow=1))