-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCluster.Rmd
145 lines (122 loc) · 3.64 KB
/
Cluster.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
---
title: "BloodPAC: MTDE Clustering Exercise"
output:
  html_document:
    css: custom.css
    highlight: zenburn
    theme: lumen
---
## Goal
To answer the question: "which projects have the most similar MTDE submissions?"
## Environment Prep
```{r, message=FALSE}
# Packages this notebook attaches, in the order they are loaded.
required_packages <- c("plyr", "ggfortify", "pheatmap", "grid", "DT")
for (pkg in required_packages) {
  # Install only when the package is missing, then always attach it.
  # `if (!require(pkg)) install.packages(pkg)` installs a missing package but
  # never attaches it, so the first run after an install fails further down.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
# NOTE(review): credentials.R presumably defines the `username` and `password`
# variables that GetTsvHelper reads -- confirm against that file.
source('credentials.R')
```
## Data Prep
### Helper: Get Counts TSVs for Each MTDE
```{r}
GetTsvHelper <- function(tsv, remove) {
  # Downloads one MTDE counts table over HTTPS and drops unwanted projects.
  #
  # Reads the variables `username` and `password` from the enclosing
  # environment to build the URL; they are not defined in this function
  # (presumably they come from credentials.R -- confirm).
  #
  # Args:
  #   tsv: file name of the TSV to fetch from services.bloodpac.org
  #   remove: project names to exclude (projects that have been created but
  #     have not completed submission)
  #
  # Returns:
  #   data.frame with the TSV contents, minus the rows whose Project value is
  #   listed in `remove`
  #
  # Raises:
  #   an error when the downloaded table has no Project column
  finalURL <- paste0("https://", username, ":", password,
                     "@services.bloodpac.org/", tsv)
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  data <- read.table(finalURL, sep = '\t', header = TRUE)
  # Without this guard a table lacking a Project column would make the filter
  # below silently return zero rows.
  if (is.null(data$Project)) {
    stop("'", tsv, "' has no 'Project' column", call. = FALSE)
  }
  data[!data$Project %in% remove, ]
}
```
### Import Data
```{r}
# MTDE names. Each one becomes the name of the data frame that holds the
# counts read from the TSV at the same position in `tsvs`.
mtde <- c("hrs.to.fractionation",
          "storage.temperature",
          "analyte.isolation.method",
          "shipping.temperature",
          "assay.method",
          "sample.type",
          "quantification.method",
          "time.to.freezer",
          "composition",
          "tube.type") #,
          #"dna.concentration")
# TSV file names, in the same order as `mtde`.
tsvs <- c("table_hours_to_fractionation.tsv",
          "table_storage_temperature.tsv",
          "table_analyte_isolation_method.tsv",
          "table_shipping_temperature.tsv",
          "table_assay_method.tsv",
          "table_clinical_or_contrived.tsv",
          "table_quantification_assay.tsv",
          "table_hours_to_freezer.tsv",
          "table_composition.tsv",
          "table_blood_tube_type.tsv") #,
          #"table_molecular_concentration.tsv")
# Projects that have been created but have no submission yet; they are
# dropped from every table.
remove <- c("MSKCC_P0002_T1",
            "MSKCC_P0003_T1",
            "MSKCC_P0004_T1",
            "Novartis_Contrived2_T1")
# The two vectors are paired by position, so a length mismatch would bind a
# table to the wrong name or request an NA file name.
stopifnot(length(mtde) == length(tsvs))
# get dfs for each MTDE
# seq_along() rather than 1:length(): the latter iterates over c(1, 0) when
# the vector is empty.
for (i in seq_along(mtde)) {
  assign(mtde[i], GetTsvHelper(tsvs[i], remove))
}
```
### Helper: Combine Desired MTDE Counts to Prep for Clustering
```{r}
SelectAndCombine <- function(list) {
  # Merges the named MTDE count tables into a single project-by-count table.
  #
  # Args:
  #   list: character vector with the names of the MTDE data frames to
  #     combine; each name is resolved with get() and the resulting table
  #     must contain Organization and Project columns
  #
  # Returns:
  #   data.frame with one row per project (row names are the Project values),
  #   without the Organization and Project columns and without columns whose
  #   values are all zero. Always a data.frame, even when only one column is
  #   left.
  #
  # Raises:
  #   an error when `list` is empty or a key column is missing
  keys <- c("Organization", "Project")
  if (length(list) == 0) {
    stop("SelectAndCombine() needs at least one MTDE name", call. = FALSE)
  }
  # The closure makes get() start its lookup from this function's frame, the
  # same place a direct get(name) call written here would start.
  tables <- lapply(list, function(name) get(name))
  merged <- Reduce(function(x, y) merge(x, y, by = keys), tables)
  absent <- setdiff(keys, names(merged))
  if (length(absent) > 0) {
    stop("missing key column(s): ", paste(absent, collapse = ", "),
         call. = FALSE)
  }
  # Drop the key columns by name instead of by position, so a single input
  # table does not have to list Organization and Project first.
  counts <- merged[, !names(merged) %in% keys, drop = FALSE]
  # remove columns with no values; na.rm keeps a missing count from turning
  # the column test into NA, which would break the column selection.
  keep <- colSums(counts != 0, na.rm = TRUE) > 0
  # drop = FALSE: with one surviving column the result must stay a
  # data.frame so the row names (projects) are preserved.
  counts <- counts[, keep, drop = FALSE]
  rownames(counts) <- merged[["Project"]]
  counts
}
```
## Prep
```{r}
# Combine every MTDE table named in `mtde` into one table for the distance
# computation in the next section.
data <- SelectAndCombine(mtde)
```
## Jaccard Distance - Binarize
```{r}
# Pairwise distances between the rows of `data` using dist()'s "binary"
# method, then expanded from a dist object into a full square matrix.
d <- dist(data, method = "binary")
d <- as.matrix(d)
# Render the distance matrix as a table in the knitted output.
datatable(d)
```
## Visualize
```{r, message=F, warning=F, fig.align='center'}
# Fixed RNG seed (presumably so the k-means step requested through kmeans_k
# below gives the same clusters on every knit -- confirm).
set.seed(32)

# Replacement column-label drawer for pheatmap that rotates the labels.
#
# Args:
#   coln: column labels to draw
#   gaps: gap positions, forwarded to pheatmap:::find_coordinates
#   ...: graphical parameters forwarded to gpar()
#
# Returns:
#   a text grob with the labels rotated by 60 degrees (note: the function
#   name says 45, the rotation used is 60)
draw_colnames_45 <- function(coln, gaps, ...) {
  coord <- pheatmap:::find_coordinates(length(coln), gaps)
  label_x <- coord$coord - 0.5 * coord$size
  label_y <- unit(1, "npc") - unit(3, "bigpts")
  textGrob(coln, x = label_x, y = label_y, vjust = 0.5, hjust = 1, rot = 60,
           gp = gpar(...))
}

# Overwrite the draw_colnames binding inside the pheatmap namespace.
# NOTE(review): the value is the function's name as a string, not the function
# object; this relies on how pheatmap invokes draw_colnames -- confirm before
# changing it.
assignInNamespace(x = "draw_colnames", value = "draw_colnames_45",
                  ns = asNamespace("pheatmap"))

pheatmap(d, kmeans_k = 5,
         main = "BloodPAC MTDEs: Project Similarity \n Binary | Jaccard Distance Measure")
```