-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2_SAEGUS_to_MACH_format.R
188 lines (155 loc) · 8.96 KB
/
2_SAEGUS_to_MACH_format.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# This takes output from SAEGUS and formats for MACH based on missingness data and parent genotypes
# Requires sim output from SAEGUS and parent key data (see file formats in README)
# Required Packages
library("tidyr")
library("gdata")
library("data.table")
# User inputs
# Working directory
wd <- "~/working/directory/"
# Simulated GT matrix
sd <- "known_GT.csv"
#Simulated Parent_of_origin
ld <- "known_parent_of_origin.csv"
# Name of parent data key
fd <- "parent_key_vcf.txt"
# Number of chromosomes
chrom <- 10
# Name of Population (for MACH output)
popID <- "Pop"
# Sample number
sn <- 1000
#Rsq Threshold (for imputation output)
rsq <- 0.8
#Genotyping Error
g_error <- 0.006
# Set working directory, which contains the simulated output from SAEGUS and the parent key data
setwd(wd)
#write file that contains all user inputs for downstream scripts to use
write.table(rbind(wd,sd,ld,fd,chrom,popID,sn,rsq),"user_input.txt", col.names = F, sep="\t")
# parent Data
parents <- read.table(fd, head=T, stringsAsFactors = FALSE,sep='\t') #parent data key for GT info and marker info
parents <- rename.vars(parents,"chr","CHROM") #Rename because SAEGUS uses "chr" as the variable name
# SimuPop Output
sim <- read.csv(sd,head=T, stringsAsFactors = FALSE) #Reads in GT output from 1_SAEGUS.py
#Collapse alleles into Genotypes and transpose for MACH
out <- sim[1]
out[2:(((ncol(sim)-1)/2)+1)] <- lapply(seq(2, ncol(sim), 2),
function(i) do.call(paste0, c(sim[i],"/",sim[(i+1)])))
sim <- as.data.frame(t(out[2:ncol(out)]))
#Rename samples to sim.sample#
names(sim) <- paste("sim.sample",out$ind_id,sep="")
#Concatenate snpID, CHROM, POS, cM, and F_MISS data from parent key
sim <- cbind(parents[,1:7],sim)
known <- sim
#Genotyping error
#1 Create a dataframe (num_loci x num_indv) filled with random sampling from uniform distribution
err_df <- as.data.frame(matrix(runif(nrow(parents)*sn),nrow(parents),sn))
#2 sample GTs and change
#index positions in err_df < user defined ERR at that locus
gt_change <- which(err_df<g_error, arr.ind = T)
#locate indexed positions in known data and randomly change to one of two remaining GTs (equal prob)
sim[,8:ncol(sim)][gt_change] <- ifelse(sim[,8:ncol(sim)][gt_change]=="0/0", sample(c("0/1","1/1"),1), ifelse(sim[,8:ncol(sim)][gt_change]=="1/1", sample(c("0/0","0/1"),1), sample(c("0/0","1/1"))))
#Calculate realized error at each locus
err_df$R_ERR <- apply(err_df,1,function(x) length(which(x<g_error))/sn)
#Histogram of realized error
hist(err_df$R_ERR, main = "Realized Genotyping Error Across all Loci", xlab = "Realized GT Error")
#Append realized error distribution to known GT data set
known <- cbind.data.frame(known[,1:5],R_ERR = err_df[,(sn+1)], parents[,6:ncol(parents)], known[,8:ncol(known)])
#Output known GT data for later
write.csv(known,"known_GT2.csv",row.names = F)
nparents <- length(parents)-7 #Number of parents
write.table(cbind("kd","known_GT2.csv"),"user_input.txt", col.names = F, sep="\t", append = TRUE, row.names = F)
write.table(cbind("fn",nparents),"user_input.txt", col.names = F, sep="\t", append = TRUE, row.names = F)
#Induce missing data
sim$F_MISS_r <- (round(sim$F_MISS,(nchar(sn)-1))*sn) #Create an integer for number of genotypes to set to missing (rounded to whole number based on original sample number)
sim <- subset(sim,sim$F_MISS_r<sn) #Remove markers with completely missing data, will be imputed back in later by MACH (also reduces time to run for next step)
#Randomly pick n number of samples based on the rounded missing proportion for each marker and set to missing
look <- split(as.matrix(sim[,8:(ncol(sim))]), row(as.matrix(sim[,8:(ncol(sim))])))
for (i in 1:nrow(sim)){
look[[i]][sample(sn,look[[i]][sn+1])] <- "./."
}
look <- as.data.frame(do.call("rbind",look))
names <- (colnames(sim[1:ncol(sim)-1]))
sim <- cbind(sim[,1:7],look[,1:sn])
colnames(sim) <- names
#Format alleles in required format for MACH (A,C,T,G) for parents and population
#parents
look <- split(as.matrix(parents[,6:(ncol(parents))]), row(as.matrix(parents[,6:(ncol(parents))])))
for (i in 1:nrow(parents)){
look[[i]] <- gsub("0/0", (paste0(as.character(look[[i]][1]), "/", as.character(look[[i]][1]))), look[[i]])
look[[i]] <- gsub("1/1", (paste0(as.character(look[[i]][2]), "/", as.character(look[[i]][2]))), look[[i]])
}
look <- as.data.frame(do.call("rbind",look))
names <- colnames(parents)
parents <- cbind(parents[,1:5],look)
colnames(parents) <- names
#Population
look <- split(as.matrix(sim[,6:(ncol(sim))]), row(as.matrix(sim[,6:(ncol(sim))])))
for (i in 1:nrow(sim)){
look[[i]] <- gsub("0/0", (paste0(as.character(look[[i]][1]), "/", as.character(look[[i]][1]))), look[[i]])
look[[i]] <- gsub("1/1", (paste0(as.character(look[[i]][2]), "/", as.character(look[[i]][2]))), look[[i]])
look[[i]] <- gsub("0/1", (paste0(as.character(look[[i]][1]), "/", as.character(look[[i]][2]))), look[[i]])
look[[i]] <- gsub("1/0", (paste0(as.character(look[[i]][1]), "/", as.character(look[[i]][2]))), look[[i]])
}
look <- as.data.frame(do.call("rbind",look))
names <- (colnames(sim))
sim <- cbind(sim[,1:5],look)
colnames(sim) <- names
### Create MACH data, based on required format for MACH, see MACH help page
# Create MACH directory and folder for each chromosome within working directory
dir.create("MACH")
setwd("./MACH") #navigate to directory
# Create directory for each chromosome
for (c in seq(1,chrom)){
dir.create(paste0('chrom',c))
}
# Create parent_map, which lists chrom and pos of parent data, population.dat, which lists the overlapping markers between parents and population
parents_map <- parents[,1:3]
parents_map <- parents_map[order(as.numeric(parents_map$snpID)),]
# Subset all_data to keep snpID, Chrom, Pos, and samples
all_data2 <- sim[,-c(4:7)] #Removes cM, F_MISS, REF, ALT
parents_map[,1] <- sub("^","M SNP",parents_map[,1]) #Add "M_SNP" in front of marker ids in population (required format for MACH)
#This is the set of overlapping markers between the parent set and the population
population.dat <- parents_map[which(interaction(parents_map$CHROM, parents_map$POS) %in%
interaction(all_data2$CHROM,all_data2$POS)),1:2]
# Create .dat files for each chromosome from population.dat
for(c in seq(1,chrom)){
setwd(paste0(wd,"/MACH/chrom",c))
write.table(population.dat[population.dat$CHROM == c,][1], file = paste0("progeny_chrom_",c,".dat"),
quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
}
# Create .PED file (genotype data on specific chromosome for population)
all_data2 <- all_data2[with(all_data2,order(as.numeric(snpID))),] #Make sure order of markers is maintained
for (c in seq(1,chrom)){
population.ped <- as.data.frame(t(all_data2[all_data2$CHROM == c,][,-c(1:3)]))
population.ped <- data.frame(fID=popID, iID=rownames(population.ped), pID=0, mID=0, sex=0, population.ped) #Required IDs for MACH
setwd(paste0(wd,"/MACH/chrom",c))
write.table(population.ped, file = paste0("progeny_chrom_",c,".PED"),
quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
}
# Create parents_chrom_X.snps file (list of parent SNP#'s on specfic chromosome)
for(c in seq(1,chrom)){
parents.snps <- data.frame(snpID=substring(parents_map[parents_map$CHROM == c,][,1],3))
setwd(paste0(wd,"/MACH/chrom",c))
write.table(parents.snps, file = paste0("parents_chrom_",c,".snps"), quote = FALSE,
sep = "\t", row.names = FALSE,col.names = FALSE)
}
# Create parents_chrom_X.haplos file (parent haplotype data on specific chromosome)
for(c in seq(1,chrom)){
parents.chrom.haps <- as.data.frame(t(parents[parents$CHROM == c,c(-1:-7)])) #transpose data and remove snpID, CHROM, and POS columns
rownames(parents.chrom.haps) <- paste0(' ',rownames(parents.chrom.haps),' (',c(1:length(8:ncol(parents))),')') #Set row names as parent IDs
parents.chrom.haps <- as.data.frame(parents.chrom.haps[rep(1:nrow(parents.chrom.haps), each=2),]) #Creates two rows for each parent (each homolog)
parents.chrom.haps <- data.frame("fID" = rownames(parents.chrom.haps),parents.chrom.haps, row.names = NULL)
parents.chrom.haps <- cbind.data.frame('index' = rownames(parents.chrom.haps),
"fID" =gsub('\\.1',"", parents.chrom.haps$fID),"HapID" = c("HAP1","HAP2"),
parents.chrom.haps[2:ncol(parents.chrom.haps)]) #creates and formats dataframe
# Removing the second allele from first haplotype and first allele from second haplotype
parents.chrom.haps <- rbind.data.frame(lapply(parents.chrom.haps[c(TRUE,FALSE),], gsub, pattern='/[A-Z]', replacement=''),
lapply(parents.chrom.haps[c(FALSE,TRUE),], gsub, pattern='[A-Z]/', replacement=''))
# Reordering into correct order
parents.chrom.haps <- parents.chrom.haps[order(as.numeric(parents.chrom.haps$index)),]
setwd(paste0(wd,"/MACH/chrom",c))
write.table(parents.chrom.haps[,-1],file = paste0('parents_chrom_',c,".haplos"),quote = FALSE, sep = '\t',
row.names = FALSE,col.names = FALSE)
}