-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathpipeline.make
711 lines (551 loc) · 28.6 KB
/
pipeline.make
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
SHELL=/bin/bash -o pipefail
##################################################
#
# Step 0. Preamble: set up paths, variables and
# install software.
#
##################################################
# do not leave failed files around
.DELETE_ON_ERROR:
# do not delete intermediate files
.SECONDARY:
# Parameters to control execution
THREADS=4
# Path to this file
SCRIPT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
# Install samtools
samtools.version:
git clone --recursive https://github.com/samtools/htslib.git
cd htslib; git checkout 1.2.1; make
git clone --recursive https://github.com/samtools/samtools.git
cd samtools; git checkout 1.2; make
-cd samtools; git log | head -1 > ../$@
# Install bwa
bwa.version:
git clone https://github.com/lh3/bwa.git
cd bwa; make
-cd bwa; git log | head -1 > ../$@
# Install python libs
pythonlibs.version:
pip install biopython >> $@
pip install Cython >> $@
pip install numpy >> $@
-git clone https://github.com/arq5x/poretools.git
pip install -r poretools/requirements.txt
cd poretools && python setup.py install
pip freeze >> $@
# Install bedtools
bedtools.version:
git clone https://github.com/arq5x/bedtools2.git bedtools
cd bedtools; git checkout v2.26.0; make
-cd bedtools; git log | head -1 > ../$@
# Install nanopolish, automatically downloading libhdf5
nanopolish.version:
git clone --recursive https://github.com/jts/nanopolish.git
cd nanopolish; git checkout 4533516; make
-cd nanopolish; git log | head -1 > ../$@
#
PORETOOLS=poretools
#
# Resources, references, etc
#
HUMAN_REFERENCE=GRCh38.primary_assembly.genome.fa
ECOLI_REFERENCE=ecoli_k12.fasta
BISULFITE_BED=ENCFF279HCL.bed
DATA_ROOT=../data
#
# Output targets
#
.DEFAULT_GOAL := all
all: all-except-cancer-normal all-cancer-normal
all-except-cancer-normal: all-nucleotide all-cpg all-results-plots
all-software: pythonlibs.version bedtools.version nanopolish.version samtools.version bwa.version
all-download: all-software $(BISULFITE_BED) $(HUMAN_REFERENCE) $(ECOLI_REFERENCE) MCF10A.cyto.txt.gz MDAMB231.cyto.txt.gz
# Function from http://stackoverflow.com/questions/6145041/makefile-filter-out-strings-containing-a-character
# Removes all strings containing the argument from the input list
FILTER_OUT = $(foreach v,$(2),$(if $(findstring $(1),$(v)),,$(v)))
# Only keeps strings containing the argument
FILTER_IN = $(foreach v,$(2),$(if $(findstring $(1),$(v)),$(v),))
# Build a complete list of NA12878/Ecoli data sets
# For E.coli this is just all of the ecoli* runs in the data directory
# For NA12878 we have to manually add in the merged datasets as well
all_NA12878=$(notdir $(patsubst %.fast5,%.fasta,$(wildcard $(DATA_ROOT)/NA12878.*.fast5))) \
NA12878.native.merged.fasta NA12878.native.r9.merged.fasta
all_ecoli=$(notdir $(patsubst %.fast5,%.fasta,$(wildcard $(DATA_ROOT)/ecoli*.fast5)))
all_cancer=mcf10a.rr1.timp.031116.fasta mcf10a.rr2.timp.031116.fasta mcf10a.merged.fasta \
mdamb231.rr1.timp.031316.fasta mdamb231.rr2.timp.031316.fasta mdamb231.merged.fasta
# Rules to generate all trained models
all-nucleotide: $(patsubst %.fasta,%.alphabet_nucleotide.fofn,$(all_ecoli))
all-cpg: $(patsubst %.fasta,%.alphabet_cpg.fofn,$(all_ecoli))
# Rules to generate all output reports
all-island-plots: $(addprefix results/,$(patsubst %.fasta,%.cpg_island_plot.pdf,$(all_NA12878)))
all-training-plots: results/figure.emissions.pdf results/figure.shift_by_position.pdf
all-accuracy-plots: results/figure.accuracy_roc.pdf results/figure.accuracy_by_threshold.pdf results/figure.site_likelihood_distribution.pdf
all-TSS-plots: results/figure.methylation_by_TSS_distance.pdf results/figure.methylation_by_TSS_distance_by_chromosome.pdf
all-results-plots: all-accuracy-plots all-island-plots all-TSS-plots all-training-plots results/all.training.tables.tex results/accuracy.table.R7.tex results/accuracy.table.R9.tex
all-cancer-normal: results/mcf10a.bsnanocorr.pdf results/mdamb231.bsnanocorr.pdf results/cn.region.plot.pdf results/cn.strand.plot.pdf
##################################################
#
# Step 1. Prepare input data
#
##################################################
#
# Define variables for each data set
#
ECOLI_K12_PCR_RUN1_DATA=ecoli_k12.pcr.loman.250915.fasta
# R7 data sets
ECOLI_ER2925_PCR_RUN1_DATA=ecoli_er2925.pcr.timp.113015.fasta
ECOLI_ER2925_PCR_RUN2_DATA=ecoli_er2925.pcr.timp.021216.fasta
ECOLI_ER2925_PCR_MSSSI_RUN1_DATA=ecoli_er2925.pcr_MSssI.timp.113015.fasta
ECOLI_ER2925_PCR_MSSSI_RUN2_DATA=ecoli_er2925.pcr_MSssI.timp.021216.fasta
HUMAN_NA12878_NATIVE_RUN1_DATA=NA12878.native.timp.093015.fasta
HUMAN_NA12878_NATIVE_RUN2_DATA=NA12878.native.simpson.101515.fasta
HUMAN_NA12878_NATIVE_RUN3_DATA=NA12878.native.simpson.103015.fasta
HUMAN_NA12878_NATIVE_MERGED_DATA=NA12878.native.merged.fasta
HUMAN_NA12878_PCR_DATA=NA12878.pcr.simpson.021616.fasta
HUMAN_NA12878_PCR_MSSSI_DATA=NA12878.pcr_MSssI.simpson.021016.fasta
# Cancer pair
HUMAN_MCF10A_RR1_DATA=mcf10a.rr1.timp.031116.fasta
HUMAN_MCF10A_RR2_DATA=mcf10a.rr2.timp.031116.fasta
HUMAN_MCF10A_MERGED_DATA=mcf10a.merged.fasta
HUMAN_MDAMB231_RR1_DATA=mdamb231.rr1.timp.031316.fasta
HUMAN_MDAMB231_RR2_DATA=mdamb231.rr2.timp.031316.fasta
HUMAN_MDAMB231_MERGED_DATA=mdamb231.merged.fasta
# R9 data sets
ECOLI_ER2925_PCR_R9_RUN1_DATA=ecoli_er2925.pcr.r9.timp.061716.fasta
ECOLI_ER2925_PCR_MSSSI_R9_RUN1_DATA=ecoli_er2925.pcr_MSssI.r9.timp.061716.fasta
HUMAN_NA12878_NATIVE_R9_RUN1_DATA=NA12878.native.r9.simpson.062416.fasta
HUMAN_NA12878_NATIVE_R9_RUN2_DATA=NA12878.native.r9.timp.062216.fasta
HUMAN_NA12878_NATIVE_R9_RUN3_DATA=NA12878.native.r9.timp.062416.fasta
HUMAN_NA12878_NATIVE_R9_MERGED_DATA=NA12878.native.r9.merged.fasta
HUMAN_NA12878_PCR_R9_DATA=NA12878.pcr.r9.timp.081016.fasta
HUMAN_NA12878_PCR_R9_MSSSI_DATA=NA12878.pcr_MSssI.r9.timp.081016.fasta
# For each data set that we use, define a variable containing its reference
# E.coli samples
$(foreach file,$(all_ecoli),$(eval $(file)_REFERENCE=$(ECOLI_REFERENCE)))
# NA12878 samples
$(foreach file,$(all_NA12878),$(eval $(file)_REFERENCE=$(HUMAN_REFERENCE)))
# Cancer samples
$(foreach file,$(all_cancer),$(eval $(file)_REFERENCE=$(HUMAN_REFERENCE)))
#
# Reference and region file used for model training
#
TRAINING_REFERENCE=$(ECOLI_REFERENCE)
TRAINING_REGION="Chromosome:50000-3250000"
# for debugging
METHYLTRAIN_EXTRA_OPTS =
# the name of the output model that we use for the human analysis
TRAINED_R7_MODEL_ROOT=ecoli_er2925.pcr_MSssI.timp.021216.alphabet_cpg
TRAINED_R9_MODEL_ROOT=ecoli_er2925.pcr_MSssI.r9.timp.061716.alphabet_cpg
# The log-likelihood threshold required to make a call
CALL_THRESHOLD=2.5
# The minimum mapping quality to use a read
MIN_MAPQ=20
#
# Download data
#
# Reference genomes
$(ECOLI_REFERENCE):
curl -L ftp://ftp.ensemblgenomes.org/pub/release-29/bacteria//fasta/bacteria_0_collection/escherichia_coli_str_k_12_substr_mg1655/dna/Escherichia_coli_str_k_12_substr_mg1655.GCA_000005845.2.29.dna.genome.fa.gz | gunzip - > $@
$(HUMAN_REFERENCE):
curl -L ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/GRCh38.primary_assembly.genome.fa.gz | gunzip - > $@
#
# Rules for generating fasta files from FAST5
#
# Pass reads
%.pass.fasta: $(DATA_ROOT)/%.fast5/pass
$(PORETOOLS) fasta --type 2D $< > $@
# Fail reads
%.fail.fasta: $(DATA_ROOT)/%.fast5/fail
$(PORETOOLS) fasta --type 2D $< > $@
# all reads
%.fasta: %.pass.fasta %.fail.fasta
cat $^ > $@
# all human runs
$(HUMAN_NA12878_NATIVE_MERGED_DATA): NA12878.native.timp.093015.fasta \
NA12878.native.simpson.101515.fasta \
NA12878.native.simpson.103015.fasta
cat $^ > $@
$(HUMAN_MCF10A_MERGED_DATA): mcf10a.rr1.timp.031116.fasta \
mcf10a.rr2.timp.031116.fasta
cat $^ > $@
$(HUMAN_MDAMB231_MERGED_DATA): mdamb231.rr1.timp.031316.fasta \
mdamb231.rr2.timp.031316.fasta
cat $^ > $@
$(HUMAN_NA12878_NATIVE_R9_MERGED_DATA): $(HUMAN_NA12878_NATIVE_R9_RUN1_DATA) \
$(HUMAN_NA12878_NATIVE_R9_RUN2_DATA) \
$(HUMAN_NA12878_NATIVE_R9_RUN3_DATA)
cat $^ > $@
##################################################
#
# Step 2. Align each data set to a reference
#
##################################################
# Index a reference genome with BWA
%.fasta.bwt: %.fasta bwa.version
bwa/bwa index $<
%.fa.bwt: %.fa bwa.version
bwa/bwa index $<
# We use secondary expansion to construct the name of the variable
# containing the reference genome for this sample
.SECONDEXPANSION:
%.sorted.bam: %.fasta $$(%.fasta_REFERENCE) $$(%.fasta_REFERENCE).bwt bwa.version samtools.version
bwa/bwa mem -t $(THREADS) -x ont2d $($*.fasta_REFERENCE) $< |\
samtools/samtools view -q $(MIN_MAPQ) -Sb - |\
samtools/samtools sort -f - $@
%.bam.bai: %.bam
samtools/samtools index $<
%.eventalign.summary %.eventalign: %.sorted.bam %.sorted.bam.bai
nanopolish/nanopolish eventalign --summary $*.eventalign.summary \
--scale-events \
--print-read-names \
-b $*.sorted.bam \
-r $*.fasta \
-g $($*.fasta_REFERENCE) \
-t $(THREADS) > $*.eventalign
##################################################
#
# Step 3. Train models
#
##################################################
# initialize the R7 models
t.006.ont.model: $(SCRIPT_DIR)/models/r7.3_e6_70bps_6mer_template_median68pA.model
$(SCRIPT_DIR)/initialize_model.sh $^ template t.006 SQK006 > $@
c.p1.006.ont.model: $(SCRIPT_DIR)/models/r7.3_e6_70bps_6mer_complement_median68pA_pop1.model
$(SCRIPT_DIR)/initialize_model.sh $^ complement.pop1 c.p1.006 SQK006 > $@
c.p2.006.ont.model: $(SCRIPT_DIR)/models/r7.3_e6_70bps_6mer_complement_median68pA_pop2.model
$(SCRIPT_DIR)/initialize_model.sh $^ complement.pop2 c.p2.006 SQK006 > $@
# initalize R9 models
MODEL_REV=rev2
t.007.ont.model: $(SCRIPT_DIR)/models/template_median68pA.r9.$(MODEL_REV).model
$(SCRIPT_DIR)/initialize_model.sh $^ template t.007 SQK007 > $@
c.p1.007.ont.model: $(SCRIPT_DIR)/models/complement_median68pA_pop1.r9.$(MODEL_REV).model
$(SCRIPT_DIR)/initialize_model.sh $^ complement.pop1 c.p1.007 SQK007 > $@
c.p2.007.ont.model: $(SCRIPT_DIR)/models/complement_median68pA_pop2.r9.$(MODEL_REV).model
$(SCRIPT_DIR)/initialize_model.sh $^ complement.pop2 c.p2.007 SQK007 > $@
# Make a 5-mer model from the input 6-mer model (used by nanopolish to calculate scalings)
%.5mer.model: %.model
python nanopolish/scripts/dropmodel.py --type base -i $^ > $@
# Make a file-of-filenames from a model set
# R7 models
%.R7.fofn: t.006.%.model c.p1.006.%.model c.p2.006.%.model
echo $^ | tr " " "\n" > $@
# R9 models
%.R9.fofn: t.007.%.model c.p1.007.%.model c.p2.007.%.model t.007.%.5mer.model c.p1.007.%.5mer.model c.p2.007.%.5mer.model
echo $^ | tr " " "\n" > $@
# this code generates a training rule for a data set/alphabet pair
define generate-training-rules
$(eval DATASET=$(basename $1))
$(eval ALPHABET=$2)
$(eval PORE_VERSION=$3)
$(eval PREFIX=$(DATASET).alphabet_$(ALPHABET))
# Training rule
$(PREFIX).fofn: $(DATASET).fasta $(DATASET).sorted.bam $(DATASET).sorted.bam.bai $(TRAINING_REFERENCE).alphabet_$(ALPHABET) ont.alphabet_$(ALPHABET).$(PORE_VERSION).fofn nanopolish.version
nanopolish/nanopolish methyltrain -t $(THREADS) $(METHYLTRAIN_EXTRA_OPTS) \
--train-kmers all \
--out-fofn $$@ \
--out-suffix .$(PREFIX).model \
-m ont.alphabet_$(ALPHABET).$(PORE_VERSION).fofn \
-b $(DATASET).sorted.bam \
-r $(DATASET).fasta \
-g $(TRAINING_REFERENCE).alphabet_$(ALPHABET) \
--filter-policy $(PORE_VERSION) \
$(TRAINING_REGION)
# Rules for additional files made by methyltrain
t.006.$(PREFIX).model: $(PREFIX).fofn
c.p1.006.$(PREFIX).model: $(PREFIX).fofn
c.p2.006.$(PREFIX).model: $(PREFIX).fofn
# Bit of a hack, should only specify 006/007 models depending on the pore
t.007.$(PREFIX).model: $(PREFIX).fofn
c.p1.007.$(PREFIX).model: $(PREFIX).fofn
c.p2.007.$(PREFIX).model: $(PREFIX).fofn
methyltrain.$(PREFIX).model.summary: $(PREFIX).fofn
methyltrain.$(PREFIX).model.round4.events.tsv: $(PREFIX).fofn
endef
#
# 3a. Train over a normal nucleotide (ACGT) alphabet
#
# Special case, the expanded reference for the alphabet is just the reference
$(TRAINING_REFERENCE).alphabet_nucleotide: $(TRAINING_REFERENCE)
ln -s $< $@
# Special case, the expanded model for the nucleotide alphabet is just the ONT model
%.alphabet_nucleotide.model: %.model
ln -s $< $@
# make the rules using the generation function
all_ecoli_R7=$(call FILTER_OUT,r9,$(all_ecoli))
all_ecoli_R9=$(call FILTER_IN,r9,$(all_ecoli))
$(foreach file,$(all_ecoli_R7),$(eval $(call generate-training-rules,$(file),nucleotide,R7)))
$(foreach file,$(all_ecoli_R9),$(eval $(call generate-training-rules,$(file),nucleotide,R9)))
#
# 3b. Train over a CpG alphabet
#
# Expand the alphabet to include 5-mC k-mers
%.alphabet_cpg.model: %.model
python $(SCRIPT_DIR)/expand_model_alphabet.py --alphabet cpg $< > $@
# Convert all CG dinucleotides of the reference genome to MG
$(TRAINING_REFERENCE).alphabet_cpg: $(TRAINING_REFERENCE) pythonlibs.version
python $(SCRIPT_DIR)/methylate_reference.py --recognition cpg $< > $@
# make the rules using the generation function
$(foreach file,$(all_ecoli_R7),$(eval $(call generate-training-rules,$(file),cpg,R7)))
$(foreach file,$(all_ecoli_R9),$(eval $(call generate-training-rules,$(file),cpg,R9)))
#
# 3c. Make training plots
#
# Emissions figure, three panels showing baseline, strong methylation signal, weak/no methylation signal
# R7
PANEL_A_R7_SET=ecoli_er2925.pcr.timp.021216.alphabet_nucleotide
PANEL_BC_R7_SET1=ecoli_er2925.pcr.timp.021216.alphabet_cpg
PANEL_BC_R7_SET2=ecoli_er2925.pcr_MSssI.timp.021216.alphabet_cpg
# R9
PANEL_A_R9_SET=ecoli_er2925.pcr.r9.timp.061716.alphabet_nucleotide
PANEL_BC_R9_SET1=ecoli_er2925.pcr.r9.timp.061716.alphabet_cpg
PANEL_BC_R9_SET2=ecoli_er2925.pcr_MSssI.r9.timp.061716.alphabet_cpg
PANEL_A_KMER="AGGTAG"
PANEL_B_KMER="AGGTMG"
PANEL_C_KMER="TMGAGT"
results/figure.emissions.pdf: methyltrain.$(PANEL_A_R7_SET).panelA.tsv \
methyltrain.$(PANEL_BC_R7_SET1).panelB.tsv \
methyltrain.$(PANEL_BC_R7_SET2).panelB.tsv \
methyltrain.$(PANEL_BC_R7_SET1).panelC.tsv \
methyltrain.$(PANEL_BC_R7_SET2).panelC.tsv \
methyltrain.$(PANEL_A_R9_SET).panelA.tsv \
methyltrain.$(PANEL_BC_R9_SET1).panelB.tsv \
methyltrain.$(PANEL_BC_R9_SET2).panelB.tsv \
methyltrain.$(PANEL_BC_R9_SET1).panelC.tsv \
methyltrain.$(PANEL_BC_R9_SET2).panelC.tsv
Rscript $(SCRIPT_DIR)/methylation_plots.R make_emissions_figure $@ $^
%.panelA.tsv: %.model.round4.events.tsv
cat $^ | awk 'NR == 1 || ($$1 ~ "t.00" && $$2 == $(PANEL_A_KMER))' > $@
%.panelB.tsv: %.model.round4.events.tsv
cat $^ | awk 'NR == 1 || ($$1 ~ "t.00" && $$2 == $(PANEL_B_KMER))' > $@
%.panelC.tsv: %.model.round4.events.tsv
cat $^ | awk 'NR == 1 || ($$1 ~ "t.00" && $$2 == $(PANEL_C_KMER))' > $@
# Mean-shift by position figure
%.alphabet_cpg.model.summary.delta: %.alphabet_cpg.model.summary
python $(SCRIPT_DIR)/generate_kmer_deltas.py --summary $^ --ont-fofn ont.alphabet_cpg.fofn > $@
results/figure.shift_by_position.pdf: methyltrain.$(TRAINED_R7_MODEL_ROOT).model.summary.delta methyltrain.$(TRAINED_R9_MODEL_ROOT).model.summary.delta
mkdir -p results
Rscript $(SCRIPT_DIR)/methylation_plots.R make_mean_shift_by_position_figure $@ $^
#
# 3d. Make training tables from the summary files
#
# Subset by treatment and add "methyltrain." to the start of every filename
mt_ecoli_pcr=$(addprefix methyltrain.,$(call FILTER_IN,.pcr.,$(all_ecoli)))
mt_ecoli_pcr_msssi=$(addprefix methyltrain.,$(call FILTER_IN,.pcr_MSssI.,$(all_ecoli)))
# PCR data
# nucleotide
results/pcr.nucleotide.training.table.tex: $(patsubst %.fasta,%.alphabet_nucleotide.model.summary,$(mt_ecoli_pcr))
mkdir -p results
python $(SCRIPT_DIR)/generate_training_table.py --treatment pcr --alphabet nucleotide $^ > $@
# cpg
results/pcr.cpg.training.table.tex: $(patsubst %.fasta,%.alphabet_cpg.model.summary,$(mt_ecoli_pcr))
mkdir -p results
python $(SCRIPT_DIR)/generate_training_table.py --treatment pcr --alphabet cpg $^ > $@
# PCR + M.SssI
# nucleotide
results/pcr_MSssI.nucleotide.training.table.tex: $(patsubst %.fasta,%.alphabet_nucleotide.model.summary,$(mt_ecoli_pcr_msssi))
mkdir -p results
python $(SCRIPT_DIR)/generate_training_table.py --treatment pcr_MSssI --alphabet nucleotide $^ > $@
# CpG
results/pcr_MSssI.cpg.training.table.tex: $(patsubst %.fasta,%.alphabet_cpg.model.summary,$(mt_ecoli_pcr_msssi))
mkdir -p results
python $(SCRIPT_DIR)/generate_training_table.py --treatment pcr_MSssI --alphabet cpg $^ > $@
# Merge all tables
results/all.training.tables.tex: results/pcr.nucleotide.training.table.tex \
results/pcr_MSssI.nucleotide.training.table.tex \
results/pcr.cpg.training.table.tex \
results/pcr_MSssI.cpg.training.table.tex
cat $^ > $@
##################################################
#
# Step 4. Human genome analysis
#
##################################################
TRAINED_R7_MODEL_FOFN=trained.R7.methylation.model.fofn
TRAINED_R9_MODEL_FOFN=trained.R9.methylation.model.fofn
$(TRAINED_R9_MODEL_FOFN): t.007.$(TRAINED_R9_MODEL_ROOT).model \
c.p1.007.$(TRAINED_R9_MODEL_ROOT).model \
c.p2.007.$(TRAINED_R9_MODEL_ROOT).model \
t.007.$(TRAINED_R9_MODEL_ROOT).5mer.model \
c.p1.007.$(TRAINED_R9_MODEL_ROOT).5mer.model \
c.p2.007.$(TRAINED_R9_MODEL_ROOT).5mer.model
echo $^ | tr " " "\n" > $@
$(TRAINED_R7_MODEL_FOFN): $(TRAINED_R7_MODEL_ROOT).fofn
ln -s $< $@
# Calculate methylation likelihood ratios at every CpG covered by nanopore reads
# We do this with a rule macro to switch the model depending on if the input data is R7 or R9
define generate-calling-rules
$(eval FASTA=$1)
$(eval PORE_VERSION=$2)
$(eval BAM=$(FASTA:.fasta=.sorted.bam))
$(BAM).methyltest.sites.bed: $(BAM) $(BAM).bai trained.$(PORE_VERSION).methylation.model.fofn
$(eval TMP_REF = $($(FASTA)_REFERENCE))
/usr/bin/time -v nanopolish/nanopolish methyltest -t $(THREADS) \
-m trained.$(PORE_VERSION).methylation.model.fofn \
-b $(BAM) \
-r $(FASTA) \
-g $(TMP_REF) 2> $(BAM).methlytest.stderr
$(BAM).methyltest.reads.tsv: $(BAM).methyltest.sites.bed
$(BAM).methyltest.strand.tsv: $(BAM).methyltest.sites.bed
endef
# make the rules using the generation function
all_NA12878_R7=$(call FILTER_OUT,r9,$(all_NA12878))
all_NA12878_R9=$(call FILTER_IN,r9,$(all_NA12878))
$(foreach file,$(all_NA12878_R7),$(eval $(call generate-calling-rules,$(file),R7)))
$(foreach file,$(all_NA12878_R9),$(eval $(call generate-calling-rules,$(file),R9)))
$(foreach file,$(all_cancer),$(eval $(call generate-calling-rules,$(file),R7)))
# Convert a site BED file into a tsv file for R
%.methyltest.sites.tsv: %.methyltest.sites.bed
cat $< | python $(SCRIPT_DIR)/annotated_bed_to_tsv.py > $@
# Download a bed file summarizing an NA12878 bisulfite experiment from ENCODE
$(BISULFITE_BED):
curl -L https://www.encodeproject.org/files/ENCFF279HCL/download/$(BISULFITE_BED).gz | gunzip - > $@
# Extract tsv for strand-based methylation plots
%.methyltest.phase.tsv: %.methyltest.sites.bed
python $(SCRIPT_DIR)/phase_extract.py -c $(CALL_THRESHOLD) -i $<
# Summarize methylation per cg
%.methyltest.cyto.txt: %.methyltest.phase.tsv
python $(SCRIPT_DIR)/per_cg_methylation.py -i $<
#
# CpG Island Methylation Results
#
CGI_FILE=$(SCRIPT_DIR)/annotations/cgisl_hg38.bed.gz
PROMOTER_FILE=$(SCRIPT_DIR)/annotations/gencode.v24.promoter.bed.gz
CGI_PROMOTER_BED=cpg_islands.promoter.bed
# Annotate the CpG islands with whether they are <= 2kb upstream of a gene
$(CGI_PROMOTER_BED): $(CGI_FILE) $(PROMOTOR_FILE) bedtools.version
bedtools/bin/bedtools map -o first -c 4 -a <(zcat $(CGI_FILE) | bedtools/bin/bedtools sort) \
-b <(zcat $(PROMOTER_FILE) | bedtools/bin/bedtools sort) | \
sed s/:_/=/ | awk '{ print $$1 "\t" $$2 "\t" $$3 "\t" $$4 ";Feature=" $$5 }' > $@
# Calculate a summary data for each CpG island from the bisulfite data
NA12878.bisulfite_score.cpg_islands: $(CGI_PROMOTER_BED) $(BISULFITE_BED) bedtools.version
bedtools/bin/bedtools intersect -wb -b $(CGI_PROMOTER_BED) -a $(BISULFITE_BED) | \
python $(SCRIPT_DIR)/calculate_methylation_at_cpg_islands.py -t bisulfite > $@
# Calculate a summary score for each CpG island from the ONT reads
%.ont_score.cpg_islands: $(CGI_PROMOTER_BED) %.sorted.bam.methyltest.sites.bed bedtools.version
bedtools/bin/bedtools intersect -wb -b $(CGI_PROMOTER_BED) \
-a $*.sorted.bam.methyltest.sites.bed | \
python $(SCRIPT_DIR)/calculate_methylation_at_cpg_islands.py -t ont -c $(CALL_THRESHOLD) > $@
results/%.cpg_island_plot.pdf: NA12878.bisulfite_score.cpg_islands %.ont_score.cpg_islands
mkdir -p results
Rscript $(SCRIPT_DIR)/methylation_plots.R human_cpg_island_plot $^ $@
#
# TSS results
#
TSS_FILE=$(SCRIPT_DIR)/annotations/gencode.v24.tss.bed.gz
# Calculate methylation as a function of distance from a TSS for the ONT data
%.methylated_sites.distance_to_TSS.bed: %.sorted.bam.methyltest.sites.bed bedtools.version $(TSS_FILE)
bedtools/bin/bedtools closest -D b -b <(zcat $(TSS_FILE) | bedtools/bin/bedtools sort)\
-a <(cat $*.sorted.bam.methyltest.sites.bed | bedtools/bin/bedtools sort) > $@
%.methylated_sites.distance_to_TSS.table: %.methylated_sites.distance_to_TSS.bed
python $(SCRIPT_DIR)/calculate_methylation_by_distance.py --type ont -c $(CALL_THRESHOLD) -i $^ > $@
# Calculate methylation as a function of distance for the bisulfite data
NA12878.bisulfite.distance_to_TSS.bed: $(BISULFITE_BED) bedtools.version $(TSS_FILE)
bedtools/bin/bedtools closest -D b -b <(zcat $(TSS_FILE) | bedtools/bin/bedtools sort)\
-a <(cat $(BISULFITE_BED) | bedtools/bin/bedtools sort) > $@
# Calculate methylation as a function of distance for the bisulfite data
%.bisulfite.distance_to_TSS.bed: %.bisulfite.bed bedtools.version $(TSS_FILE)
bedtools/bin/bedtools closest -D b -b <(zcat $(TSS_FILE) | bedtools/bin/bedtools sort)\
-a <(cat $< | bedtools/bin/bedtools sort) > $@
%.bisulfite.distance_to_TSS.table: %.bisulfite.distance_to_TSS.bed
python $(SCRIPT_DIR)/calculate_methylation_by_distance.py --type bisulfite -i $^ > $@
# Build the list of datasets to use in the TSS analysis
# The FILTER_IN rule for pcr intentially includes
# the MSssI data as well
TSS_FILES=$(call FILTER_IN,merged,$(all_NA12878)) \
$(call FILTER_IN,pcr,$(all_NA12878))
TSS_INPUT=$(TSS_FILES:.fasta=.methylated_sites.distance_to_TSS.table) NA12878.bisulfite.distance_to_TSS.table
results/figure.methylation_by_TSS_distance.pdf: $(TSS_INPUT)
mkdir -p results
Rscript $(SCRIPT_DIR)/methylation_plots.R TSS_distance_plot $^ $@
results/figure.methylation_by_TSS_distance_by_chromosome.pdf: $(TSS_INPUT)
mkdir -p results
Rscript $(SCRIPT_DIR)/methylation_plots.R TSS_distance_plot_by_chromosome $^ $@
#
# Site likelihood plot
#
results/figure.site_likelihood_distribution.pdf: NA12878.pcr.r9.timp.081016.sorted.bam.methyltest.sites.tsv \
NA12878.pcr_MSssI.r9.timp.081016.sorted.bam.methyltest.sites.tsv \
NA12878.native.r9.merged.sorted.bam.methyltest.sites.tsv \
NA12878.pcr.simpson.021616.sorted.bam.methyltest.sites.tsv \
NA12878.pcr_MSssI.simpson.021016.sorted.bam.methyltest.sites.tsv \
NA12878.native.merged.sorted.bam.methyltest.sites.tsv
mkdir -p results/
Rscript $(SCRIPT_DIR)/methylation_plots.R site_likelihood_distribution $^ $@
#
# Accuracy results
#
# Helper to run FILTER_IN and change the suffix
FILTER_IN_AND_MAKE_SITES = $(patsubst %.fasta,%.sorted.bam.methyltest.sites.bed,$(call FILTER_IN,$(1),$(2)))
NA12878_R7_PCR=$(call FILTER_IN_AND_MAKE_SITES,.pcr.,$(all_NA12878_R7))
NA12878_R7_PCR_MSSSI=$(call FILTER_IN_AND_MAKE_SITES,.pcr_MSssI,$(all_NA12878_R7))
NA12878_R9_PCR=$(call FILTER_IN_AND_MAKE_SITES,.pcr.,$(all_NA12878_R9))
NA12878_R9_PCR_MSSSI=$(call FILTER_IN_AND_MAKE_SITES,.pcr_MSssI,$(all_NA12878_R9))
accuracy.by_threshold.R7.tsv: $(NA12878_R7_PCR) $(NA12878_R7_PCR_MSSSI)
python $(SCRIPT_DIR)/calculate_call_accuracy.py --unmethylated $(NA12878_R7_PCR) \
--methylated $(NA12878_R7_PCR_MSSSI) \
--pore R7
accuracy.by_threshold.R9.tsv: $(NA12878_R9_PCR) $(NA12878_R9_PCR_MSSSI)
python $(SCRIPT_DIR)/calculate_call_accuracy.py --unmethylated $(NA12878_R9_PCR) \
--methylated $(NA12878_R9_PCR_MSSSI) \
--pore R9
accuracy.precision_recall.R7.tsv: accuracy.by_threshold.R7.tsv
accuracy.precision_recall.R9.tsv: accuracy.by_threshold.R9.tsv
accuracy.table.R7.tex: accuracy.by_threshold.R7.tsv
accuracy.table.R9.tex: accuracy.by_threshold.R9.tsv
results/accuracy.table.%.tex: accuracy.table.%.tex
mkdir -p results
cp $^ $@
results/figure.accuracy_roc.pdf: accuracy.precision_recall.R7.tsv accuracy.precision_recall.R9.tsv
mkdir -p results
Rscript $(SCRIPT_DIR)/methylation_plots.R call_accuracy_roc $^ $@
results/figure.accuracy_by_threshold.pdf: accuracy.by_threshold.R7.tsv accuracy.by_threshold.R9.tsv
mkdir -p results
Rscript $(SCRIPT_DIR)/methylation_plots.R call_accuracy_by_threshold $^ $@
results/accuracy_by_kmer.pdf: accuracy.by_kmer.tsv
mkdir -p results
Rscript $(SCRIPT_DIR)/methylation_plots.R call_accuracy_by_kmer $^ $@
#
# Cancer/Normal results
#
# D/l illumina bs data
MCF10A.cyto.txt.gz:
wget http://timplab.org/data/MCF10A.cyto.txt.gz
MDAMB231.cyto.txt.gz:
wget http://timplab.org/data/MDAMB231.cyto.txt.gz
# Correlation plot
results/mcf10a.bsnanocorr.pdf: MCF10A.cyto.txt.gz mcf10a.merged.sorted.bam.methyltest.cyto.txt
Rscript $(SCRIPT_DIR)/methylation_region_plot.R correlation $^ $@
results/mdamb231.bsnanocorr.pdf: MDAMB231.cyto.txt.gz mdamb231.merged.sorted.bam.methyltest.cyto.txt
Rscript $(SCRIPT_DIR)/methylation_region_plot.R correlation $^ $@
# Region Plot
# Thresholds for filtering regions to plot
COV_THRESHOLD=5
SIZE_MN_THRESHOLD=3000
SIZE_MX_THRESHOLD=7000
filt.isl.bed.gz: $(CGI_FILE) \
MCF10A.cyto.txt.gz \
MDAMB231.cyto.txt.gz
Rscript $(SCRIPT_DIR)/methylation_region_plot.R best_regions $^ $@
covd.regions.bed: bedtools.version \
mcf10a.merged.sorted.bam \
mdamb231.merged.sorted.bam \
filt.isl.bed.gz
bedtools/bin/bedtools genomecov -ibam mcf10a.merged.sorted.bam -bg | awk -v cov=${COV_THRESHOLD} '$$4 > cov' | bedtools/bin/bedtools merge -d 50 -i stdin >mcf10a.merged.sorted.bam.covd
bedtools/bin/bedtools genomecov -ibam mdamb231.merged.sorted.bam -bg | awk -v cov=${COV_THRESHOLD} '$$4 > cov' | bedtools/bin/bedtools merge -d 50 -i stdin >mdamb231.merged.sorted.bam.covd
bedtools/bin/bedtools intersect -a mcf10a.merged.sorted.bam.covd -b mdamb231.merged.sorted.bam.covd -u | \
bedtools/bin/bedtools intersect -a stdin -b filt.isl.bed.gz -u | awk -v mn=${SIZE_MN_THRESHOLD} -v mx=${SIZE_MX_THRESHOLD} '($$3 - $$2) > mn && ($$3 - $$2) < mx' | \
bedtools/bin/bedtools coverage -a stdin -b mcf10a.merged.sorted.bam mdamb231.merged.sorted.bam | sort -k 4rn >$@
results/cn.region.plot.pdf: covd.regions.bed \
mcf10a.merged.sorted.bam.methyltest.cyto.txt \
mdamb231.merged.sorted.bam.methyltest.cyto.txt \
MCF10A.cyto.txt.gz \
MDAMB231.cyto.txt.gz
mkdir -p results
Rscript $(SCRIPT_DIR)/methylation_region_plot.R meth_diff_plot $^ $@
# Strand plot
results/cn.strand.plot.pdf: covd.regions.bed \
mcf10a.merged.sorted.bam.methyltest.phase.tsv \
mdamb231.merged.sorted.bam.methyltest.phase.tsv
mkdir -p results
Rscript $(SCRIPT_DIR)/methylation_region_plot.R strand_plot $^ $@