-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathpindel_2_combined_vcf.pl
executable file
·423 lines (319 loc) · 15.9 KB
/
pindel_2_combined_vcf.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
#!/usr/bin/perl
# Copyright (c) 2014-2021 Genome Research Ltd
#
# Author: CASM/Cancer IT <cgphelp@sanger.ac.uk>
#
# This file is part of cgpPindel.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# 1. The usage of a range of years within a copyright statement contained within
# this distribution should be interpreted as being equivalent to a list of years
# including the first and last year specified and all consecutive years between
# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007-
# 2009, 2011-2012’ should be interpreted as being identical to a statement that
# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright
# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being
# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008,
# 2009, 2010, 2011, 2012’.
#
use strict;
use warnings FATAL => 'all';
use autodie qw(:all);
use Cwd qw(abs_path);
use File::Basename;
use Getopt::Long;
use Try::Tiny;
use Carp;
use Pod::Usage qw(pod2usage);
use FindBin qw($Bin);
use lib "$Bin/../lib";
use PerlIO::gzip;
use Bio::DB::HTS;
use Sanger::CGP::Pindel::OutputGen::CombinedRecordGenerator;
use Sanger::CGP::Pindel::OutputGen::VcfConverter;
use Sanger::CGP::Pindel::OutputGen::BamUtil;
use Sanger::CGP::Vcf::Contig;
use Sanger::CGP::Vcf::Sample;
use Sanger::CGP::Vcf::VcfProcessLog;
use Sanger::CGP::Vcf::OutputGen::SequentialIdGenerator;
use Sanger::CGP::Vcf::OutputGen::UuIdGenerator;
{
my $opts = setup();
my $output;
my $output_location = '';
my $current_path = '';
my @file_paths = ();
@file_paths = @{$opts->{input}} if($opts->{input});
if($opts->{inputdir}){
opendir(my $dh, $opts->{inputdir}) or die "Cannot open |".$opts->{inputdir}."| for reading: $!";
push(@file_paths, map {join('/',$opts->{inputdir},$_)} grep {m/.*_((D)|(SI))$/} readdir $dh);
closedir $dh;
}
my $wt_sam = Bio::DB::HTS->new(-bam => $opts->{wt}, -fasta => $opts->{r});
my $mt_sam = Bio::DB::HTS->new(-bam => $opts->{mt}, -fasta => $opts->{r});
## combine the header text to get a combined list of contigs.
my $contigs = Sanger::CGP::Pindel::OutputGen::BamUtil::parse_contigs($wt_sam->header->text . $mt_sam->header->text, $opts->{sp}, $opts->{as});
my $wt_samples = Sanger::CGP::Pindel::OutputGen::BamUtil::parse_samples($wt_sam->header->text,$opts->{wts},$opts->{wtp},$opts->{wta},$opts->{a},'Wild type');
my $mt_samples = Sanger::CGP::Pindel::OutputGen::BamUtil::parse_samples($mt_sam->header->text,$opts->{mts},$opts->{mtp},$opts->{mta},$opts->{a},'Mutant');
## make sure we have the correct number of samples in the bam files.
die "No samples found in wild type bam file." if(scalar values %$wt_samples == 0);
die "No samples found in mutant type bam file." if(scalar values %$mt_samples == 0);
die "Multiple samples found in wild type bam file." if(scalar values %$wt_samples > 1);
die "Multiple samples found in mutant type bam file." if(scalar values %$mt_samples > 1);
my $wt_sample = (values(%$wt_samples))[0];
my $mt_sample = (values(%$mt_samples))[0];
my $fai = Bio::DB::HTS::Fai->load($opts->{r});
my $wt_out_sam_path;
my $mt_out_sam_path;
my $wt_out_sam_fh;
my $mt_out_sam_fh;
if(defined $opts->{'samoutput'}){
my $base_path = $opts->{'samoutput'};
$wt_out_sam_path = $base_path . '_wt.sam.gz';
$mt_out_sam_path = $base_path . '_mt.sam.gz';
open $wt_out_sam_fh, '>:gzip', $wt_out_sam_path or die "Cannot open |$wt_out_sam_path| for writing: $!";
#open($wt_out_sam_fh, ">", $wt_out_sam_path) or die "Cannot open |$wt_out_sam_path| for writing: $!";
print $wt_out_sam_fh Sanger::CGP::Pindel::OutputGen::BamUtil::pindel_header($opts->{'wt'}, 'wt', $opts->{'pp'}, $opts->{'cmd'}, $opts->{'sp'}, $opts->{'as'});
print $wt_out_sam_fh "\n";
open $mt_out_sam_fh, '>:gzip', $mt_out_sam_path or die "Cannot open |$wt_out_sam_path| for writing: $!";
#open($mt_out_sam_fh, ">", $mt_out_sam_path) or die "Cannot open |$mt_out_sam_path| for writing: $!";
print $mt_out_sam_fh Sanger::CGP::Pindel::OutputGen::BamUtil::pindel_header($opts->{'mt'}, 'mt', $opts->{'pp'}, $opts->{'cmd'}, $opts->{'sp'}, $opts->{'as'});
print $mt_out_sam_fh "\n";
}
my $record_converter = new Sanger::CGP::Pindel::OutputGen::VcfConverter(
-contigs => [values %$contigs]
);
my $id_gen;
if(defined $opts->{'g'}){
$id_gen = new Sanger::CGP::Vcf::OutputGen::SequentialIdGenerator(-start => $opts->{'g'});
}else{
$id_gen = new Sanger::CGP::Vcf::OutputGen::UuIdGenerator();
}
try{
if($opts->{'output'}){
open($output, '>', $opts->{'output'}) or die "Cannot open |".$opts->{'output'}."| for reading: $!";
$output_location = $opts->{'output'};
}else{
$output = \*STDOUT;
$output_location = 'STDOUT';
}
## loop though all the pindel files for the given locations.
if(scalar @file_paths){
my $print_header = 1;
foreach $current_path (@file_paths){
open(my $fh, "<", $current_path) or croak "Cannot open |$current_path| for reading: $!";
_process_fh($fh, $output, $print_header, $wt_out_sam_fh, $mt_out_sam_fh, $opts, $record_converter, $fai, $wt_sam, $mt_sam, $wt_sample, $mt_sample, $id_gen, $opts->{r});
close($fh) or croak "Unable to close |$current_path| for reading: $!";
$print_header = 0;
}
}else{
$current_path = 'STDIN';
_process_fh(\*STDIN, $output, 1, $wt_out_sam_fh, $mt_out_sam_fh, $opts, $record_converter, $fai, $wt_sam, $mt_sam, $wt_sample, $mt_sample, $id_gen, $opts->{r});
}
}catch{
die 'Error! Reading: |'.$current_path."| Writing to: |$output_location|\n$_";
}finally{
close $output or die "Unable to close |".$opts->{'output'}."|: $!" if($opts->{'output'});
close $wt_out_sam_fh or die "Unable to close $wt_out_sam_path|: $!" if(defined $wt_out_sam_fh);
close $mt_out_sam_fh or die "Unable to close $mt_out_sam_path|: $!" if(defined $mt_out_sam_fh);
};
}
sub _process_fh{
my($fh, $output, $header, $wt_out_sam_fh, $mt_out_sam_fh, $opts, $record_converter, $fai, $wt_sam, $mt_sam, $wt_sample, $mt_sample, $id_gen, $ref) = @_;
my $record_generator = new Sanger::CGP::Pindel::OutputGen::CombinedRecordGenerator(
-wt_sam => $wt_sam,
-mt_sam => $mt_sam,
-mutant_sample_name => $mt_sample->name,
-fai => $fai,
-fh => $fh
);
if($header){
## write the header... we need to test if S2 is preant first...
my $source = basename($0). '_v'. Sanger::CGP::Pindel->VERSION;
my $include_s2;
if(defined($record_generator->version)){
$include_s2 = $record_generator->version eq 'v01' ? 1 : 0;
}else{
$include_s2 = 1;
}
my @process_logs = ();
push @process_logs, new Sanger::CGP::Vcf::VcfProcessLog(
-input_vcf_source => 'Pindel',
-input_vcf_ver => $record_generator->version,
);
push @process_logs, new Sanger::CGP::Vcf::VcfProcessLog(
-input_vcf_source => basename($0),
-input_vcf_ver => Sanger::CGP::Pindel::OutputGen->VERSION,
-input_vcf_param => $opts,
);
print $output $record_converter->gen_header($wt_sample,$mt_sample, \@process_logs, $include_s2, $ref, $source) or croak("Unable to write VCF header: $!");
}
my ($active_sam_fh, $sample, $strand);
while(my $record = $record_generator->next_record){
next if($opts->{'s'} && ($record->p_mt_pos + $record->p_mt_neg) < 3);
$record->id($id_gen->next);
## write the record
print $output $record_converter->gen_record($record) or croak("Unable to write VCF record: $!") ;
## only write to sam files if they have been defined.
## these can be used for viewing in g/jbrowse.
if($mt_out_sam_fh){
foreach $sample (keys %{$record->reads}){
if($sample eq $mt_sample->name) {
$active_sam_fh = $mt_out_sam_fh;
}
elsif($sample eq $wt_sample->name) {
$active_sam_fh = $wt_out_sam_fh;
}
else {
die "Samples in pindel result files don't match BAM files\n";
}
for $strand ('+','-'){
foreach my $read_arr (@{$record->reads->{$sample}->{$strand}}){
print $active_sam_fh join("\t",@$read_arr)."\n" or die "Unable to write sam line: $!";
}
}
}
}
}
}
sub setup{
my %opts;
$opts{'cmd'} = join " ", $0, @ARGV;
my @random_args;
GetOptions( 'h|help' => \$opts{'h'},
'm|man' => \$opts{'m'},
'v|version' => \$opts{'v'},
'i|input=s@' => \$opts{'input'},
'd|inputdir=s' => \$opts{'inputdir'},
'so|samoutput=s' => \$opts{'samoutput'},
'o|output=s' => \$opts{'output'},
'r|ref=s' => \$opts{'r'},
'wt|wtbam=s' => \$opts{'wt'},
'mt|mtbam=s' => \$opts{'mt'},
'wts|wtstudy=s' => \$opts{'wts'},
'mts|mtstudy=s' => \$opts{'mts'},
'wtp|wtprot=s' => \$opts{'wtp'},
'mtp|mtprot=s' => \$opts{'mtp'},
'wta|wtacc=s' => \$opts{'wta'},
'mta|mtacc=s' => \$opts{'mta'},
'as|assembly=s' => \$opts{'as'},
'sp|species=s{0,}' => \@{$opts{'sp'}},
'a|accsource=s' => \$opts{'a'},
's|skipwt' => \$opts{'s'},
'g|idstart=i' => \$opts{'g'},
'pp|parent=s' => \$opts{'pp'},
'<>' => sub{push(@random_args,shift(@_));}
) or pod2usage(2);
my $version = Sanger::CGP::Pindel::OutputGen::BamUtil->VERSION;
if(defined $opts{'v'}){
print "Version: $version\n";
exit;
}
pod2usage(-verbose => 1) if(defined $opts{'h'});
pod2usage(-verbose => 2) if(defined $opts{'m'});
pod2usage(-message => "\nERROR: unrecognised commandline arguments: ".join(', ',@random_args).".\n", -verbose => 1, -output => \*STDERR) if(scalar @random_args) ;
pod2usage(-message => "\nERROR: w|wtbam must be defined.\n", -verbose => 1, -output => \*STDERR) unless($opts{'wt'});
pod2usage(-message => "\nERROR: w|wtbam |".$opts{'wt'}."| must be a valid file.\n", -verbose => 1, -output => \*STDERR) unless(-f $opts{'wt'});
pod2usage(-message => "\nERROR: w|wtbam |".$opts{'wt'}."| cannot locate index file.\n", -verbose => 1, -output => \*STDERR) unless(-f $opts{'wt'}.'.bai' || -f $opts{'wt'}.'.csi' || -f $opts{'wt'}.'.crai');
pod2usage(-message => "\nERROR: m|mtbam must be defined.\n", -verbose => 1, -output => \*STDERR) unless($opts{'mt'});
pod2usage(-message => "\nERROR: m|mtbam |".$opts{'mt'}."| must be a valid file.\n", -verbose => 1, -output => \*STDERR) unless(-f $opts{'mt'});
pod2usage(-message => "\nERROR: w|wtbam |".$opts{'mt'}."| cannot locate index file.\n", -verbose => 1, -output => \*STDERR) unless(-f $opts{'mt'}.'.bai' || -f $opts{'mt'}.'.csi' || -f $opts{'mt'}.'.crai');
pod2usage(-message => "\nERROR: r|ref must be defined.\n", -verbose => 1, -output => \*STDERR) unless($opts{'r'});
pod2usage(-message => "\nERROR: r|ref |".$opts{'r'}."| must be a valid file.\n", -verbose => 1, -output => \*STDERR) unless(-f $opts{'r'});
if(scalar @{$opts{'sp'}} > 0 ){
$opts{'sp'}="@{$opts{'sp'}}";
}
else {
delete $opts{'sp'};
}
return \%opts;
}
__END__
=head1 NAME
pindel_2_combined_vcf.pl - Takes a raw Pindel file and a pair of wild type/mutant bam files and produces a combined .vcf file.
=head1 SYNOPSIS
pindel_2_combined_vcf.pl [options]
Required parameters:
-output -o File path to output. Defaults to STDOUT.
-samoutput -so File path-stub to sam file output. If present will create two sam files <path-stub>_wt|mt.sam containing the pindel call reads.
-wtbam -wt File path to the wild type bam/cram file (index must be co-located).
-mtbam -mt File path to the mutant bam/cram file (index must be co-located).
-ref -r File path to the reference file used to provide the coordinate system.
Optional parameters:
-input -i File path to read in.
-inputdir -d Directory path to read in. All files ending in _D or _SI.
(If neither -i nor -d are set, input will be read from STDIN)
-wtstudy -wts String representing the wild type sample study.
-mtstudy -mts String representing the mutant sample study.
-wtprot -wtp String representing the wild type sample sequence protocol (e.g. genomic, targeted, RNA-seq).
-mtprot -mtp String representing the mutant sample sequence protocol (e.g. genomic, targeted, RNA-seq).
-wtacc -wta String representing the wild type sample accession id.
-mtacc -mta String representing the mutant sample accession id.
-accsource -a String representing the source of the accession id (e.g. COSMIC_SAMPLE_ID, EGA etc...)
-skipwt -s If present, will skip variants where there are more wt calls than mt.
-idstart -g Will set a sequential id generator to the given integer value. If not present will assign UUIDs to each variant.
-assembly -as Reference assembly name, used when not found in BAM/CRAM headers.
-species -sp Species name, used when not found in BAM/CRAM headers.
-parent -pp Process information from parent program (where one exists)
Other:
-help -h Brief help message.
-man -m Full documentation.
-version -v Prints the version number.
=head1 OPTIONS
=over 8
=item B<-input>
File path to read. Accepts only raw pindel files.
=item B<-samoutput>
File path-stub to sam file output. If present will create two sam files <path-stub>_wt|mt.sam containing
the pindel call reads. These files are used to display the pindel reads in various browsers.
=item B<-output>
File path to output data. If this option is omitted the script will attempt to write to STDOUT.
=item B<-wtbam>
File path to read. Accepts BAM/CRAM files. This file is treated as the wild type sample.
=item B<-mtbam>
File path to read. Accepts BAM/CRAM files. This file is treated as the mutant sample.
=item B<-wtstudy>
String identifying the study to which the wild type sample belongs.
=item B<-mtstudy>
String identifying the study to which the mutant sample belongs.
=item B<-wtprot>
String identifying the sequence protocol that wild type sample was sequenced under (e.g. genomic, targeted, RNA-seq).
=item B<-mtprot>
String identifying the sequence protocol that mutant sample was sequenced under (e.g. genomic, targeted, RNA-seq).
=item B<-wtacc>
String identifying the wild type sample accession identifier.
=item B<-mtacc>
String identifying the mutant sample accession identifier.
=item B<-accsource>
String identifying the accession source (e.g. COSMIC_SAMPLE_ID, EGA etc...).
=item B<-assembly>
Reference assembly name, used when not found in BAM headers. Validated against header if both are present.
=item B<-species>
Species name, used when not found in BAM headers. Validated against header if both are present.
=item B<-skipwt>
If present, will skip variants where there are more wt calls than mt. Unfortunately the variant will still be processed in the background.
=item B<-idstart>
Will set the id generator to the given integer value. If not present will assign UUIDs to each variant.
=item B<-help>
Print a brief help message and exits.
=item B<-man>
Prints the manual page and exits.
=item B<-version>
Prints the version number and exits.
=back
=head1 DESCRIPTION
B<Pindel2CombinedVcf.pl> will attempt to generate a vcf file from a Pindel output file and bwa pileup information.
For every variant called by Pindel a pileup will be performed and the results merged into a single vcf record.
=cut