-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparse_blastn_local.pl
executable file
·458 lines (403 loc) · 17.1 KB
/
parse_blastn_local.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
#!/usr/bin/perl -w
use strict;
# Default options
my $E_max=10;
my $E_min=-1.0;
my $P_max=1;
my $cov_thrshd=0;
my $sc_thrshd=-10;
my $qid_thrshd=0;
my $bl=-10; # minimum per-residue bit score with query at ends of HSP for loose end pruning
my $bs=-10; # minimum per-residue bit score with query at ends of HSP for strict end pruning
my $bg=30; # below this number of end gaps the loose HSP pruning score is used
my $outformat="fas";
my $append=0;
my $query_file="";
my $infile;
my $outfile;
my $v=2;
# Variable declarations
my $i; # residue index
my $j; # residue index
my $k; # sequence index
my $options="";
my $line; # line read in from file
my $query_length=0; # number of residues in query sequence
my $query_match=0; # number of upper-case residues (=match states) in query sequence
my $capitalize=0; # capitalize query
my $nameline; # >template_name
my $Evalue; # e-value of hit
my $score; # bit score of hit
my $hit_length; # number of residues in HSP
my $coverage; # hit-length/$query_length
my $score_col; # score per column
my $score_min=0; # $score_min=-3*log($P_max)/log(2);
my $query_name; # name of query file
my $queryseq; # residues of query read in with -q or -q2m option
my $qfirst; # index of first query residue in pairwise alignment
my $qlast; # index of last query residue in pairwise alignment
my $tfirst; # index of first template residue in pairwise alignment
my $tlast; # index of last template residue in pairwise alignment
my $tlen=0; # length of template in pairwise alignment
my @query_res; # query residues from current pairwise alignment
my @template_res; # template residues from current pairwise alignment
my $query_res; # query residues from current pairwise alignment
my $template_res; # template residues from current pairwise alignment
my $line_number=0;
my $new_hit=""; # new sequence record; is only written if coverage threshold is exceeded
my $nhit=0; # counts the number of sequences already in alignment
my @hitnames; # $hitnames[$nhit] is the nameline of the ihit'th hit
my @hitseqs; # $hitseqs[$nhit] contains the residues of the ihit'th hit
my @match; # for -q option: $match[$i]=1 if $i'th query residue is capital letter in query, else 0
my $qid; # $qid is sequence identity with query (for -q option: CONSIDER ONLY MATCH STATES)
my $len; # $len is sequence number of residues of seq k aligned with a match state in query
my $b; # minimum per-residue bit score with query at ends of HSP
my $pfile=""; # alignment file used to calculate PSSM for -p and s/c options
my $bfile=""; # alignment file used to calculate PSSM for -b option
my $GAP=11.0/3.0; # gap opening penalty in bits (for BLOSUM62: 11 bits/3)
my $EXTEND=1.0/3.0; # gap extension penalty in bits (for BLOSUM62: 1 bits/3)
my @queryseq;
my $skip=0; # skip this template sequence because it might be a synthetic fusion protein
my $best=0; # extract only the best HSP per sequence
my $rescaled_Gonnet=0; # Gonnet matrix not yet rescaled to bits
my @qp=(); # $qb[$i][$a] is PSSM from alignment read in with -B option
my @qb=(); # $qp[$i][$a] is PSSM from alignment read in with -P option
$infile=$ARGV[0];
$query_file= $ARGV[1];
$outfile=$ARGV[2];
#Include query sequence as first sequence in alignment?
if ($query_file) {
open(QUERYFILE,"<$query_file") or die ("ERROR: Cannot open $query_file: $!\n");
while($line=<QUERYFILE>) # Read name line
{
if ($line=~/^>(.*)/)
{
$query_name=$1;
last;
}
}
$hitseqs[0]="";
while($line=<QUERYFILE>) # Read residues
{
if ($line=~/^>/) {last;}
chomp($line);
$line=~s/\s+//g; # remove white space
$hitseqs[0].=$line;
}
close(QUERYFILE);
# Prepare name line of hit
if ($outformat eq "psi") {
$query_name=~/^(\S{1,20})\S*\s*(.*)/; # delete everything after first block
$line=sprintf("%s",$1);
$line=~ tr/ /_/;
$hitnames[0] = sprintf("%-31.31s ",$line);
} else {
$hitnames[0] = sprintf(">%s E=0.0",$query_name);
}
$hitseqs[0] =~ tr/-.//d; # delete all gaps from query
$queryseq = $hitseqs[0];
$hitseqs[0] =~ tr/a-z/A-Z/d; # capitalize hitseq[0] and delete gaps
# $hitseqs[0] =~ tr/Uu/Cc/; # nicht mehr noetig in blast. Kann aber alignhits.pl zum abschmieren bringen.
$nhit=1;
# Capitalize query?
if ($capitalize) {$queryseq =~ tr/a-z/A-Z/;}
$query_match = ($queryseq=~tr/A-Z/A-Z/); # count number of match states in query
# Determine match columns as those with upper case residue in query
@queryseq=unpack("C*",$queryseq);
for ($j=0; $j<@queryseq; $j++) {
if ($queryseq[$j]>=65 && $queryseq[$j]<=90) {$match[$j]=1;} else {$match[$j]=0;}
}
}
# Scan Blast output file for query length (needed for coverage)
open(INFILE,"<$infile") or die ("Error: cannot open $infile: $!\n");
$line_number++;
while ($line=<INFILE>)
{
if ($line=~/^Length\s*=\s*(\d+)/) {$query_length = $1; last;}
$line_number++;
}
#print("Query length = $query_length\n");
while ($line = <INFILE>) #scan through PsiBlast-output line by line
{
# New nameline found?
#print "$line";
#if ($line=~/^Length\s*=\s*(\d+)/) { print "length=$1\n\n\n\n";}
if ($line=~s/^>//)
{
#print "$line";
$line=~s/\s+/ /g;
chomp($line);
$nameline=$line;
while ($line=<INFILE>)
{
if ($line=~/^Length\s*=\s*(\d+)/) {last;}
chomp($line);
$nameline.=$line;
}
$line=~/^Length\s*=\s*(\d+)/;
$tlen=$1;
$nameline=~s/\s+/ /g;
$nameline=~s/\s+gi\|/ gi\|/g;
# Is sequence a synthetic fusion protein ?
#if ($nameline=~/(\[synthetic| synthetic|construct|cloning|vector|chimeric|fusion)/i) {$skip=1;} else {$skip=0;}
#print "$nameline\n";
}
# New HSP found?
elsif (!$skip && $line=~/^ Score =/)
{
if($best) {$skip=1;} # skip all following hits with same sequence?
# First check whether E-value is small enough
if($line =~ /^ Score =\s*(\S+)\s*bits\s*\S*\s*Expect =\s*(\S+)/)
{
$score=$1;
$Evalue=$2;
#print "$score, $Evalue\n";
}
else
{
print("\nWARNING: wrong format in blast output. Expecting Score = ... Expect = ..\n$line\n");
}
$Evalue=~s/^(e|E)/1$1/; # Expect = e-123 -> 1e-123
$Evalue=~tr/,//d;
if ($Evalue>$E_max || $Evalue<$E_min) {$new_hit=""; next;} # reject hit
# Record sequence identity
# (not needed, qid calculated afterwards WITHOUT counting template residues aligned to gaps in query)
$line=<INFILE>;
if ($line =~ /^ Identities =\s*\S+\/(\S+)\s+\((\S+)%\)/)
{
$qid=$2;
#print "$qid\n";
$line=<INFILE>;
}
else
{
$qid=0.0; # if match is too poor then no identities are given
}
# Skip another line and read following line
$line=<INFILE>;
$line=<INFILE>;
# Read pairwise alignment
$qfirst="";
$tfirst="";
$query_res="";
$template_res="";
while ($line=~/^Query\s+\d+\s+\S+\s+\d*/) # Cycle in this loop until no new "Query:" lines are found
{
if ($line!~/^Query\s+(\d+)\s+(\S+)\s+(\d*)/)
{
print("WARNING 1: wrong format of blast output in $infile, line $.\n");
last;
}
if ($3 eq "") {
<INFILE>; <INFILE>; <INFILE>; $line=<INFILE>;
print("WARNING 2: wrong format of blast output in $infile, line $. Skipping alignment block.\n");
next;
}
if ($qfirst eq "") {$qfirst=$1;}
$query_res .= $2;
$qlast=$3;
<INFILE>; $line=<INFILE>;
if ($line!~/^Sbjct\s+(\d+)\s+(\S+)\s+(\d+)/)
{
print("WARNING 3: wrong format of blast output in $infile, line $.\n");
last;
}
if ($tfirst eq "") {$tfirst=$1;}
$template_res .= $2;
$tlast=$3;
<INFILE>; $line=<INFILE>;
} # end while(1)
# Check lengths
$query_res = uc($query_res);
$template_res = uc($template_res);
if (length($template_res)!=length($query_res)) {
print("WARNING: Query and template lines do not have the same length in $infile, line $.\n");
print("Q: $query_res\n");
print("T: $template_res\n");
next;
}
#print "$query_res\n";
#print "$template_res\n";
# Check whether hit has sufficient score per column
$hit_length=($template_res=~tr/a-zA-Z/a-zA-Z/);
if ($hit_length==0) {next;} # Reject hit?
$score_col=$score/$hit_length;
@query_res =unpack("C*",$query_res);
@template_res=unpack("C*",$template_res);
# Prune ends of HSP which are not reliably homologous
#if (($bs>-9 || $bl>-9) && !&PruneHSP()) {next;} # if entire HSP is pruned away, goto next alignment
# Check whether hit has sufficient sequence identity and coverage with query
if (!$query_file)
{
$len=0; $qid=0;
for ($i=0; $i<scalar(@query_res); $i++)
{
if ($template_res[$i]!=45 && $query_res[$i]!=45) { # count only non-gap template residues in match columns!
$len++;
if ($query_res[$i]==$template_res[$i]) {$qid++;}
}
}
$coverage = 100*$len/$query_length;
}
else
{
$len=1; $qid=0; $j=$qfirst-1; # if first_res=1 then $j=0
for ($i=0; $i<scalar(@query_res); $i++)
{
if ($query_res[$i]!=45)
{
if ($template_res[$i]!=45 && $match[$j]) { # count only non-gap template residues in match columns!
$len++;
if ($query_res[$i]==$template_res[$i]) {$qid++;}
}
$j++; # $j = next position in query
}
}
$coverage = 100*$len/$query_match;
}
if ($len==0) {next;} # Reject hit?
if (100*$qid/$len<$qid_thrshd) {next;} # Reject hit?
if ($coverage<$cov_thrshd) {next;} # Reject hit?
# print("Q: $query_res\n");
# print("T: $template_res\n\n");
# Check score per column
if ($sc_thrshd>-9 || $score_min>0) {
if (!&CheckScorePerColumn()) {next;}
}
if ($v>=3) {printf("nhit=%-2i qid=%-3i qlen=%-3i qid=%-3i%% s/c=%-6.3f\n",$nhit,$qid,$len,100*$qid/$len,$score_col);}
# Record residues
$new_hit = "-"x($qfirst-1); # Print gaps at beginning of sequence
if ($outformat eq "psi") {
for ($i=0; $i<scalar(@query_res); $i++) {
if ($query_res[$i]!=45) { # residues aligned to gaps are ignored
$new_hit .= uc(chr($template_res[$i])); # UPPER case if aligned with a query residue (match state)
}
}
} else {
for ($i=0; $i<scalar(@query_res); $i++) {
if ($query_res[$i]!=45) {
$new_hit .= uc(chr($template_res[$i])); # UPPER case if aligned with a query residue (match state)
} else {
if($template_res[$i]!=45) {
$new_hit.=lc(chr($template_res[$i])); # lower case if aligned with a gap in the query (insert state)
}
}
}
}
$new_hit .= "-" x ($query_length-$qlast); # Print gaps at end of sequence
# $new_hit =~ tr/Uu/Cc/; # nicht mehr noetig in blast. Kann aber alignhits.pl zum abschmieren bringen.
$hitseqs[$nhit] = $new_hit;
# printf("%s\n",$new_hit);
# Prepare name line of hit
if ($outformat eq "psi") {
$nameline=~/^(\S{1,20})\S*\s*(.*)/; # delete everything after first block
$line=sprintf("%s(%i-%i:%i)",$1,$tfirst,$tlast,$tlen);
$line=~ tr/ /_/;
$hitnames[$nhit] = sprintf("%-31.31s ",$line);
} else {
$nameline=~/^(\S*)\s*(.*)/; # delete everything after first block
$hitnames[$nhit] = sprintf(">%s(%i-%i:%i) %s E=%g s/c=%4.2f id=%.0f%% cov=%.0f%%",
$1,$tfirst,$tlast,$tlen,$2,$Evalue,$score_col,100*$qid/$len,$coverage);
}
$nhit++;
#print "$nhit\n" if($nhit%100 ==0);
} # end elseif new HSP found
} # end while ($line)
close(INFILE);
# If output format is fasta or a2m we have to insert gaps:
if ($outformat ne "psi")
{
my @len_ins; # $len_ins[$j] will count the maximum number of inserted residues after match state $j.
my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $k after the $j'th match state
my $insert;
my $ngap;
# For each match state determine length of LONGEST insert after this match state and store in @len_ins
for ($k=0; $k<$nhit; $k++) {
# split into list of single match states and variable-length inserts
# ([A-Z]|-) is the split pattern. The parenthesis indicate that split patterns are to be included as list elements
# The '#' symbol is prepended to get rid of a perl bug in split
$j=0;
@inserts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#");
# printf("%3i: %12.12s %s\n",$k,$hitnames[$k],$hitseqs[$k]);
# printf("Sequence $k: @inserts\n");
foreach $insert (@inserts) {
if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) {
$len_ins[$j]=length($insert);
}
$j++;
# printf("$insert|");
}
# for (my $i=0; $i<@inserts; $i++) {printf("%s%-2i ",$inserts[$i],$len_ins[$i]);}
# printf("\n");
}
# After each match state insert residues and fill up with gaps to $len_ins[$i] characters
for ($k=0; $k<$nhit; $k++) {
# split into list of single match states and variable-length inserts
@inserts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#");
$j=0;
# append the missing number of gaps after each match state
foreach $insert (@inserts) {
if($outformat eq "fas") {
for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.="-";}
}
else {
for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.=".";}
}
$j++;
}
$hitseqs[$k] = join("",@inserts);
$hitseqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end
}
}
if ($query_file) {
# Determine match states
my @qa2m = unpack("C*",$hitseqs[0]); # $hitseq[0] is query sequence WITH INSERTS
my @matchali=();
my $L=scalar(@qa2m);
$j=0;
for ($i=0; $i<@match; $i++) {
while ($j<$L && !($qa2m[$j]>=65 && $qa2m[$j]<=90)) {$matchali[$j++]=0;} #move to column with next upper case residue
$matchali[$j++]=$match[$i]; #is next query residue upper-case or not?
}
# Set all match states to upper case, non-match states to lower case
my @res;
for ($k=0; $k<$nhit; $k++) {
@res = unpack("C*",$hitseqs[$k]);
# printf("Q: %s\n",$hitseqs[0]);
# printf("T: %s\n",$hitseqs[$k]);
for ($i=0; $i<@res; $i++) {
if ($matchali[$i]) {
if ($res[$i]>=97 && $res[$i]<=122) {$res[$i]-=32;} #convert to upper case
} else {
if ($res[$i]>=65 && $res[$i]<=90) {$res[$i]+=32;} # convert to lower case
elsif ($res[$i]==45) {$res[$i]=46;} # convert '-' to '.'
}
# printf("%3i Q:%s T:%s match=%i len=%i\n",$i,chr($qa2m[$i]),chr($res[$i]),$qid[$k],$len);
}
$hitseqs[$k] = pack("C*",@res);
}
}
# Remove gaps? Captialize?
if ($outformat eq "ufas") {
for ($k=0; $k<$nhit; $k++) {$hitseqs[$k]=~tr/a-z.-/A-Z/d;} # Transform to upper case and remove all gaps
} elsif ($outformat eq "fas") {
for ($k=0; $k<$nhit; $k++) {$hitseqs[$k]=~tr/a-z./A-Z-/;} # Transform to upper case
} elsif ($outformat eq "a3m") {
for ($k=0; $k<$nhit; $k++) {$hitseqs[$k]=~tr/.//d;} # Remove gaps aligned to inserts
}
# Write sequences into output file
open (OUTFILE, ">$outfile") or die ("cannot open $outfile:$!\n");
if ($outformat eq "psi") {
for ($k=0; $k<$nhit; $k++) {
$hitseqs[$k] =~ tr/./-/;
printf(OUTFILE "%s %s\n",$hitnames[$k],$hitseqs[$k]);
}
}
else {
for ($k=0; $k<$nhit; $k++) {
printf(OUTFILE "%s\n%s\n",$hitnames[$k],$hitseqs[$k]);
}
}
close OUTFILE;
if ($v>=1) {printf("$nhit sequences extracted from $infile and written to $outfile\n");}
exit(0);