-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtagit6.pl
248 lines (231 loc) · 8.69 KB
/
tagit6.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#!/usr/bin/perl
use strict;
use warnings;
# ------------------------------------------------- #
# - Author: Jun Yu Jacinta Cai & Dr. Puri Bangalore & Dr. Robert Thacker
# - Last modified: 22 May 2018
# - Version: 5.0.lb
# - CS Research with Dr. Thacker & Dr. Bangalore
# - Description: Tags line with corresponding list
# provided by the user
# - Usage: perl tagit5.pl inputFileName tagListFileName outputFileName
#
# Note: _lb linebreaks fixed? for legacy mac users
# ------------------------------------------------- #
my %tagList;
my $inputfile;
# get the input metadata file
my $inputFileName = $ARGV[0];
if (! open INPUT, "<$inputFileName") {
warn "Could not open input file";
exit;
}
#get the tag list file
my $tagListFileName = $ARGV[1];
if (! open TAGLIST, "<$tagListFileName") {
warn "Could not open tag list file not found";
exit;
}
#get the output file name
my $outputFileName = $ARGV[2];
if (! open OUTPUT, ">$outputFileName") {
warn "Could not open output file";
exit;
}
#read the tag list file
#must have format like
#parent
# parent
# child1
my $tagline; #will contain initial read, could be multiple lines if legacy Mac
my @taglines; #will hold all of lines in the file
{local $/=undef; #temporarily set line break as undefined for a local block
while($tagline = <TAGLIST>) {
my @lines = split /\r\n?|\n/, $tagline;
push @taglines, @lines;
}
} #end the local block
#note: this approach reads the entire file into memory; might fail for big files
#populate %tagList hash
my $parent = ""; #initialize parent tag
foreach my $tag (@taglines) {
chomp($tag);
$tag =~ s/\r//; #just in case, also remove CRs
$tag = lc($tag); # make tags lowercase
if ($tag =~ /\t(\w+)/) {
# If the line has a tab, then push the value
# into the reference array within the correct hash key
# these are the "children" terms that invoke the "parent" tag
push @{$tagList{$parent}}, $1;
} else {
# If the line doesn't have a tab, then make a new
# key with an empty reference array
$tagList{$tag} = [];
$parent = $tag;
}
}
#read the metadata file
#first line should contain headers
#note: this approach reads the entire file into memory; might fail for big files
my $metaline; #will contain initial read, could be multiple lines if legacy Mac
my @metalines; #will hold all of the lines in the file
{local $/=undef; #temporarily set line break as undefined for a local block
while($metaline = <INPUT>) {
my @lines = split /\r\n?|\n/, $metaline;
push @metalines, @lines;
}
} #end the local block
my %tagHash; #set up tagHash to count tags
my $noTagCount = 0; #count number without tags for later use
my @untaggedRecords; #set up @untaggedRecords as an array of hash references
my @taggedRecords; #set up @taggedRecords as an array of hash references
my $counter = 0;
foreach my $line (@metalines) {
#step through line by line of input file; the lines are in the array @metalines
chomp($line);
$line =~ s/\r//; #just in case, also remove CRs
if ($counter == 0) {
print OUTPUT "$line"."\t"."tag"."\n"; #adds header to output file
$counter = 1;
} else {
my $tagB = 0; #track if tag added or not
while(my($k, $v) = each %tagList) {
# Steps through the key / values of the hash
if ($tagB == 1) {}
else {
foreach (@$v) {
# looks at each value of the hash
if ($tagB == 1) {} #move along if there is already a match - only the first match in tag list counts
else {
if(lc($line) =~ /\b$_\b/) { #added word boundaries - yes, this helps
# searches the input file line and see if it
# contains the value of the hash
# If it does have it, then print out the corresponding
# key with the line (to tag the line)
print OUTPUT "$line"."\t"."$k"."\n";
$tagHash{$k} += 1;
$tagB = 1;
# make a hash for output of taggedRecords only
my @elements = split ('\t', $line);
my $hashref = { accession => $elements[0],
definitionLine => $elements[1],
organism => $elements[2],
product => $elements[3],
isolation_source => $elements[4],
db_xref => $elements[5],
host => $elements[6],
clone => $elements[7],
isolate => $elements[8],
strain => $elements[9],
tissue_type => $elements[10],
lat_lon => $elements[11],
country => $elements[12],
note => $elements[13],
lab_host => $elements[14],
tag => $k,
};
push @taggedRecords, $hashref;
}
}
}
}
}
if ($tagB == 0) { #if nothing has matched, need to output "no tag"
print OUTPUT "$line"."\t"."no tag"."\n";
$noTagCount += 1;
my @elements = split ('\t', $line);
my $hashref = { accession => $elements[0],
definitionLine => $elements[1],
organism => $elements[2],
product => $elements[3],
isolation_source => $elements[4],
db_xref => $elements[5],
host => $elements[6],
clone => $elements[7],
isolate => $elements[8],
strain => $elements[9],
tissue_type => $elements[10],
lat_lon => $elements[11],
country => $elements[12],
note => $elements[13],
lab_host => $elements[14],
};
push @untaggedRecords, $hashref;
}
}
}
close INPUT;
#show the user the counts of how many of each tag exist
print "\n\nNumber of each tag:\n";
sortedPrint (%tagHash);
print "\nNumber of records without tags: ".$noTagCount."\n";
#add this information to the output file
select (OUTPUT);
print "\n\nNumber of each tag:\n";
sortedPrint (%tagHash);
print "\nNumber of records without tags: ".$noTagCount."\n";
select (STDOUT);
#show the user the next 5 records without tags
my $index = 4;
if ($noTagCount > 0) {
if ($noTagCount < 5) { $index = $noTagCount; }
print "\nNext records that need tags:\n";
print OUTPUT "\nNext records that need tags:\n";
foreach my $i (0..$index) {
my $hashref = $untaggedRecords[$i];
my %record = %$hashref;
print $record{accession}."\t".$record{definitionLine}."\t".$record{organism};
print "\t".$record{product}."\t".$record{isolation_source}."\t".$record{db_xref};
print "\t".$record{host}."\t".$record{clone}."\t".$record{isolate}."\t".$record{strain};
print "\t".$record{tissue_type}."\t".$record{lat_lon}."\t".$record{country};
print "\t".$record{note}."\t".$record{lab_host}."\n";
select (OUTPUT);
print $record{accession}."\t".$record{definitionLine}."\t".$record{organism};
print "\t".$record{product}."\t".$record{isolation_source}."\t".$record{db_xref};
print "\t".$record{host}."\t".$record{clone}."\t".$record{isolate}."\t".$record{strain};
print "\t".$record{tissue_type}."\t".$record{lat_lon}."\t".$record{country};
print "\t".$record{note}."\t".$record{lab_host}."\n";
select (STDOUT);
}
}
close OUTPUT;
#ask if user wants to export CSV of tagged records for Arbor or R
print "\n\nExport .csv file of tagged records for Arbor or R? (y or n): ";
chomp(my $response = <STDIN>);
if ($response eq "y") {
print "\nEnter filename: ";
chomp(my $exportFileName = <STDIN>);
if (! open EXPORT, ">$exportFileName") {
warn "Could not open export file";
exit;
}
select (EXPORT);
# first print headers # CSV # only really need name on phylogeny and tag
print "name".","."accession".","."definitionLine".","."organism".","."product";
print ","."isolation_source".","."db_xref".","."host";
print ","."clone".","."isolate".","."strain".","."tissue_type".","."lat_lon";
print ","."country".","."note".","."lab_host".","."tag"."\n";
foreach my $hashref (@taggedRecords) {
my %record = %$hashref;
my $name = $record{accession}."_".$record{organism};
$name =~ s/ /_/g; #replace spaces with underscores in the name
$name =~ s/['".-]//g; #remove quotes and periods and hyphens, because the phylogeny workflow removes these
print $name.",".$record{accession}.",".$record{definitionLine}.",".$record{organism};
print ",".$record{product}.",".$record{isolation_source}.",".$record{db_xref};
print ",".$record{host}.",".$record{clone}.",".$record{isolate}.",".$record{strain};
print ",".$record{tissue_type}.",".$record{lat_lon}.",".$record{country};
print ",".$record{note}.",".$record{lab_host}.",".$record{tag}."\n";
}
select STDOUT;
}
close EXPORT;
sub sortedPrint {
my %hash = @_;
my @sortedKeys = sort {
$hash{$b} <=> $hash{$a} #high to low value sort of keys
or $a cmp $b #if tied, alphabetical sort of keys
} keys %hash;
foreach my $key (@sortedKeys) {
print " $key: $hash{$key}\n";
}
}