-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathburrows-wheeler_file_transform.pl
124 lines (94 loc) · 2.77 KB
/
burrows-wheeler_file_transform.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/perl
# Author: Trizen
# Date: 14 June 2023
# Edit: 17 September 2023
# https://github.com/trizen
# Apply the Burrows–Wheeler transform on a file.
# https://rosettacode.org/wiki/Burrows–Wheeler_transform
# References:
# Data Compression (Summer 2023) - Lecture 12 - The Burrows-Wheeler Transform (BWT)
# https://youtube.com/watch?v=rQ7wwh4HRZM
#
# Data Compression (Summer 2023) - Lecture 13 - BZip2
# https://youtube.com/watch?v=cvoZbBZ3M2A
use 5.036;
use Getopt::Std qw(getopts);
use File::Basename qw(basename);
use constant {
LOOKAHEAD_LEN => 128, # lower values are usually faster
};
sub bwt_balanced ($s) { # O(n * LOOKAHEAD_LEN) space (fast)
#<<<
[
map { $_->[1] } sort {
($a->[0] cmp $b->[0])
|| ((substr($s, $a->[1]) . substr($s, 0, $a->[1])) cmp(substr($s, $b->[1]) . substr($s, 0, $b->[1])))
}
map {
my $t = substr($s, $_, LOOKAHEAD_LEN);
if (length($t) < LOOKAHEAD_LEN) {
$t .= substr($s, 0, ($_ < LOOKAHEAD_LEN) ? $_ : (LOOKAHEAD_LEN - length($t)));
}
[$t, $_]
} 0 .. length($s) - 1
];
#>>>
}
sub bwt_encode ($s) {
my $bwt = bwt_balanced($s);
my $ret = join('', map { substr($s, $_ - 1, 1) } @$bwt);
my $idx = 0;
foreach my $i (@$bwt) {
$i || last;
++$idx;
}
return ($ret, $idx);
}
sub bwt_decode ($bwt, $idx) { # fast inversion: O(n * log(n))
my @tail = split(//, $bwt);
my @head = sort @tail;
if ($idx > $#head) {
die "Invalid bwt-index: $idx (must be <= $#head)\n";
}
my %indices;
foreach my $i (0 .. $#tail) {
push @{$indices{$tail[$i]}}, $i;
}
my @table;
foreach my $v (@head) {
push @table, shift(@{$indices{$v}});
}
my $dec = '';
my $i = $idx;
for (1 .. scalar(@head)) {
$dec .= $head[$i];
$i = $table[$i];
}
return $dec;
}
getopts('dh', \my %opts);
if ($opts{h} or !@ARGV) {
die "usage: $0 [-d] [input file] [output file]\n";
}
my $input_file = $ARGV[0];
my $output_file = $ARGV[1] // (basename($input_file) . ($opts{d} ? '.dec' : '.bw'));
my $content = do {
open my $fh, '<:raw', $input_file
or die "Can't open file <<$input_file>> for reading: $!";
local $/;
<$fh>;
};
if ($opts{d}) { # decode mode
my $idx = unpack('N', substr($content, 0, 4, ''));
my $dec = bwt_decode($content, $idx);
open my $out_fh, '>:raw', $output_file
or die "Can't open file <<$output_file>> for writing: $!";
print $out_fh $dec;
}
else {
my ($bwt, $idx) = bwt_encode($content);
open my $out_fh, '>:raw', $output_file
or die "Can't open file <<$output_file>> for writing: $!";
print $out_fh pack('N', $idx);
print $out_fh $bwt;
}