wip on new name indexing strategy
rbuels committed Dec 16, 2013
1 parent 822c299 commit 2000aca
Showing 6 changed files with 124 additions and 12 deletions.
1 change: 1 addition & 0 deletions Makefile.PL
@@ -55,6 +55,7 @@ my %WriteMakefileArgs = (
   "File::Basename" => 0,
   "File::Copy::Recursive" => 0,
   "File::Path" => 2,
+  "File::Next" => 0,
   "File::Spec" => 0,
   "File::Spec::Functions" => 0,
   "File::Temp" => 0,
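
The new File::Next prerequisite supports the log-replay pass added to Bio::JBrowse::HashStore below, which walks the store directory looking for .log files. A minimal sketch of the File::Next iterator style it relies on ($dir is a hypothetical placeholder for the store directory):

    use File::Next ();

    my $dir = 'names_db';   # hypothetical store directory

    # iterate every .log file under $dir; the filter runs with $_ set
    # to the filename being considered
    my $iter = File::Next::files( { file_filter => sub { /\.log$/ } }, $dir );
    while ( defined( my $path = $iter->() ) ) {
        print "$path\n";
    }
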
5 changes: 5 additions & 0 deletions bin/generate-names.pl
@@ -8,6 +8,11 @@
     require Bio::JBrowse::Cmd::IndexNames::BackCompat;
     exit Bio::JBrowse::Cmd::IndexNames::BackCompat->new(@ARGV)->run;
 }
+
+elsif( 1 ) {
+    require Bio::JBrowse::Cmd::IndexNames2;
+    exit Bio::JBrowse::Cmd::IndexNames2->new(@ARGV)->run;
+}
 else {
     require Bio::JBrowse::Cmd::IndexNames;
     exit Bio::JBrowse::Cmd::IndexNames->new(@ARGV)->run;
6 changes: 4 additions & 2 deletions src/perl5/Bio/JBrowse/Cmd/IndexNames.pm
@@ -461,8 +461,10 @@ sub make_operations {
 
     my $lc_name = lc $record->[0];
     unless( $lc_name ) {
-        warn "WARNING: some blank name records found, skipping.\n"
-            unless $self->{already_warned_about_blank_name_records}++;
+        unless( $self->{already_warned_about_blank_name_records} ) {
+            warn "WARNING: some blank name records found, skipping.\n";
+            $self->{already_warned_about_blank_name_records} = 1;
+        }
         return;
     }
 
1 change: 0 additions & 1 deletion src/perl5/Bio/JBrowse/Cmd/IndexNames/BackCompat.pm
@@ -11,7 +11,6 @@ sub load {
     # on the data in the hash store
     my $operation_stream = $self->make_operation_stream( $self->make_name_record_stream( $ref_seqs, $names_files ), $names_files );
 
-    # finally copy the temp store to the namestore
     $self->vprint( "Using ".$self->name_store->meta->{hash_bits}."-bit hashing (".$self->requested_hash_bits." requested)\n" );
 
     $self->close_name_store;
31 changes: 31 additions & 0 deletions src/perl5/Bio/JBrowse/Cmd/IndexNames2.pm
@@ -0,0 +1,31 @@
+package Bio::JBrowse::Cmd::IndexNames2;
+use strict;
+use warnings;
+
+use base 'Bio::JBrowse::Cmd::IndexNames';
+
+
+sub load {
+    my ( $self, $ref_seqs, $names_files ) = @_;
+
+    # convert the stream of name records into a stream of operations to do
+    # on the data in the hash store
+    my $operation_stream = $self->make_operation_stream( $self->make_name_record_stream( $ref_seqs, $names_files ), $names_files );
+
+    # hash each operation and write it to a log file
+    $self->name_store->stream_do(
+        $operation_stream,
+        sub {
+            my ( $operation, $data ) = @_;
+            my %fake_store = ( $operation->[0] => $data );
+            $self->do_hash_operation( \%fake_store, $operation );
+            return $fake_store{ $operation->[0] } || {};
+        });
+
+}
+
+sub _hash_operation_freeze { $_[1] }
+sub _hash_operation_thaw   { $_[1] }
+
+1;

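Two things are worth noting in this new module. The _hash_operation_freeze/_hash_operation_thaw overrides become identity functions because stream_do (added to HashStore below) already serializes whole operations with Storable, so the parent class's freeze/thaw round-trip is no longer needed. And the single-entry %fake_store adapts the parent's do_hash_operation, which expects a whole store hash, to stream_do's one-key-at-a-time callback. A sketch of that adaptation pattern in isolation, with a hypothetical stand-in for do_hash_operation:

    # hypothetical stand-in for do_hash_operation: mutate the store
    # entry for the operation's key in place
    my $apply_to_store = sub {
        my ( $store, $op ) = @_;
        push @{ $store->{ $op->[0] }{ops} }, $op->[1];
    };

    # wrap the whole-store function as a per-key callback for stream_do
    my $per_key_callback = sub {
        my ( $op, $current_data ) = @_;
        my %fake_store = ( $op->[0] => $current_data );  # one-entry store
        $apply_to_store->( \%fake_store, $op );          # mutates the entry
        return $fake_store{ $op->[0] } || {};            # updated data for this key
    };
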
92 changes: 83 additions & 9 deletions src/perl5/Bio/JBrowse/HashStore.pm
@@ -31,7 +31,7 @@ use warnings;
 
 use Carp;
 
-use Storable;
+use Storable ();
 use JSON 2;
 
 use File::Spec ();
@@ -144,6 +144,54 @@ sub get {
     return $bucket->{data}{$key};
 }
 
+=head2 stream_do( $op_stream, $operation_callback )
+
+Given a stream of operations (a code ref returning one operation per
+call, undef when exhausted), log each operation to a per-bucket log
+file on disk, then replay the logs bucket by bucket, calling the
+callback with each operation and the data currently stored under its
+key, and storing whatever the callback returns.
+
+=cut
+
+sub stream_do {
+    my ( $self, $op_stream, $do_operation ) = @_;
+
+    my $filehandle_cache = $self->_make_cache( size => 30000 );
+
+    # make log files for each bucket, log the operations that happen
+    # on that bucket, but don't actually do them yet
+    while( my $op = $op_stream->() ) {
+        my $key = $op->[0];
+        my $hex = $self->_hexHash( $key );
+        my $log_handle = $filehandle_cache->compute( $hex, sub {
+            my $pathinfo = $self->_hexToPath( $hex );
+            File::Path::mkpath( $pathinfo->{dir} ) unless -d $pathinfo->{dir};
+            #warn "writing $pathinfo->{fullpath}.log\n";
+            CORE::open( my $f, '>>', "$pathinfo->{fullpath}.log" )
+                or die "$! opening bucket log $pathinfo->{fullpath}.log";
+            return $f;
+        });
+
+        Storable::store_fd( $op, $log_handle );
+    }
+
+    # drop the cache to close all the log filehandles
+    undef $filehandle_cache;
+
+    # play back the operations, feeding the $do_operation sub with the
+    # bucket and the operation to be done
+    my $log_iterator = $self->_file_iterator( sub { /\.log$/ } );
+    while( my $log_path = $log_iterator->() ) {
+        ( my $bucket_path = $log_path ) =~ s/\.log$//;
+        my $bucket = $self->_readBucket({ fullpath => $bucket_path, dir => File::Basename::dirname( $bucket_path ) });
+        CORE::open( my $log_fh, '<', $log_path ) or die "$! reading $log_path";
+        unlink $log_path;
+        while( my $op = eval { Storable::fd_retrieve( $log_fh ) } ) {
+            $bucket->{data}{$op->[0]} = $do_operation->( $op, $bucket->{data}{$op->[0]} );
+        }
+    }
+}
+
+use File::Next ();
+sub _file_iterator {
+    my ( $self, $filter ) = @_;
+    return File::Next::files( { file_filter => $filter }, $self->{dir} );
+}
 
 =head2 set( $key, $value )
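
A minimal usage sketch of the new stream_do API, assuming $store is an already-opened Bio::JBrowse::HashStore; the operations and the counting callback are made-up illustrations:

    # an exhaustible stream: a code ref that returns one operation per
    # call and undef when done; by convention $op->[0] is the key
    my @ops = ( [ 'gene1', 'add' ], [ 'gene1', 'add' ], [ 'exon7', 'add' ] );
    my $op_stream = sub { shift @ops };

    $store->stream_do( $op_stream, sub {
        my ( $op, $data ) = @_;    # $data is what the bucket holds for $op->[0]
        $data ||= { count => 0 };
        $data->{count}++;          # toy fold: count operations per key
        return $data;              # stored back under $op->[0]
    });
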
@@ -272,7 +320,7 @@ sub _stream_set_build_buckets {
     print "Rehashing to final HashStore buckets...\n" if $self->{verbose} && ! $progressbar;
 
     while ( my ( $k, $v ) = $kv_stream->() ) {
-        my $hex = $self->_hex( $self->_hash( $k ) );
+        my $hex = $self->_hexHash( $k );
         my $b = $buckets->{$hex};
         if( $b ) {
             $b = Storable::thaw( $b );
@@ -361,11 +409,22 @@ sub UNTIE {
 
 ########## helper methods ###########
 
+# hash the key and format the hash as hex, with caching
+sub _hexHash {
+    my ( $self, $key ) = @_;
+    my $cache = $self->{hex_hash_cache} ||= $self->_make_cache( size => 300 );
+    return $cache->compute( $key, sub {
+        return $self->_hex( $self->_hash( $key ) );
+    });
+}
+
 sub _hash {
     my ( $self, $key ) = @_;
     my $crc = ( $self->{crc} ||= do { require Digest::Crc32; Digest::Crc32->new } )
                   ->strcrc32( $key );
-    return $crc & $self->{hash_mask};
+    $crc &= $self->{hash_mask};
+    return $crc;
 }
 
 sub _hex {
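
For reference, the arithmetic _hash performs, sketched with assumed values; taking hash_mask to be the usual 2**hash_bits - 1 is an assumption, not something this diff shows:

    use Digest::Crc32 ();

    my $hash_bits = 16;                                # assumed setting
    my $hash_mask = 2**$hash_bits - 1;                 # 0xFFFF under that assumption
    my $crc  = Digest::Crc32->new->strcrc32('gene1');  # full 32-bit CRC
    my $hash = $crc & $hash_mask;                      # bucket number, 0 .. 65535
    printf "bucket %x\n", $hash;                       # hex form along the lines of _hex
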
@@ -384,19 +443,34 @@ sub _hexToPath {
 
 sub _getBucket {
     my ( $self, $key ) = @_;
-    return $self->_getBucketFromHex( $self->_hex( $self->_hash( $key ) ) );
+    return $self->_getBucketFromHex( $self->_hexHash( $key ) );
 }
 
+sub _flushAllCaches {
+    my ( $self ) = @_;
+    delete $self->{$_} for (
+        qw(
+            bucket_cache
+            bucket_log_filehandle_cache
+            hex_hash_cache
+            bucket_path_cache_by_hex
+        ));
+}
+
 sub _getBucketFromHex {
     my ( $self, $hex ) = @_;
     my $bucket_cache = $self->{bucket_cache} ||= $self->_make_cache( size => $self->{cache_size} );
     return $bucket_cache->compute( $hex, sub {
-        my $path_cache = $self->{bucket_path_cache_by_hex} ||= $self->_make_cache( size => $self->{cache_size} );
-        my $pathinfo = $path_cache->compute( $hex, sub { $self->_hexToPath( $hex ) });
-        return $self->_readBucket( $pathinfo )
+        return $self->_readBucket( $self->_getBucketPath( $hex ) )
     });
 }
 
+sub _getBucketPath {
+    my ( $self, $hex ) = @_;
+    my $path_cache = $self->{bucket_path_cache_by_hex} ||= $self->_make_cache( size => $self->{cache_size} );
+    return $path_cache->compute( $hex, sub { $self->_hexToPath( $hex ) });
+}
+
 sub _readBucket {
     my ( $self, $pathinfo ) = @_;
 
Expand All @@ -421,7 +495,7 @@ sub _readBucket {
}
} || {}
)
: ( dirty => 1 )
: ( data => {}, dirty => 1 )
));
}

@@ -468,7 +542,7 @@ sub new {
 
 sub compute {
     my ( $self, $key, $callback ) = @_;
-    return $self->{bykey}{$key} ||= do {
+    return exists $self->{bykey}{$key} ? $self->{bykey}{$key} : do {
         my $fifo = $self->{fifo};
         if( @$fifo >= $self->{size} ) {
             delete $self->{bykey}{ shift @$fifo };
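
The last hunk fixes a subtle caching bug: with ||=, a cached value that happens to be false (undef, 0, or the empty string) is recomputed on every lookup, while testing with exists caches false values too. A standalone sketch of the corrected idiom:

    my %cache;
    sub get_or_compute {
        my ( $key, $compute ) = @_;
        return exists $cache{$key} ? $cache{$key} : ( $cache{$key} = $compute->() );
    }

    get_or_compute( 'k', sub { print "computed\n"; 0 } );  # prints once, caches 0
    get_or_compute( 'k', sub { print "computed\n"; 0 } );  # cache hit, prints nothing
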
