-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
failed experiment to compress quality scores
- Loading branch information
0 parents
commit 41c1060
Showing
5 changed files
with
924 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
///////////////////////////////////////////////////////////////////////////// | ||
// SOFTWARE COPYRIGHT NOTICE AGREEMENT // | ||
// This software and its documentation are copyright (2008) by the // | ||
// Broad Institute/Massachusetts Institute of Technology. All rights // | ||
// are reserved. This software is supplied without any warranty or // | ||
// guaranteed support whatsoever. Neither the Broad Institute nor MIT // | ||
// can be responsible for its use, misuse, or functionality. // | ||
///////////////////////////////////////////////////////////////////////////// | ||
/* | ||
* \file BGZF.cc | ||
* \author tsharpe | ||
* \date May 21, 2009 | ||
* | ||
* \brief Utilities for writing BAM files. | ||
*/ | ||
#include "BGZF.h" | ||
#include <zlib.h> | ||
#include <iostream> | ||
#include <stdlib.h> | ||
#include <string.h> | ||
|
||
namespace | ||
{ | ||
void fatalErr( char const* msg ) | ||
{ | ||
std::cerr << "Can't write BAM file: " << msg << std::endl; | ||
exit(1); | ||
} | ||
} | ||
|
||
unsigned int BGZFBlock::compress( void* data, unsigned int len ) | ||
{ | ||
while ( !tryCompress(data,len) ) | ||
; | ||
return len; | ||
} | ||
|
||
bool BGZFBlock::tryCompress( void* data, unsigned int& len ) | ||
{ | ||
z_stream zs; | ||
zs.zalloc = 0; | ||
zs.zfree = 0; | ||
zs.opaque = 0; | ||
if ( deflateInit2(&zs,Z_DEFAULT_COMPRESSION,Z_DEFLATED,WINDOW_BITS,MEM_LEVEL,Z_DEFAULT_STRATEGY) != Z_OK ) | ||
fatalErr("Can't initialize BAM file's z_stream."); | ||
|
||
zs.next_in = reinterpret_cast<Bytef*>(data); | ||
zs.avail_in = len; | ||
zs.next_out = mDataBlock; | ||
zs.avail_out = sizeof(mDataBlock); | ||
if ( deflate(&zs,Z_FINISH) != Z_STREAM_END ) | ||
{ | ||
zs.avail_in += 256; | ||
if ( zs.avail_in >= len ) | ||
fatalErr("Can't deflate even a tiny amount of data for BAM file."); | ||
len -= zs.avail_in; // back off by the amount we were unable to compress, and a little more | ||
return false; | ||
} | ||
|
||
if ( deflateEnd(&zs) != Z_OK ) | ||
fatalErr("Can't finalize BAM file's z_stream."); | ||
|
||
unsigned char* ppp = mDataBlock + zs.total_out; | ||
unsigned int crc = crc32(crc32(0,0,0),reinterpret_cast<Bytef*>(data),len); | ||
memcpy(ppp,&crc,sizeof(unsigned int)); // setting mCRC32 | ||
ppp += sizeof(unsigned int); | ||
memcpy(ppp,&len,sizeof(unsigned int)); // setting mInputSize | ||
ppp += sizeof(unsigned int); | ||
mBlockSizeLessOne = (ppp - reinterpret_cast<unsigned char*>(this)) - 1U; | ||
|
||
return true; | ||
} | ||
|
||
BGZFStreambuf::int_type BGZFStreambuf::overflow( int_type ch ) | ||
{ | ||
if ( ch != traits_type::eof() ) | ||
{ | ||
*pptr() = ch; | ||
pbump(1); | ||
} | ||
|
||
char* ppp = pptr(); | ||
unsigned int inLen = ppp - mBuf; | ||
if ( !inLen ) | ||
return 0; // EARLY RETURN! | ||
|
||
setp(mBuf,epptr()); | ||
|
||
BGZFBlock output; | ||
inLen -= output.compress(mBuf,inLen); | ||
if ( inLen ) | ||
{ | ||
memmove(mBuf,ppp-inLen,inLen); | ||
pbump(inLen); | ||
} | ||
|
||
if ( mpSB->sputn(reinterpret_cast<char const*>(&output),output.getBlockSize()) != output.getBlockSize() ) | ||
fatalErr("Can't write to BAM file."); | ||
|
||
return 0; | ||
} | ||
|
||
int BGZFStreambuf::sync() | ||
{ | ||
while ( pptr() != pbase() ) | ||
overflow( traits_type::eof() ); | ||
return 0; | ||
} | ||
|
||
BAMostream::BAMostream( char const* bamFile ) | ||
: std::ostream(&mSB), mSB(&mFilebuf) | ||
{ | ||
mFilebuf.open(bamFile,std::ios_base::out|std::ios_base::binary|std::ios_base::trunc); | ||
} | ||
|
||
void BAMostream::close() | ||
{ | ||
mSB.pubsync(); | ||
mFilebuf.pubsync(); | ||
mFilebuf.close(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
///////////////////////////////////////////////////////////////////////////// | ||
// SOFTWARE COPYRIGHT NOTICE AGREEMENT // | ||
// This software and its documentation are copyright (2008) by the // | ||
// Broad Institute/Massachusetts Institute of Technology. All rights // | ||
// are reserved. This software is supplied without any warranty or // | ||
// guaranteed support whatsoever. Neither the Broad Institute nor MIT // | ||
// can be responsible for its use, misuse, or functionality. // | ||
///////////////////////////////////////////////////////////////////////////// | ||
/* | ||
* \file BGZF.h | ||
* \author tsharpe | ||
* \date May 21, 2009 | ||
* | ||
* \brief Utilities for writing BAM files. | ||
*/ | ||
#ifndef LOOKUP_BGZF_H_ | ||
#define LOOKUP_BGZF_H_ | ||
|
||
#include <fstream> | ||
|
||
class GZIPHeader | ||
{ | ||
public: | ||
GZIPHeader() | ||
: mID1(31), mID2(139), mCompressionMethod(8), mFlags(4), mModTime(0), | ||
mExtraFlags(0), mOpSys(255), mXLen(6), mSI1('B'), mSI2('C'), mSLen(2) | ||
{} | ||
|
||
// compiler-supplied copying and destructor are OK | ||
|
||
private: | ||
unsigned char mID1; | ||
unsigned char mID2; | ||
unsigned char mCompressionMethod; | ||
unsigned char mFlags; | ||
unsigned int mModTime; | ||
unsigned char mExtraFlags; | ||
unsigned char mOpSys; | ||
unsigned short mXLen; | ||
unsigned char mSI1; | ||
unsigned char mSI2; | ||
unsigned short mSLen; | ||
protected: | ||
unsigned short mBlockSizeLessOne; | ||
}; | ||
|
||
class BGZFBlock : public GZIPHeader | ||
{ | ||
public: | ||
// compiler-supplied no-arg constructor, copying and destructor are OK | ||
|
||
// returns the amount of uncompressed data put into the block. | ||
// if len is too big, or the data is too incompressible, not all of the supplied | ||
// data will fit into a block. | ||
unsigned int compress( void* data, unsigned int len ); | ||
|
||
unsigned int getBlockSize() | ||
{ return mBlockSizeLessOne + 1U; } | ||
|
||
private: | ||
bool tryCompress( void* data, unsigned int& len ); | ||
|
||
// mBlockSizeLessOne is 16 bits, so 64K is the max block length and there are 26 bytes of header and footer | ||
unsigned char mDataBlock[64*1024UL-26UL]; | ||
unsigned int mCRC32; // these last two members actually immediately follow the variable-length data block | ||
unsigned int mInputSize; // this struct is what a maximum-size block looks like | ||
|
||
static int const WINDOW_BITS = -15; | ||
static int const MEM_LEVEL = 8; | ||
}; | ||
|
||
class BGZFStreambuf : public std::streambuf | ||
{ | ||
public: | ||
BGZFStreambuf( std::streambuf* psb ) | ||
: mpSB(psb) | ||
{ setp(mBuf,mBuf+sizeof(mBuf)-1); } | ||
|
||
~BGZFStreambuf() | ||
{ sync(); } | ||
|
||
private: | ||
BGZFStreambuf( BGZFStreambuf const& ); // undefined -- no copying | ||
BGZFStreambuf& operator=( BGZFStreambuf const& ); // undefined -- no copying | ||
|
||
int_type overflow( int_type ch ); | ||
int sync(); | ||
|
||
std::streambuf* mpSB; | ||
char mBuf[128UL*1024UL]; | ||
}; | ||
|
||
class BAMostream : public std::ostream | ||
{ | ||
public: | ||
BAMostream( char const* bamFile ); | ||
void close(); | ||
|
||
std::filebuf mFilebuf; | ||
BGZFStreambuf mSB; | ||
}; | ||
|
||
#endif /* LOOKUP_BGZF_H_ */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
all: OQCompress | ||
OQCompress: OQCompress.cc BGZF.h BGZF.cc | ||
g++ -std=c++11 -fno-strict-aliasing -Wextra -Wall -Wsign-promo -Woverloaded-virtual -Wendif-labels -march=native -g -o OQCompress OQCompress.cc BGZF.cc -lz |
Oops, something went wrong.