Skip to content

Commit

Permalink
failed experiment to compress quality scores
Browse files Browse the repository at this point in the history
  • Loading branch information
tedsharpe committed Nov 16, 2015
0 parents commit 41c1060
Show file tree
Hide file tree
Showing 5 changed files with 924 additions and 0 deletions.
121 changes: 121 additions & 0 deletions BGZF.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/////////////////////////////////////////////////////////////////////////////
// SOFTWARE COPYRIGHT NOTICE AGREEMENT //
// This software and its documentation are copyright (2008) by the //
// Broad Institute/Massachusetts Institute of Technology. All rights //
// are reserved. This software is supplied without any warranty or //
// guaranteed support whatsoever. Neither the Broad Institute nor MIT //
// can be responsible for its use, misuse, or functionality. //
/////////////////////////////////////////////////////////////////////////////
/*
* \file BGZF.cc
* \author tsharpe
* \date May 21, 2009
*
* \brief Utilities for writing BAM files.
*/
#include "BGZF.h"
#include <zlib.h>
#include <iostream>
#include <stdlib.h>
#include <string.h>

namespace
{
void fatalErr( char const* msg )
{
std::cerr << "Can't write BAM file: " << msg << std::endl;
exit(1);
}
}

unsigned int BGZFBlock::compress( void* data, unsigned int len )
{
while ( !tryCompress(data,len) )
;
return len;
}

bool BGZFBlock::tryCompress( void* data, unsigned int& len )
{
z_stream zs;
zs.zalloc = 0;
zs.zfree = 0;
zs.opaque = 0;
if ( deflateInit2(&zs,Z_DEFAULT_COMPRESSION,Z_DEFLATED,WINDOW_BITS,MEM_LEVEL,Z_DEFAULT_STRATEGY) != Z_OK )
fatalErr("Can't initialize BAM file's z_stream.");

zs.next_in = reinterpret_cast<Bytef*>(data);
zs.avail_in = len;
zs.next_out = mDataBlock;
zs.avail_out = sizeof(mDataBlock);
if ( deflate(&zs,Z_FINISH) != Z_STREAM_END )
{
zs.avail_in += 256;
if ( zs.avail_in >= len )
fatalErr("Can't deflate even a tiny amount of data for BAM file.");
len -= zs.avail_in; // back off by the amount we were unable to compress, and a little more
return false;
}

if ( deflateEnd(&zs) != Z_OK )
fatalErr("Can't finalize BAM file's z_stream.");

unsigned char* ppp = mDataBlock + zs.total_out;
unsigned int crc = crc32(crc32(0,0,0),reinterpret_cast<Bytef*>(data),len);
memcpy(ppp,&crc,sizeof(unsigned int)); // setting mCRC32
ppp += sizeof(unsigned int);
memcpy(ppp,&len,sizeof(unsigned int)); // setting mInputSize
ppp += sizeof(unsigned int);
mBlockSizeLessOne = (ppp - reinterpret_cast<unsigned char*>(this)) - 1U;

return true;
}

BGZFStreambuf::int_type BGZFStreambuf::overflow( int_type ch )
{
if ( ch != traits_type::eof() )
{
*pptr() = ch;
pbump(1);
}

char* ppp = pptr();
unsigned int inLen = ppp - mBuf;
if ( !inLen )
return 0; // EARLY RETURN!

setp(mBuf,epptr());

BGZFBlock output;
inLen -= output.compress(mBuf,inLen);
if ( inLen )
{
memmove(mBuf,ppp-inLen,inLen);
pbump(inLen);
}

if ( mpSB->sputn(reinterpret_cast<char const*>(&output),output.getBlockSize()) != output.getBlockSize() )
fatalErr("Can't write to BAM file.");

return 0;
}

int BGZFStreambuf::sync()
{
while ( pptr() != pbase() )
overflow( traits_type::eof() );
return 0;
}

BAMostream::BAMostream( char const* bamFile )
: std::ostream(&mSB), mSB(&mFilebuf)
{
mFilebuf.open(bamFile,std::ios_base::out|std::ios_base::binary|std::ios_base::trunc);
}

void BAMostream::close()
{
mSB.pubsync();
mFilebuf.pubsync();
mFilebuf.close();
}
103 changes: 103 additions & 0 deletions BGZF.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/////////////////////////////////////////////////////////////////////////////
// SOFTWARE COPYRIGHT NOTICE AGREEMENT //
// This software and its documentation are copyright (2008) by the //
// Broad Institute/Massachusetts Institute of Technology. All rights //
// are reserved. This software is supplied without any warranty or //
// guaranteed support whatsoever. Neither the Broad Institute nor MIT //
// can be responsible for its use, misuse, or functionality. //
/////////////////////////////////////////////////////////////////////////////
/*
* \file BGZF.h
* \author tsharpe
* \date May 21, 2009
*
* \brief Utilities for writing BAM files.
*/
#ifndef LOOKUP_BGZF_H_
#define LOOKUP_BGZF_H_

#include <fstream>

class GZIPHeader
{
public:
GZIPHeader()
: mID1(31), mID2(139), mCompressionMethod(8), mFlags(4), mModTime(0),
mExtraFlags(0), mOpSys(255), mXLen(6), mSI1('B'), mSI2('C'), mSLen(2)
{}

// compiler-supplied copying and destructor are OK

private:
unsigned char mID1;
unsigned char mID2;
unsigned char mCompressionMethod;
unsigned char mFlags;
unsigned int mModTime;
unsigned char mExtraFlags;
unsigned char mOpSys;
unsigned short mXLen;
unsigned char mSI1;
unsigned char mSI2;
unsigned short mSLen;
protected:
unsigned short mBlockSizeLessOne;
};

class BGZFBlock : public GZIPHeader
{
public:
// compiler-supplied no-arg constructor, copying and destructor are OK

// returns the amount of uncompressed data put into the block.
// if len is too big, or the data is too incompressible, not all of the supplied
// data will fit into a block.
unsigned int compress( void* data, unsigned int len );

unsigned int getBlockSize()
{ return mBlockSizeLessOne + 1U; }

private:
bool tryCompress( void* data, unsigned int& len );

// mBlockSizeLessOne is 16 bits, so 64K is the max block length and there are 26 bytes of header and footer
unsigned char mDataBlock[64*1024UL-26UL];
unsigned int mCRC32; // these last two members actually immediately follow the variable-length data block
unsigned int mInputSize; // this struct is what a maximum-size block looks like

static int const WINDOW_BITS = -15;
static int const MEM_LEVEL = 8;
};

class BGZFStreambuf : public std::streambuf
{
public:
BGZFStreambuf( std::streambuf* psb )
: mpSB(psb)
{ setp(mBuf,mBuf+sizeof(mBuf)-1); }

~BGZFStreambuf()
{ sync(); }

private:
BGZFStreambuf( BGZFStreambuf const& ); // undefined -- no copying
BGZFStreambuf& operator=( BGZFStreambuf const& ); // undefined -- no copying

int_type overflow( int_type ch );
int sync();

std::streambuf* mpSB;
char mBuf[128UL*1024UL];
};

class BAMostream : public std::ostream
{
public:
BAMostream( char const* bamFile );
void close();

std::filebuf mFilebuf;
BGZFStreambuf mSB;
};

#endif /* LOOKUP_BGZF_H_ */
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
all: OQCompress
OQCompress: OQCompress.cc BGZF.h BGZF.cc
g++ -std=c++11 -fno-strict-aliasing -Wextra -Wall -Wsign-promo -Woverloaded-virtual -Wendif-labels -march=native -g -o OQCompress OQCompress.cc BGZF.cc -lz
Loading

0 comments on commit 41c1060

Please sign in to comment.